Fix #4 - deal with bug in date parsing
[clinton/thingy_grabber.git] / thingy_grabber.py
#!/usr/bin/env python3
"""
Thingiverse bulk downloader
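
Usage examples (the IDs and names shown are placeholders):
    thingy_grabber.py thing 1234567
    thingy_grabber.py -d ~/things user some_user
    thingy_grabber.py collection some_user "some collection"
    thingy_grabber.py batch instructions.txt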
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
import multiprocessing
import enum
import datetime
from shutil import copyfile
from bs4 import BeautifulSoup
from dataclasses import dataclass
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

# Regexes for digging pagination metadata out of the JSON embedded in
# grouping pages.
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes it would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# Number of downloader worker processes to spawn.
DOWNLOADER_COUNT = 1
RETRY_COUNT = 3

VERSION = "0.8.1"

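# Thingiverse renders thing pages client-side, so a headless Firefox driven
# by Selenium is used to scrape them rather than plain HTTP requests.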
# BROWSER = webdriver.PhantomJS('./phantomjs')
options = Options()
options.add_argument("--headless")
BROWSER = webdriver.Firefox(options=options)

BROWSER.set_window_size(1980, 1080)


@dataclass
class FileLink:
    # Note: last_update is a datetime and link is the URL string, matching
    # how FileLink is constructed in Thing._parse().
    name: str
    last_update: datetime.datetime
    link: str


class State(enum.Enum):
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()


def strip_ws(value):
    """ Remove whitespace from a string """
    return str(NO_WHITESPACE_REGEX.sub('-', value))


def slugify(value):
    """
    Normalizes a string: converts to ASCII, removes non-alphanumeric
    characters, and converts runs of whitespace/hyphens to single hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value


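# Custom wait condition for Selenium's WebDriverWait: returns True once the
# thing page has rendered its title, all of its file rows, its images and its
# license text. Keeps a small log to help debug timeouts.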
class PageChecker(object):
    def __init__(self):
        self.log = []
        self.title = None
        self.file_count = None
        self.files = None
        self.images = None
        self.license = None

    def __call__(self, _):
        try:
            self.log.append("call")
            if self.title is None:
                # first find the name
                name = EC._find_element(BROWSER, (By.CSS_SELECTOR, "[class^=ThingPage__modelName]"))
                if name is None:
                    return False
                self.title = name.text

            if self.file_count is None:
                # OK. Do we know how many files we have to download?
                metrics = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=MetricButton]"))
                self.log.append("got some metrics: {}".format(len(metrics)))
                cur_count = int([x.text.split("\n")[0] for x in metrics if x.text.endswith("\nThing Files")][0])
                self.log.append(cur_count)
                if cur_count == 0:
                    return False
                self.file_count = cur_count

            self.log.append("looking for {} files".format(self.file_count))
            file_rows = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=ThingFile__fileRow]"))
            self.log.append("found {} files".format(len(file_rows)))
            if len(file_rows) < self.file_count:
                return False

            self.log.append("Looking for images")
            # By this point we _should_ have loaded all the images
            self.images = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=thumb]"))
            self.license = EC._find_element(BROWSER, (By.CSS_SELECTOR, "[class^=License__licenseText]")).text
            self.log.append("found {} images".format(len(self.images)))
            self.files = file_rows
            return True
        except Exception:
            # Any error just means the page isn't ready yet; poll again.
            return False

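# Worker process: pulls thing IDs off the shared JoinableQueue and downloads
# each one. A None on the queue is the shutdown sentinel.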
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """

    def __init__(self, thing_queue, download_directory):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory

    def run(self):
        """ The actual download loop. """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                logging.info("Shutting down download queue")
                self.thing_queue.task_done()
                break
            logging.info("Handling id {}".format(thing_id))
            Thing(thing_id).download(self.download_directory)
            self.thing_queue.task_done()

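# Grouping and its subclasses make an initial request to read the pagination
# metadata ("total", "last_page", "per_page") embedded in the page's JSON,
# then POST to the relevant ajax endpoint once per page to collect thing IDs.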
class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self, quick):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13 items) grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

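    # Download every thing in the grouping in order; in quick mode, stop at
    # the first thing that is already up to date (assumes date ordering).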
    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - object not initialised properly?")

        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            result = Thing(thing).download(self.download_dir)
            if self.quick and result == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return


class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(
            directory, "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION


class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION

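# A Thing is a single design. _parse() scrapes the rendered page and works out
# whether anything needs fetching; download() then copies unchanged files from
# the previous snapshot and fetches only new or updated ones.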
class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            BROWSER.get(url)
            wait = WebDriverWait(BROWSER, 60)
            pc = PageChecker()
            wait.until(pc)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        except selenium.common.exceptions.TimeoutException:
            logging.error(pc.log)
            logging.error("Timeout trying to parse thing {}".format(self.thing_id))
            return

        self.title = pc.title
        self._file_links = []
        for link in pc.files:
            logging.debug("Parsing link: {}".format(link.text))
            link_link = link.find_element_by_xpath(".//a").get_attribute("href")
            if link_link.endswith("/zip"):
                # bulk link.
                continue
            try:
                link_title, link_details, _ = link.text.split("\n")
            except ValueError:
                # If it is a filetype that doesn't generate a picture, then we get an extra field at the start.
                _, link_title, link_details, _ = link.text.split("\n")

            # link_details will be something like '461 kb | Updated 06-11-2019 | 373 Downloads'
            # The date is M-D-Y; datetime wants Y, M, D.
            link_date = [int(x) for x in link_details.split("|")[1].split()[-1].split("-")]
            logging.debug(link_details)
            try:
                self._file_links.append(FileLink(link_title, datetime.datetime(link_date[2], link_date[0], link_date[1]), link_link))
            except ValueError:
                logging.error("Unparseable date in '{}'".format(link_details))

        self._image_links = [x.find_element_by_xpath(".//img").get_attribute("src") for x in pc.images]
        self._license = pc.license
        self.pc = pc

        self.old_download_dir = os.path.join(base_dir, slugify(self.title))
        self.download_dir = os.path.join(base_dir, "{} - {}".format(self.thing_id, slugify(self.title)))

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            logging.info("Looking for old dir at {}".format(self.old_download_dir))
            if os.path.exists(self.old_download_dir):
                logging.warning("Found previous style download directory. Moving it from {} to {}".format(
                    self.old_download_dir, self.download_dir))
                os.rename(self.old_download_dir, self.download_dir)
            else:
                # Not yet downloaded
                self._parsed = True
                return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were written.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                # The .split(' ')[0] removes the time part of old-style timestamps.
                last_bits = [int(x) for x in timestamp_handle.readlines()[0].split(' ')[0].split("-")]
                logging.debug(last_bits)
                try:
                    # New-style timestamps are Y-M-D.
                    self.last_time = datetime.datetime(last_bits[0], last_bits[1], last_bits[2])
                except ValueError:
                    # This one appears to be M-D-Y.
                    self.last_time = datetime.datetime(last_bits[2], last_bits[0], last_bits[1])

            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info("No timestamp file found. Assuming update required.")
            self.last_time = None
            self._needs_download = True
            self._parsed = True
            return

        # OK, so we have a timestamp; let's see if there is anything new to get.
        for file_link in self._file_links:
            if file_link.last_update > self.last_time:
                logging.info(
                    "Found new/updated file {} - {}".format(file_link.name, file_link.last_update))
                self._needs_download = True
                self._parsed = True
                return

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

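    # Download flow: move any existing directory aside, copy unchanged files
    # back from that snapshot, fetch only new/updated files, and write the
    # timestamp last so an interrupted run is treated as needing a redownload.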
    def download(self, base_dir):
        """ Download all files for a given thing.
            Returns a State reflecting whether the thing is now downloaded
            (not whether this call actually downloaded it).
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            print("{} - {} already downloaded - skipping.".format(self.thing_id, self.title))
            return State.ALREADY_DOWNLOADED

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                logging.warning("Old style download dir found for {}".format(self.title))
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                prev_dir = "{}_{}".format(self.download_dir, slugify(str(self.last_time)))
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            new_last_time = new_file_links[0].last_update

            for file_link in new_file_links:
                new_last_time = max(new_last_time, file_link.last_update)
            logging.debug("New timestamp will be {}".format(new_last_time))
        else:
            new_last_time = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    new_last_time = max(new_last_time, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w') as fl_handle:
            for fl in self._file_links:
                base_link = fl.link
                try:
                    # Follow the redirect by hand to record the real download URL.
                    fl.link = requests.get(fl.link, allow_redirects=False).headers['location']
                except Exception:
                    # Sometimes Thingiverse just gives us the direct link the first time. Not sure why.
                    pass

                fl_handle.write("{},{},{}, {}\n".format(fl.link, fl.name, fl.last_update, base_link))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link.name)
            new_file = os.path.join(self.download_dir, file_link.name)
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(file_link.name))
                new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = os.path.join(self.download_dir, file_link.name)
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = requests.get(file_link.link)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED

        # People like images. But this doesn't work yet.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.basename(imagelink)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(imagelink)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED

        """
        # instructions are good too.
        logging.info("Downloading readme")
        try:
            readme_txt = soup.find('meta', property='og:description')[
                'content']
            with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        """
        # Best get some licenses
        logging.info("Downloading license")
        try:
            if self._license:
                with open(os.path.join(self.download_dir, 'license.txt'), 'w') as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(str(new_last_time))
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
        return State.OK


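# Batch files contain one instruction per line; the IDs and names below are
# placeholder examples:
#
#   thing 1234567
#   collection some_user some-collection
#   user some_user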
def do_batch(batch_file, download_dir, quick):
    """ Read a file line by line, parsing each as a set of calls to this script."""
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")


def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space-separated list of the name(s) of the collections to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space-separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space-separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)


    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick)

    # Stop the downloader processes: one None sentinel per worker.
    for downloader in downloaders:
        thing_queue.put(None)


if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()