A couple of minor filename handling fixes for Windows - resolves #10, resolves #11
[clinton/thingy_grabber.git] / thingy_grabber.py
#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
import multiprocessing
import enum
import datetime
from shutil import copyfile
from bs4 import BeautifulSoup
from dataclasses import dataclass
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
import atexit

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12; if it ever changes, the paging logic below breaks.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
DOWNLOADER_COUNT = 1
RETRY_COUNT = 3

MAX_PATH_LENGTH = 250

VERSION = "0.8.5"


#BROWSER = webdriver.PhantomJS('./phantomjs')
options = Options()
options.add_argument("--headless")
BROWSER = webdriver.Firefox(options=options)

BROWSER.set_window_size(1980, 1080)

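# NB: BROWSER is a single module-level headless Firefox instance; PageChecker
# and Thing._parse below both drive it, so things are parsed one at a time.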


@dataclass
class FileLink:
    name: str
    last_update: datetime.datetime
    link: str


class State(enum.Enum):
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()


def fail_dir(dir_name):
    """ When a download has failed, move it sideways.
    """
    target_dir = "{}_failed".format(dir_name)
    inc = 0
    while os.path.exists(target_dir):
        target_dir = "{}_failed_{}".format(dir_name, inc)
        inc += 1
    os.rename(dir_name, target_dir)


def truncate_name(file_name):
    """ Ensure the filename is not too long for, well, Windows, basically.
    """
    path = os.path.abspath(file_name)
    if len(path) <= MAX_PATH_LENGTH:
        return path
    base, extension = os.path.splitext(path)
    # Cut the base name down, leaving room for a "_N" deduplication suffix.
    to_cut = len(path) - (MAX_PATH_LENGTH - 3)
    base = base[:-to_cut]
    inc = 0
    new_path = "{}_{}{}".format(base, inc, extension)
    while os.path.exists(new_path):
        inc += 1
        new_path = "{}_{}{}".format(base, inc, extension)
    return new_path

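# truncate_name in practice: a path over MAX_PATH_LENGTH such as
# ".../some_very_long_name.stl" comes back with its base shortened to fit and
# a "_N" suffix that is bumped until the name is unused; the extension is kept.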

def strip_ws(value):
    """ Replace runs of whitespace (and hyphens) with a single hyphen. """
    return str(NO_WHITESPACE_REGEX.sub('-', value))


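# slugify keeps names human-readable while dropping characters that are
# illegal in Windows filenames, e.g. slugify('My Thing: Mk2?') -> 'my thing mk2'.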
def slugify(value):
    """
    Normalise the string, remove characters that are invalid in filenames
    and convert it to lowercase.
    """
    value = unicodedata.normalize('NFKC', value).lower().strip()
    value = re.sub(r'[\\/<>:\?\*\|"]', '', value)
    # Windows also dislikes trailing dots.
    value = re.sub(r'\.*$', '', value)
    return value

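# PageChecker is a custom Selenium "expected condition": WebDriverWait.until()
# calls it repeatedly with the driver until it returns something truthy (page
# fully loaded) or the wait times out. Returning False on any error keeps the
# wait polling rather than aborting.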
class PageChecker(object):
    def __init__(self):
        self.log = []
        self.title = None
        self.file_count = None
        self.files = None
        self.images = None
        self.license = None

    def __call__(self, _):
        try:
            self.log.append("call")
            if self.title is None:
                # first find the name
                name = EC._find_element(BROWSER, (By.CSS_SELECTOR, "[class^=ThingPage__modelName]"))
                if name is None:
                    return False
                self.title = name.text

            if self.file_count is None:
                # OK. Do we know how many files we have to download?
                metrics = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=MetricButton]"))
                self.log.append("got some metrics: {}".format(len(metrics)))
                cur_count = int([x.text.split("\n")[0] for x in metrics if x.text.endswith("\nThing Files")][0])
                self.log.append(cur_count)
                if cur_count == 0:
                    return False
                self.file_count = cur_count

            self.log.append("looking for {} files".format(self.file_count))
            fileRows = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=ThingFile__fileRow]"))
            self.log.append("found {} files".format(len(fileRows)))
            if len(fileRows) < self.file_count:
                return False

            self.log.append("Looking for images")
            # By this point we _should_ have loaded all the images.
            self.images = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=thumb]"))
            self.license = EC._find_element(BROWSER, (By.CSS_SELECTOR, "[class^=License__licenseText]")).text
            self.log.append("found {} images".format(len(self.images)))
            self.files = fileRows
            return True
        except Exception:
            return False


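# Downloader workers consume thing ids from a JoinableQueue; a None entry is
# the shutdown signal (main() puts one None per worker when it is done).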
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """

    def __init__(self, thing_queue, download_directory):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory

    def run(self):
        """ actual download loop.
        """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                logging.info("Shutting down download worker")
                self.thing_queue.task_done()
                break
            logging.info("Handling id {}".format(thing_id))
            Thing(thing_id).download(self.download_directory)
            self.thing_queue.task_done()
        return


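# A Grouping is scraped via Thingiverse's AJAX listing endpoints: the first GET
# yields the total/last_page/per_page counters (see the regexes above); each
# page of results is then fetched with a POST and the thing links pulled out
# of the returned HTML.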
class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self, quick):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13 items) grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            rc = Thing(thing).download(self.download_dir)
            if self.quick and rc == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return


class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION


class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION


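# Example (id purely illustrative): Thing("4242").download(".") parses the
# thing's /files page with the shared BROWSER, then fetches its files, images
# and license text into "./4242 - <slugified title>".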
class Thing:
    """ An individual design on Thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            BROWSER.get(url)
            wait = WebDriverWait(BROWSER, 60)
            pc = PageChecker()
            wait.until(pc)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        except selenium.common.exceptions.TimeoutException:
            logging.error(pc.log)
            logging.error("Timeout trying to parse thing {}".format(self.thing_id))
            return

        self.title = pc.title
        self._file_links = []
        for link in pc.files:
            logging.debug("Parsing link: {}".format(link.text))
            link_link = link.find_element_by_xpath(".//a").get_attribute("href")
            if link_link.endswith("/zip"):
                # bulk link.
                continue
            try:
                link_title, link_details, _ = link.text.split("\n")
            except ValueError:
                # If it is a filetype that doesn't generate a picture,
                # then we get an extra field at the start.
                _, link_title, link_details, _ = link.text.split("\n")

            # link_details will be something like '461 kb | Updated 06-11-2019 | 373 Downloads'
            # and we need to convert the date from M-D-Y to Y-M-D.
            link_date = [int(x) for x in link_details.split("|")[1].split()[-1].split("-")]
            try:
                self._file_links.append(FileLink(link_title, datetime.datetime(link_date[2], link_date[0], link_date[1]), link_link))
            except ValueError:
                logging.error(link_date)

        self._image_links = [x.find_element_by_xpath(".//img").get_attribute("src") for x in pc.images]
        self._license = pc.license
        self.pc = pc

        self.old_download_dir = os.path.join(base_dir, slugify(self.title))
        self.download_dir = os.path.join(base_dir, "{} - {}".format(self.thing_id, slugify(self.title)))

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            logging.info("Looking for old dir at {}".format(self.old_download_dir))
            if os.path.exists(self.old_download_dir):
                logging.warning("Found previous style download directory. Moving it from {} to {}".format(
                    self.old_download_dir, self.download_dir))
                os.rename(self.old_download_dir, self.download_dir)
            else:
                # Not yet downloaded
                self._parsed = True
                return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were written.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                # the .split(' ')[0] removes the time-of-day part of old-style timestamps
                last_bits = [int(x) for x in timestamp_handle.readlines()[0].split(' ')[0].split("-")]
                logging.warning(last_bits)
                try:
                    self.last_time = datetime.datetime(last_bits[0], last_bits[1], last_bits[2])
                except ValueError:
                    # This one appears to be M D Y
                    self.last_time = datetime.datetime(last_bits[2], last_bits[0], last_bits[1])

            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._needs_download = True
            self._parsed = True
            return

        # OK, so we have a timestamp; let's see if there is anything new to get.
        for file_link in self._file_links:
            if file_link.last_update > self.last_time:
                logging.info(
                    "Found new/updated file {} - {}".format(file_link.name, file_link.last_update))
                self._needs_download = True
                self._parsed = True
                return

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing.
        Returns a State: OK once the thing is on disk (whether or not anything
        actually had to be fetched), ALREADY_DOWNLOADED if it was skipped,
        or FAILED on error.
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            print("{} - {} already downloaded - skipping.".format(self.thing_id, self.title))
            return State.ALREADY_DOWNLOADED

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                logging.warning("Old style download dir found at {}".format(self.download_dir))
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                prev_dir = "{}_{}".format(self.download_dir, slugify(self.last_time.__str__()))
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            new_last_time = new_file_links[0].last_update

            for file_link in new_file_links:
                new_last_time = max(new_last_time, file_link.last_update)
            logging.debug("New timestamp will be {}".format(new_last_time))
        else:
            new_last_time = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    new_last_time = max(new_last_time, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                base_link = fl.link
                try:
                    fl.link = requests.get(fl.link, allow_redirects=False).headers['location']
                except Exception:
                    # Sometimes Thingiverse just gives us the direct link the first time. Not sure why.
                    pass

                fl_handle.write("{},{},{}, {}\n".format(fl.link, fl.name, fl.last_update, base_link))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link.name)
            new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(file_link.name))
                new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = requests.get(file_link.link)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # People like images. But this doesn't work yet.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.basename(imagelink)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(imagelink)
                with open(truncate_name(os.path.join(image_dir, filename)), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(filename, exception))
            fail_dir(self.download_dir)
            return State.FAILED

540 """
541 # instructions are good too.
542 logging.info("Downloading readme")
543 try:
544 readme_txt = soup.find('meta', property='og:description')[
545 'content']
546 with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
547 readme_handle.write("{}\n".format(readme_txt))
548 except (TypeError, KeyError) as exception:
549 logging.warning("No readme? {}".format(exception))
550 except IOError as exception:
551 logging.warning("Failed to write readme! {}".format(exception))
552
553 """
        # Best get some licenses
        logging.info("Downloading license")
        try:
            if self._license:
                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w', encoding="utf-8") as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(new_last_time.__str__())
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
        return State.OK


def do_batch(batch_file, download_dir, quick):
    """ Read a file line by line, parsing each line as a set of calls to this script."""
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")

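# A batch file is plain text with one instruction per line, e.g. (names and
# ids here are purely illustrative):
#   thing 4242
#   collection some_user some_collection
#   user some_user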

def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space separated list of the name(s) of the collection(s) to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

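    # Typical invocations (user/collection names illustrative):
    #   thingy_grabber.py -d downloads user some_user
    #   thingy_grabber.py --quick collection some_user some_collection
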
    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick)

    # Stop the downloader processes
    for downloader in downloaders:
        thing_queue.put(None)


atexit.register(BROWSER.quit)

if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()