# Commit: Make readmes text files
# Source: clinton/thingy_grabber.git / thingy_grabber.py
1 #!/usr/bin/env python3
2 """
3 Thingiverse bulk downloader
4 """
5
6 import re
7 import sys
8 import os
9 import argparse
10 import unicodedata
11 import requests
12 import logging
13 import multiprocessing
14 import enum
15 import datetime
16 from shutil import copyfile
17 from dataclasses import dataclass
18 import py7zr
19 import glob
20 import shutil
21 from io import StringIO
22 from html.parser import HTMLParser
23
# Compression settings for the optional .7z archiving of downloads.
SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]

# I don't think this is exported by datetime
DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# Windows cannot handle : in filenames
SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'

# Thingiverse REST API endpoint templates. Each takes str.format() arguments;
# the final placeholder is always the API access token.
API_BASE = "https://api.thingiverse.com"
ACCESS_QP = "access_token={}"
PAGE_QP = "page={}"
API_USER_DESIGNS = API_BASE + "/users/{}/things/?" + ACCESS_QP
API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP

# Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS
API_COLLECTION = API_BASE + "/collections/{}/?" + ACCESS_QP
API_COLLECTION_THINGS = API_BASE + "/collections/{}/things/?" + ACCESS_QP

API_THING_DETAILS = API_BASE + "/things/{}/?" + ACCESS_QP
API_THING_FILES = API_BASE + "/things/{}/files/?" + ACCESS_QP
API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP
# Suffix appended to a file URL to fetch its content.
API_THING_DOWNLOAD = "/download/?" + ACCESS_QP

# Number of parallel Downloader worker processes.
DOWNLOADER_COUNT = 1
RETRY_COUNT = 3

# Keep generated paths below this length (Windows path-length limits).
MAX_PATH_LENGTH = 250

VERSION = "0.10.4"

# Marker file written into each completed download directory.
TIMESTAMP_FILE = "timestamp.txt"

# Shared HTTP session so connections are reused across requests.
SESSION = requests.Session()
57
class MLStripper(HTMLParser):
    """Reduce a fragment of HTML markup to its plain-text content."""

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        # Accumulates the text seen between tags.
        self.text = StringIO()

    def error(self, message):
        # The base class declares this hook; surface parse problems loudly.
        raise ValueError(message)

    def handle_data(self, d):
        # Called by HTMLParser for each run of character data.
        self.text.write(d)

    def get_data(self):
        """Return all plain text collected so far."""
        return self.text.getvalue()

    @staticmethod
    def strip_tags(html):
        """Parse *html* and return only its textual content."""
        stripper = MLStripper()
        stripper.feed(html)
        return stripper.get_data()
83
@dataclass
class ThingLink:
    """ Reference to a single thing as returned by the listing APIs. """
    thing_id: str  # Thingiverse numeric id (kept as str)
    name: str      # human-readable thing name
    api_link: str  # API URL for the thing's details
89
90
@dataclass
class FileLink:
    """ A single downloadable file belonging to a thing. """
    name: str                       # file name as published
    last_update: datetime.datetime  # last modification stamp from the API
    link: str                       # full download URL (includes access token)
96
97
@dataclass
class ImageLink:
    """ A single image belonging to a thing. """
    name: str  # slugified image name
    link: str  # URL of the chosen rendition
102
103
class FileLinks:
    """Ordered collection of FileLink objects.

    Tracks the most recent ``last_update`` stamp across everything appended,
    which is what the change-detection logic compares against.
    """

    def __init__(self, initial_links=None):
        self.links = []
        self.last_update = None
        # Route initial items through append() so last_update stays correct.
        for item in (initial_links if initial_links is not None else []):
            self.append(item)

    def __iter__(self):
        return iter(self.links)

    def __getitem__(self, item):
        return self.links[item]

    def __len__(self):
        return len(self.links)

    def append(self, link):
        """Add *link* and fold its stamp into the running maximum."""
        try:
            self.last_update = max(self.last_update, link.last_update)
        except TypeError:
            # max() against None (first link, or stampless link) lands here.
            self.last_update = link.last_update
        self.links.append(link)
128
129
class State(enum.Enum):
    """ Outcome of attempting to download a thing. """
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()
134
135
def sanitise_url(url):
    """Return *url* with any access-token query value masked, for safe logging."""
    return re.sub(r'access_token=\w*', 'access_token=***', url)
142
143
def strip_time(date_obj):
    """Return *date_obj* with its time-of-day zeroed out (midnight)."""
    midnight = datetime.time()
    return datetime.datetime.combine(date_obj.date(), midnight)
148
149
def rename_unique(dir_name, target_dir_name):
    """Move a directory sideways to a new name, ensuring it is unique.

    If *target_dir_name* already exists, numeric suffixes (_0, _1, ...)
    are tried until a free name is found. Returns the name actually used.
    """
    candidate = target_dir_name
    suffix = 0
    while os.path.exists(candidate):
        candidate = "{}_{}".format(target_dir_name, suffix)
        suffix += 1
    os.rename(dir_name, candidate)
    return candidate
160
161
def fail_dir(dir_name):
    """When a download has failed, move its directory sideways.

    Renames *dir_name* with a ``_failed`` marker (uniquified if needed) so a
    later retry starts from a clean slate. Returns the new directory name.
    """
    failed_name = "{}_failed".format(dir_name)
    return rename_unique(dir_name, failed_name)
166
167
def truncate_name(file_name):
    """ Ensure the filename is not too long for, well windows basically.

    Returns the absolute path unchanged when it already fits within
    MAX_PATH_LENGTH. Otherwise the stem is cut down so that stem plus
    extension fits, and a numeric suffix is added if the shortened name
    collides with an existing file.
    """
    path = os.path.abspath(file_name)
    if len(path) <= MAX_PATH_LENGTH:
        return path
    base, extension = os.path.splitext(path)
    # BUGFIX: the old code never shortened the path at all - it only appended
    # a "_<n>" suffix, making an over-long path even longer. Cut the stem so
    # the result fits (always keeping at least one stem character).
    keep = MAX_PATH_LENGTH - len(extension)
    base = base[:max(keep, 1)]
    new_path = "{}{}".format(base, extension)
    # Truncation can collide with an earlier truncated file; uniquify.
    # (BUGFIX: the old loop also recomputed the same candidate before
    # incrementing, wasting an iteration.)
    inc = 0
    while os.path.exists(new_path):
        new_path = "{}_{}{}".format(base, inc, extension)
        inc += 1
    return new_path
181
182
def slugify(value):
    """
    Normalise string, removes invalid for filename charactersr
    and converts string to lowercase.
    """
    logging.debug("Sluggyfying {}".format(value))
    cleaned = unicodedata.normalize('NFKC', value).lower().strip()
    # Drop characters that are illegal in filenames on common filesystems.
    cleaned = re.sub(r'[\\/<>:?*|"]', '', cleaned)
    # Trailing dots are not allowed on Windows.
    cleaned = re.sub(r'\.*$', '', cleaned)
    return cleaned.strip()
193
194
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.

    Worker process: pulls work items off a shared JoinableQueue. Items are
    either a thing-id string or a ThingLink; a None item is the shutdown
    sentinel.
    """

    def __init__(self, thing_queue, download_directory, compress, api_key):
        multiprocessing.Process.__init__(self)
        self.thing_queue = thing_queue              # shared work queue (None = stop)
        self.download_directory = download_directory  # base dir for downloads
        self.compress = compress                    # whether to 7z the result
        self.api_key = api_key                      # Thingiverse API token

    def run(self):
        """ actual download loop.

        Runs until a None sentinel is pulled from the queue; every item,
        including the sentinel, is acknowledged with task_done().
        """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                # Sentinel: acknowledge it and shut this worker down.
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                break
            thing = None
            # Accept either a bare id string or a pre-built ThingLink.
            if isinstance(thing_id, str):
                thing = Thing.from_thing_id(thing_id)
            if isinstance(thing_id, ThingLink):
                thing = Thing(thing_id)
            if not thing:
                logging.error("Don't know how to handle thing_id {}".format(thing_id))
            else:
                logging.info("Handling id {}".format(thing_id))
                thing.download(self.download_directory, self.compress, self.api_key)
            self.thing_queue.task_done()
        return
229
230
class Grouping:
    """ Holds details of a group of things for download
    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.

    Subclasses must set ``self.url`` (a fully-formatted listing endpoint)
    and ``self.download_dir`` in their __init__.
    """

    def __init__(self, quick, compress, api_key):
        self.things = []        # list of ThingLink, populated by `get`
        self.total = 0          # number of things found (set by `get`)
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        self.compress = compress
        self.api_key = api_key
        # These should be set by child classes.
        self.url = None
        self.download_dir = None

    @property
    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(sanitise_url(self.url)))

        # self.url should already have been formatted as we don't need pagination
        logging.info("requesting:{}".format(sanitise_url(self.url)))
        current_req = SESSION.get(self.url)
        if current_req.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url),
                                                                current_req.text))
        else:
            current_json = current_req.json()
            for thing in current_json:
                logging.info(thing)
                self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
        # BUGFIX: self.total was never updated, so download() always logged
        # "Downloading 0 thing(s)". Keep it in sync with what we found.
        self.total = len(self.things)
        logging.info("Found {} things.".format(len(self.things)))
        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            # Property access triggers the (lazy) listing fetch.
            self.get

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            return_code = Thing(thing).download(self.download_dir, self.compress, self.api_key)
            if self.quick and return_code == State.ALREADY_DOWNLOADED:
                # Quick mode: assume date ordering, so stop at the first
                # thing we already have.
                logging.info("Caught up, stopping.")
                return
304
305
class Collection(Grouping):
    """ Holds details of a collection.

    The constructor resolves the collection name to its numeric id via the
    API. NOTE(review): on any failure below it returns early with self.url
    left as None, so a later `get` raises ValueError - confirm callers
    tolerate a half-initialised Collection.
    """

    def __init__(self, user, name, directory, quick, compress, api_key):
        Grouping.__init__(self, quick, compress, api_key)
        self.user = user
        self.name = name
        self.paginated = False
        # need to figure out the the ID for the collection
        collection_url = API_USER_COLLECTIONS.format(user, api_key)
        try:
            current_req = SESSION.get(collection_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for collections for user {}: {}".format(
                self.user, error))
            return
        if current_req.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(collection_url),
                                                                current_req.text))
            return
        collection_list = current_req.json()
        try:
            # case insensitive to retain parity with previous behaviour
            collection = [x for x in collection_list if x['name'].casefold() == name.casefold()][0]
        except IndexError:
            logging.error("Unable to find collection {} for user {}".format(name, user))
            return
        self.collection_id = collection['id']
        self.url = API_COLLECTION_THINGS.format(self.collection_id, api_key)

        # e.g. "<user>-<collection name>" under the target directory.
        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
339
340
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory, quick, compress, api_key):
        """Initialise a grouping covering every design published by *user*."""
        super().__init__(quick, compress, api_key)
        self.user = user
        self.url = API_USER_DESIGNS.format(user, api_key)
        # Downloads land in "<user> designs" under the target directory.
        dir_name = "{} designs".format(slugify(self.user))
        self.download_dir = os.path.join(directory, dir_name)
350
351
class Thing:
    """ An individual design on thingiverse.

    Lifecycle: construct from a ThingLink (or via from_thing_id), then call
    download(), which lazily calls _parse() to fetch metadata, file and
    image listings before fetching the content itself.
    """

    def __init__(self, thing_link):
        self.thing_id = thing_link.thing_id
        self.name = thing_link.name
        self.last_time = None          # timestamp of the newest previous download
        self._parsed = False           # has _parse() completed successfully?
        self._needs_download = True
        self.text = None
        self.download_dir = None
        self.time_stamp = None         # timestamp to record for this download
        self._file_links = FileLinks()
        self._image_links = []
        # BUGFIX: these were previously only created inside _parse() on its
        # happy path; reading them after a failed/partial parse raised
        # AttributeError. Give them safe defaults up front.
        self._license = None
        self._details = None
        self.slug = None

    @classmethod
    def from_thing_id(cls, thing_id):
        """
        Factory method that looks up a thing by ID and creates a Thing object for it
        :param thing_id: to look up
        :return: Thing or None
        """
        return Thing(ThingLink(thing_id, "", ""))

    def _parse(self, base_dir, api_key):
        """ Work out what, if anything needs to be done.

        Fetches thing details, file list and image list from the API and
        compares against any previous download. Leaves self._parsed False
        on failure so download() can abort.
        """
        if self._parsed:
            return

        # First get the broad details
        url = API_THING_DETAILS.format(self.thing_id, api_key)
        try:
            current_req = SESSION.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        # Check for DMCA
        if current_req.status_code == 403:
            logging.error("Access to thing {} is forbidden".format(self.thing_id))
            return
        if current_req.status_code != 200:
            logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(url),
                                                                          current_req.text))
            return

        thing_json = current_req.json()
        try:
            self._license = thing_json['license']
        except KeyError:
            logging.warning("No license found for thing {}?".format(self.thing_id))

        details = None
        try:
            details = thing_json['details']
        except KeyError:
            logging.warning("No description found for thing {}?".format(self.thing_id))

        if details:
            try:
                self._details = MLStripper.strip_tags(details)
            except ValueError as e:
                # Keep the raw HTML rather than losing the description.
                logging.warning("Unable to strip HTML from readme: {}".format(e))
                self._details = details

        if not self.name:
            # Probably generated with factory method.
            try:
                self.name = thing_json['name']
            except KeyError:
                logging.warning("No name found for thing {}?".format(self.thing_id))
                self.name = self.thing_id

        # Now get the file details
        file_url = API_THING_FILES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(file_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error("Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(file_url),
                                                                        current_req.text))
            return

        link_list = current_req.json()

        if not link_list:
            logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(
                self.thing_id))

        for link in link_list:
            logging.debug("Parsing link: {}".format(sanitise_url(link['url'])))
            try:
                datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
                self._file_links.append(
                    FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(api_key)))
            except ValueError:
                # Unparseable date; skip this file but record what we saw.
                logging.error(link['date'])

        # Finally get the image links
        image_url = API_THING_IMAGES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(image_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error(
                "Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(image_url),
                                                              current_req.text))
            return

        image_list = current_req.json()

        if not image_list:
            logging.warning(
                "No images found for thing {} - probably thingiverse being iffy as this seems unlikely".format(
                    self.thing_id))

        for image in image_list:
            logging.debug("parsing image: {}".format(image))
            name = None
            try:
                name = slugify(image['name'])
                # TODO: fallback to other types
                # BUGFIX: the [0] below raises IndexError (not KeyError) when
                # no size matches, so catch both.
                url = [x for x in image['sizes'] if x['type'] == 'display' and x['size'] == 'large'][0]['url']
            except (KeyError, IndexError):
                logging.warning("Missing image for {}".format(name))
                # BUGFIX: previously fell through and appended anyway, using an
                # unbound (NameError) or stale `url` from the prior iteration.
                continue
            self._image_links.append(ImageLink(name, url))

        self.slug = "{} - {}".format(self.thing_id, slugify(self.name))
        self.download_dir = os.path.join(base_dir, self.slug)

        self._handle_old_directory(base_dir)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.name))
        latest, self.last_time = self._find_last_download(base_dir)

        if not latest:
            # Not yet downloaded
            self._parsed = True
            return

        logging.info("last downloaded version: {}".format(self.last_time))

        # OK, so we have a timestamp, lets see if there is anything new to get
        # First off, are we comparing an old download that threw away the timestamp?
        ignore_time = self.last_time == strip_time(self.last_time)
        try:
            # TODO: Allow for comparison at the exact time
            files_last_update = self._file_links.last_update
            if ignore_time:
                logging.info("Dropping time from comparison stamp as old-style download dir")
                files_last_update = strip_time(files_last_update)

            if files_last_update > self.last_time:
                logging.info(
                    "Found new/updated files {}".format(self._file_links.last_update))
                self._needs_download = True
                self._parsed = True
                return
        except TypeError:
            # last_update is None when no files were parsed.
            logging.warning("No files found for {}.".format(self.thing_id))

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def _handle_old_directory(self, base_dir):
        """ Deal with any old directories from previous versions of the code.
        """
        old_dir = os.path.join(base_dir, slugify(self.name))
        if os.path.exists(old_dir):
            logging.warning("Found old style download_dir. Moving.")
            rename_unique(old_dir, self.download_dir)

    def _handle_outdated_directory(self):
        """ Move the current download directory sideways if the thing has changed.

        Returns the renamed directory (so unchanged files can be copied back),
        or None when there was nothing to move.
        """
        if not os.path.exists(self.download_dir):
            # No old directory to move.
            return None
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
        if not os.path.exists(timestamp_file):
            # Old form of download directory
            target_dir_name = "{} - old".format(self.download_dir)
        else:
            target_dir_name = "{} - {}".format(self.download_dir, self.last_time.strftime(SAFE_DATETIME_FORMAT))
        return rename_unique(self.download_dir, target_dir_name)

    def _find_last_download(self, base_dir):
        """ Look for the most recent previous download (if any) of the thing.

        Checks both the normal download directory (via its timestamp file)
        and any compressed "<slug> - <stamp>.7z" archives in base_dir.
        Returns (location, timestamp), both None when nothing was found.
        """
        logging.info("Looking for old things")

        # First the DL directory itself.
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)

        latest = None
        latest_time = None

        try:
            logging.debug("Checking for existing download in normal place.")
            with open(timestamp_file) as ts_fh:
                timestamp_text = ts_fh.read().strip()
            latest_time = datetime.datetime.strptime(timestamp_text, DEFAULT_DATETIME_FORMAT)
            latest = self.download_dir
        except FileNotFoundError:
            # No existing download directory. huh.
            pass
        except TypeError:
            logging.warning("Invalid timestamp file found in {}".format(self.download_dir))

        # TODO: Maybe look for old download directories.

        # Now look for 7z files
        candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
        # +3 to allow for ' - '
        leading_length = len(self.slug) + 3
        for path in candidates:
            candidate = os.path.basename(path)
            try:
                # [-3:] strips the ".7z" suffix off the stamp.
                logging.debug("Examining '{}' - '{}'".format(candidate, candidate[leading_length:-3]))
                candidate_time = datetime.datetime.strptime(candidate[leading_length:-3], SAFE_DATETIME_FORMAT)
            except ValueError:
                logging.warning("There was an error finding the date in {}. Ignoring.".format(candidate))
                continue
            try:
                if candidate_time > latest_time:
                    latest_time = candidate_time
                    latest = candidate
            except TypeError:
                # latest_time still None: first valid candidate wins.
                latest_time = candidate_time
                latest = candidate
        logging.info("Found last old thing: {} / {}".format(latest, latest_time))
        return latest, latest_time

    def download(self, base_dir, compress, api_key):
        """ Download all files for a given thing.

        Returns a State: OK when the thing is now downloaded, FAILED on any
        error (the partial directory is moved aside), or ALREADY_DOWNLOADED
        when nothing new was found.
        """
        if not self._parsed:
            self._parse(base_dir, api_key)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            logging.info("{} - {} already downloaded - skipping.".format(self.thing_id, self.name))
            return State.ALREADY_DOWNLOADED

        if not self._file_links:
            logging.error(
                "{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.name))
            return State.FAILED

        # Have we already downloaded some things?
        renamed_dir = self._handle_outdated_directory()

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        self.time_stamp = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            self.time_stamp = new_file_links[0].last_update

            for file_link in new_file_links:
                self.time_stamp = max(self.time_stamp, file_link.last_update)
            logging.debug("New timestamp will be {}".format(self.time_stamp))
        else:
            self.time_stamp = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    self.time_stamp = max(self.time_stamp, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(self.time_stamp))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                fl_handle.write("{},{},{}\n".format(fl.link, fl.name, fl.last_update))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        if renamed_dir:
            for file_link in old_file_links:
                try:
                    old_file = os.path.join(renamed_dir, file_link.name)
                    new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
                    logging.debug("Copying {} to {}".format(old_file, new_file))
                    copyfile(old_file, new_file)
                except FileNotFoundError:
                    logging.warning(
                        "Unable to find {} in old archive, redownloading".format(file_link.name))
                    new_file_links.append(file_link)
                except TypeError:
                    # Not altogether sure how this could occur, possibly with some combination of the old file types
                    logging.warning(
                        "Typeerror looking for {} in {}".format(file_link.name, renamed_dir))
                    new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = SESSION.get(file_link.link)
                if data_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(data_req.status_code,
                                                                                sanitise_url(file_link.link),
                                                                                data_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED

                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # People like images.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.join(image_dir, imagelink.name)
                image_req = SESSION.get(imagelink.link)
                if image_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(image_req.status_code,
                                                                                sanitise_url(imagelink.link),
                                                                                image_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED
                with open(truncate_name(filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            # NOTE(review): if os.mkdir itself raises, `imagelink` is unbound
            # here and this logs a NameError instead - confirm and fix upstream.
            logging.error("Failed to download {} - {}".format(imagelink.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # Best get some licenses
        logging.info("writing license file")
        try:
            if self._license:
                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w',
                          encoding="utf-8") as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        logging.info("writing readme")
        try:
            if self._details:
                with open(truncate_name(os.path.join(self.download_dir, 'readme.txt')), 'w',
                          encoding="utf-8") as readme_handle:
                    readme_handle.write("{}\n".format(self._details))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        try:
            # Now write the timestamp
            with open(os.path.join(self.download_dir, TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(self.time_stamp.__str__())
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.name))
        if not compress:
            return State.OK

        # Compress the directory into "<id> - <slug> - <stamp>.7z" and remove it.
        thing_dir = "{} - {} - {}".format(self.thing_id,
                                          slugify(self.name),
                                          self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
        file_name = os.path.join(base_dir,
                                 "{}.7z".format(thing_dir))
        logging.debug("Compressing {} to {}".format(
            self.name,
            file_name))
        with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
            archive.writeall(self.download_dir, thing_dir)
        logging.debug("Compression of {} finished.".format(self.name))
        shutil.rmtree(self.download_dir)
        logging.debug("Removed temporary download dir of {}.".format(self.name))
        return State.OK
766
767
def do_batch(batch_file, download_dir, quick, compress, api_key=None):
    """ Read a file in line by line, parsing each as a set of calls to this script.

    Recognised instructions: "thing <id>", "collection <user> <name>",
    "user <user>". Unknown lines are skipped with a warning.

    BUGFIX: every call below was missing the api_key argument that Thing,
    Collection and Designs now require, so batch mode raised TypeError.
    The new api_key parameter defaults to None for backward compatibility.
    """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing.from_thing_id(command_arr[1]).download(download_dir, compress, api_key)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick, compress, api_key).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick, compress, api_key).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
795
796
def main():
    """ Entry point for script being run as a command.

    Parses arguments, configures logging, resolves the API key, starts the
    downloader worker processes and dispatches to the chosen subcommand.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")
    parser.add_argument("-c", "--compress", action="store_true",
                        help="Compress files")
    parser.add_argument("-a", "--api-key",
                        help="API key for thingiverse")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space seperated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space seperated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space seperated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())
    # BUGFIX: the formatter was created but never attached to the console
    # handler, so console output was unformatted.
    console_handler.setFormatter(formatter)
    # Attach the handler before the api-key lookup so its error messages are
    # logged through a configured handler rather than logging's last resort.
    logger.addHandler(console_handler)

    if args.api_key:
        api_key = args.api_key
    else:
        try:
            with open("api.key") as fh:
                api_key = fh.read().strip()
        except Exception as e:
            logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
            logging.error("Exception: {}".format(e))
            return

    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory, args.compress, api_key) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick, args.compress, api_key).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick, args.compress, api_key).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick, args.compress)

    # Stop the downloader processes
    for _ in downloaders:
        thing_queue.put(None)
    # BUGFIX: wait for the workers to drain the queue and exit cleanly rather
    # than relying on the interpreter's implicit join of non-daemon children.
    for downloader in downloaders:
        downloader.join()
891
892
if __name__ == "__main__":
    # Required for multiprocessing in frozen (e.g. PyInstaller) Windows builds.
    multiprocessing.freeze_support()
    main()