Only output file download error text when logging is turned up.
[clinton/thingy_grabber.git] / thingy_grabber.py
#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
import multiprocessing
import enum
import datetime
from shutil import copyfile
from dataclasses import dataclass
import py7zr
import glob
import shutil
from io import StringIO
from html.parser import HTMLParser

SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]

# I don't think this is exported by datetime
DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# Windows cannot handle : in filenames
SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'

API_BASE = "https://api.thingiverse.com"
ACCESS_QP = "access_token={}"
PAGE_QP = "page={}"
API_USER_DESIGNS = API_BASE + "/users/{}/things/?" + ACCESS_QP
API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP

# Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS
API_COLLECTION = API_BASE + "/collections/{}/?" + ACCESS_QP
API_COLLECTION_THINGS = API_BASE + "/collections/{}/things/?" + ACCESS_QP

API_THING_DETAILS = API_BASE + "/things/{}/?" + ACCESS_QP
API_THING_FILES = API_BASE + "/things/{}/files/?" + ACCESS_QP
API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP
API_THING_DOWNLOAD = "/download/?" + ACCESS_QP

DOWNLOADER_COUNT = 1
RETRY_COUNT = 3

MAX_PATH_LENGTH = 250

VERSION = "0.10.5"

TIMESTAMP_FILE = "timestamp.txt"

SESSION = requests.Session()

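# MLStripper turns the HTML returned in a thing's 'details' field into plain
# text so it can be written out as readme.txt, e.g.
# MLStripper.strip_tags("<p>Hello <b>world</b></p>") -> "Hello world".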
class MLStripper(HTMLParser):
    """ Turns HTML markup into plain text
    """

    def error(self, message):
        raise ValueError(message)

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = StringIO()

    def handle_data(self, d):
        self.text.write(d)

    def get_data(self):
        return self.text.getvalue()

    @staticmethod
    def strip_tags(html):
        s = MLStripper()
        s.feed(html)
        return s.get_data()

@dataclass
class ThingLink:
    thing_id: str
    name: str
    api_link: str


@dataclass
class FileLink:
    name: str
    last_update: datetime.datetime
    link: str


@dataclass
class ImageLink:
    name: str
    link: str

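# FileLinks is a thin collection wrapper around FileLink entries that also
# tracks the newest last_update timestamp seen, which _parse() later compares
# against the saved timestamp to decide whether a re-download is needed.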
class FileLinks:
    def __init__(self, initial_links=None):
        if initial_links is None:
            initial_links = []
        self.links = []
        self.last_update = None
        for link in initial_links:
            self.append(link)

    def __iter__(self):
        return iter(self.links)

    def __getitem__(self, item):
        return self.links[item]

    def __len__(self):
        return len(self.links)

    def append(self, link):
        try:
            self.last_update = max(self.last_update, link.last_update)
        except TypeError:
            self.last_update = link.last_update
        self.links.append(link)

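# Download results. Grouping.download() stops early in --quick mode as soon as
# a thing comes back ALREADY_DOWNLOADED, on the assumption that things are
# returned in date order.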
class State(enum.Enum):
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()

def sanitise_url(url):
    """ Remove API keys from a URL.
    """
    return re.sub(r'access_token=\w*',
                  'access_token=***',
                  url)


def strip_time(date_obj):
    """ Takes a datetime object and returns another with the time set to 00:00
    """
    return datetime.datetime.combine(date_obj.date(), datetime.time())

def rename_unique(dir_name, target_dir_name):
    """ Move a directory sideways to a new name, ensuring it is unique.
    """
    target_dir = target_dir_name
    inc = 0
    while os.path.exists(target_dir):
        target_dir = "{}_{}".format(target_dir_name, inc)
        inc += 1
    os.rename(dir_name, target_dir)
    return target_dir


def fail_dir(dir_name):
    """ When a download has failed, move it sideways.
    """
    return rename_unique(dir_name, "{}_failed".format(dir_name))


def truncate_name(file_name):
    """ Ensure the filename is not too long for, well, Windows basically.
    """
    path = os.path.abspath(file_name)
    if len(path) <= MAX_PATH_LENGTH:
        return path
    base, extension = os.path.splitext(path)
    inc = 0
    new_path = "{}_{}{}".format(base, inc, extension)
    while os.path.exists(new_path):
        inc += 1
        new_path = "{}_{}{}".format(base, inc, extension)
    return new_path

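# slugify() is used for every directory and image file name we create, e.g.
# slugify('My Thing: MK2?') -> 'my thing mk2'.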
def slugify(value):
    """
    Normalise the string, remove characters that are invalid in filenames
    and convert the string to lowercase.
    """
    logging.debug("Slugifying {}".format(value))
    value = unicodedata.normalize('NFKC', value).lower().strip()
    value = re.sub(r'[\\/<>:?*|"]', '', value)
    value = re.sub(r'\.*$', '', value)
    return value.strip()

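# Worker process fed from a multiprocessing.JoinableQueue. Each queue entry is
# either a thing ID string or a ThingLink; a None entry tells the worker to
# shut down.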
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """

    def __init__(self, thing_queue, download_directory, compress, api_key):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory
        self.compress = compress
        self.api_key = api_key

    def run(self):
        """ actual download loop.
        """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                break
            thing = None
            if isinstance(thing_id, str):
                thing = Thing.from_thing_id(thing_id)
            if isinstance(thing_id, ThingLink):
                thing = Thing(thing_id)
            if not thing:
                logging.error("Don't know how to handle thing_id {}".format(thing_id))
            else:
                logging.info("Handling id {}".format(thing_id))
                thing.download(self.download_directory, self.compress, self.api_key)
            self.thing_queue.task_done()
        return

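# Grouping and its subclasses fetch a paginated listing of things from the
# API. The 'get' property follows the HTTP Link headers that requests exposes
# via Response.links until there is no 'next' page left.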
class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self, quick, compress, api_key):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        self.compress = compress
        self.api_key = api_key
        # These should be set by child classes.
        self.url = None
        self.download_dir = None

    @property
    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(sanitise_url(self.url)))

        # Follow 'next' links until all items are found.
        current_url = self.url
        while current_url is not None:
            logging.info("requesting:{}".format(sanitise_url(current_url)))
            current_req = SESSION.get(current_url)
            current_url = current_req.links.get('next', {}).get('url')
            if current_req.status_code != 200:
                logging.error(
                    "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url),
                                                                    current_req.text))
            else:
                current_json = current_req.json()
                for thing in current_json:
                    logging.debug(thing)
                    self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
        logging.info("Found {} things.".format(len(self.things)))
        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            return_code = Thing(thing).download(self.download_dir, self.compress, self.api_key)
            if self.quick and return_code == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return

class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick, compress, api_key):
        Grouping.__init__(self, quick, compress, api_key)
        self.user = user
        self.name = name
        self.paginated = False
        # need to figure out the ID for the collection
        collection_url = API_USER_COLLECTIONS.format(user, api_key)
        try:
            current_req = SESSION.get(collection_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for collections for user {}: {}".format(
                self.user, error))
            return
        if current_req.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(collection_url),
                                                                current_req.text))
            return
        collection_list = current_req.json()
        try:
            # case insensitive to retain parity with previous behaviour
            collection = [x for x in collection_list if x['name'].casefold() == name.casefold()][0]
        except IndexError:
            logging.error("Unable to find collection {} for user {}".format(name, user))
            return
        self.collection_id = collection['id']
        self.url = API_COLLECTION_THINGS.format(self.collection_id, api_key)

        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))

class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory, quick, compress, api_key):
        Grouping.__init__(self, quick, compress, api_key)
        self.user = user
        self.url = API_USER_DESIGNS.format(user, api_key)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))

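# A Thing is parsed lazily: _parse() pulls the details, file list and image
# list from the API, works out where the download directory should live and
# compares file timestamps against any previous download before anything is
# fetched.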
class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_link):
        self.thing_id = thing_link.thing_id
        self.name = thing_link.name
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.download_dir = None
        self.time_stamp = None
        self._file_links = FileLinks()
        self._image_links = []

    @classmethod
    def from_thing_id(cls, thing_id):
        """
        Factory method that looks up a thing by ID and creates a Thing object for it
        :param thing_id: to look up
        :return: Thing or None
        """
        return Thing(ThingLink(thing_id, "", ""))

    def _parse(self, base_dir, api_key):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        # First get the broad details
        url = API_THING_DETAILS.format(self.thing_id, api_key)
        try:
            current_req = SESSION.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        # Check for DMCA
        if current_req.status_code == 403:
            logging.error("Access to thing {} is forbidden".format(self.thing_id))
            return
        if current_req.status_code != 200:
            logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(url),
                                                                          current_req.text))
            return

        thing_json = current_req.json()
        try:
            self._license = thing_json['license']
        except KeyError:
            logging.warning("No license found for thing {}?".format(self.thing_id))

        details = None
        try:
            details = thing_json['details']
        except KeyError:
            logging.warning("No description found for thing {}?".format(self.thing_id))

        if details:
            try:
                self._details = MLStripper.strip_tags(details)
            except ValueError as e:
                logging.warning("Unable to strip HTML from readme: {}".format(e))
                self._details = details

        if not self.name:
            # Probably generated with factory method.
            try:
                self.name = thing_json['name']
            except KeyError:
                logging.warning("No name found for thing {}?".format(self.thing_id))
                self.name = self.thing_id

        # Now get the file details
        file_url = API_THING_FILES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(file_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error("Unexpected status code {} for {}: {}".format(current_req.status_code,
                                                                        sanitise_url(file_url),
                                                                        current_req.text))
            return

        link_list = current_req.json()

        if not link_list:
            logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(
                self.thing_id))

        for link in link_list:
            logging.debug("Parsing link: {}".format(sanitise_url(link['url'])))
            try:
                datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
                self._file_links.append(
                    FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(api_key)))
            except ValueError:
                logging.error(link['date'])

        # Finally get the image links
        image_url = API_THING_IMAGES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(image_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error(
                "Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(image_url),
                                                              current_req.text))
            return

        image_list = current_req.json()

        if not image_list:
            logging.warning(
                "No images found for thing {} - probably thingiverse being iffy as this seems unlikely".format(
                    self.thing_id))

        for image in image_list:
            logging.debug("parsing image: {}".format(image))
            name = None
            try:
                name = slugify(image['name'])
                # TODO: fallback to other types
                url = [x for x in image['sizes'] if x['type'] == 'display' and x['size'] == 'large'][0]['url']
            except KeyError:
                logging.warning("Missing image for {}".format(name))
                # Without a usable URL there is nothing to queue for download.
                continue
            self._image_links.append(ImageLink(name, url))

        self.slug = "{} - {}".format(self.thing_id, slugify(self.name))
        self.download_dir = os.path.join(base_dir, self.slug)

        self._handle_old_directory(base_dir)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.name))
        latest, self.last_time = self._find_last_download(base_dir)

        if not latest:
            # Not yet downloaded
            self._parsed = True
            return

        logging.info("last downloaded version: {}".format(self.last_time))

        # OK, so we have a timestamp, let's see if there is anything new to get
        # First off, are we comparing an old download that threw away the timestamp?
        ignore_time = self.last_time == strip_time(self.last_time)
        try:
            # TODO: Allow for comparison at the exact time
            files_last_update = self._file_links.last_update
            if ignore_time:
                logging.info("Dropping time from comparison stamp as old-style download dir")
                files_last_update = strip_time(files_last_update)

            if files_last_update > self.last_time:
                logging.info(
                    "Found new/updated files {}".format(self._file_links.last_update))
                self._needs_download = True
                self._parsed = True
                return
        except TypeError:
            logging.warning("No files found for {}.".format(self.thing_id))

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def _handle_old_directory(self, base_dir):
        """ Deal with any old directories from previous versions of the code.
        """
        old_dir = os.path.join(base_dir, slugify(self.name))
        if os.path.exists(old_dir):
            logging.warning("Found old style download_dir. Moving.")
            rename_unique(old_dir, self.download_dir)

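    # If the thing has changed since the last run, the existing download
    # directory is moved aside (suffixed with " - old" or with the previous
    # download's timestamp) rather than overwritten.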
    def _handle_outdated_directory(self):
        """ Move the current download directory sideways if the thing has changed.
        """
        if not os.path.exists(self.download_dir):
            # No old directory to move.
            return None
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
        if not os.path.exists(timestamp_file):
            # Old form of download directory
            target_dir_name = "{} - old".format(self.download_dir)
        else:
            target_dir_name = "{} - {}".format(self.download_dir, self.last_time.strftime(SAFE_DATETIME_FORMAT))
        return rename_unique(self.download_dir, target_dir_name)

    def _find_last_download(self, base_dir):
        """ Look for the most recent previous download (if any) of the thing.
        """
        logging.info("Looking for old things")

        # First the DL directory itself.
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)

        latest = None
        latest_time = None

        try:
            logging.debug("Checking for existing download in normal place.")
            with open(timestamp_file) as ts_fh:
                timestamp_text = ts_fh.read().strip()
                latest_time = datetime.datetime.strptime(timestamp_text, DEFAULT_DATETIME_FORMAT)
                latest = self.download_dir
        except FileNotFoundError:
            # No existing download directory. huh.
            pass
        except TypeError:
            logging.warning("Invalid timestamp file found in {}".format(self.download_dir))

        # TODO: Maybe look for old download directories.

        # Now look for 7z files
        candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
        # +3 to allow for ' - '
        leading_length = len(self.slug) + 3
        for path in candidates:
            candidate = os.path.basename(path)
            try:
                logging.debug("Examining '{}' - '{}'".format(candidate, candidate[leading_length:-3]))
                candidate_time = datetime.datetime.strptime(candidate[leading_length:-3], SAFE_DATETIME_FORMAT)
            except ValueError:
                logging.warning("There was an error finding the date in {}. Ignoring.".format(candidate))
                continue
            try:
                if candidate_time > latest_time:
                    latest_time = candidate_time
                    latest = candidate
            except TypeError:
                latest_time = candidate_time
                latest = candidate
        logging.info("Found last old thing: {} / {}".format(latest, latest_time))
        return latest, latest_time

    def download(self, base_dir, compress, api_key):
        """ Download all files for a given thing.
            Returns a State value: OK or ALREADY_DOWNLOADED mean the thing is now
            downloaded (not necessarily that anything was fetched on this run).
        """
        if not self._parsed:
            self._parse(base_dir, api_key)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            logging.info("{} - {} already downloaded - skipping.".format(self.thing_id, self.name))
            return State.ALREADY_DOWNLOADED

        if not self._file_links:
            logging.error(
                "{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.name))
            return State.FAILED

        # Have we already downloaded some things?
        renamed_dir = self._handle_outdated_directory()

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        self.time_stamp = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            self.time_stamp = new_file_links[0].last_update

            for file_link in new_file_links:
                self.time_stamp = max(self.time_stamp, file_link.last_update)
            logging.debug("New timestamp will be {}".format(self.time_stamp))
        else:
            self.time_stamp = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    self.time_stamp = max(self.time_stamp, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(self.time_stamp))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                fl_handle.write("{},{},{}\n".format(fl.link, fl.name, fl.last_update))

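        # Resume strategy: files that have not changed are copied across from
        # the renamed previous download; anything missing or unreadable there
        # is simply added back onto the download list.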
        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        if renamed_dir:
            for file_link in old_file_links:
                try:
                    old_file = os.path.join(renamed_dir, file_link.name)
                    new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
                    logging.debug("Copying {} to {}".format(old_file, new_file))
                    copyfile(old_file, new_file)
                except FileNotFoundError:
                    logging.warning(
                        "Unable to find {} in old archive, redownloading".format(file_link.name))
                    new_file_links.append(file_link)
                except TypeError:
                    # Not altogether sure how this could occur, possibly with some combination of the old file types
                    logging.warning(
                        "Typeerror looking for {} in {}".format(file_link.name, renamed_dir))
                    new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = SESSION.get(file_link.link)
                if data_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}".format(data_req.status_code,
                                                                            sanitise_url(file_link.link)))
                    logging.debug("Unexpected status code {} for {}: {}".format(data_req.status_code,
                                                                                sanitise_url(file_link.link),
                                                                                data_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED

                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # People like images.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.join(image_dir, imagelink.name)
                image_req = SESSION.get(imagelink.link)
                if image_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(image_req.status_code,
                                                                                sanitise_url(imagelink.link),
                                                                                image_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED
                with open(truncate_name(filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(imagelink.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # Best get some licenses
        logging.info("writing license file")
        try:
            if self._license:
                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w',
                          encoding="utf-8") as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        logging.info("writing readme")
        try:
            if self._details:
                with open(truncate_name(os.path.join(self.download_dir, 'readme.txt')), 'w',
                          encoding="utf-8") as readme_handle:
                    readme_handle.write("{}\n".format(self._details))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        try:
            # Now write the timestamp
            with open(os.path.join(self.download_dir, TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(str(self.time_stamp))
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.name))
        if not compress:
            return State.OK

        thing_dir = "{} - {} - {}".format(self.thing_id,
                                          slugify(self.name),
                                          self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
        file_name = os.path.join(base_dir,
                                 "{}.7z".format(thing_dir))
        logging.debug("Compressing {} to {}".format(
            self.name,
            file_name))
        with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
            archive.writeall(self.download_dir, thing_dir)
        logging.debug("Compression of {} finished.".format(self.name))
        shutil.rmtree(self.download_dir)
        logging.debug("Removed temporary download dir of {}.".format(self.name))
        return State.OK

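# Batch files are plain text, one instruction per line, using the same
# keywords as the CLI subcommands, e.g. (hypothetical names/IDs):
#   thing 12345
#   collection some_user favourites
#   user some_user
# Note that each line is split on whitespace, so collection names containing
# spaces will not parse correctly here.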
def do_batch(batch_file, download_dir, quick, compress, api_key):
    """ Read a file in line by line, parsing each as a set of calls to this script."""
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing.from_thing_id(command_arr[1]).download(download_dir, compress, api_key)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick, compress, api_key).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick, compress, api_key).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")

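# Typical invocations (hypothetical user names and IDs):
#   thingy_grabber.py -a <api-key> thing 12345
#   thingy_grabber.py -a <api-key> -q -c user some_user
#   thingy_grabber.py -a <api-key> collection some_user favourites
#   thingy_grabber.py batch instructions.txt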
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")
    parser.add_argument("-c", "--compress", action="store_true",
                        help="Compress files")
    parser.add_argument("-a", "--api-key",
                        help="API key for thingiverse")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space separated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    if args.api_key:
        api_key = args.api_key
    else:
        try:
            with open("api.key") as fh:
                api_key = fh.read().strip()
        except Exception as e:
            logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
            logging.error("Exception: {}".format(e))
            return

    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

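    # Thing downloads are handled by worker processes pulling from a shared
    # queue; collections, users and batch files are downloaded inline below.
    # One None sentinel per worker is queued at the end to shut them down.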
    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory, args.compress, api_key) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick, args.compress, api_key).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick, args.compress, api_key).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick, args.compress, api_key)

    # Stop the downloader processes
    for _ in downloaders:
        thing_queue.put(None)


if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()