#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
import multiprocessing
import enum
import datetime
from shutil import copyfile
from dataclasses import dataclass
import py7zr
import glob
import shutil
from io import StringIO
from html.parser import HTMLParser

SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]

# I don't think this is exported by datetime
DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# Windows cannot handle : in filenames
SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'

API_BASE = "https://api.thingiverse.com"
ACCESS_QP = "access_token={}"
PAGE_QP = "page={}"
API_USER_DESIGNS = API_BASE + "/users/{}/things/?" + ACCESS_QP
API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP

# Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS
API_COLLECTION = API_BASE + "/collections/{}/?" + ACCESS_QP
API_COLLECTION_THINGS = API_BASE + "/collections/{}/things/?" + ACCESS_QP

API_THING_DETAILS = API_BASE + "/things/{}/?" + ACCESS_QP
API_THING_FILES = API_BASE + "/things/{}/files/?" + ACCESS_QP
API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP
API_THING_DOWNLOAD = "/download/?" + ACCESS_QP
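# Note: API_THING_DOWNLOAD is a relative query string; it is appended to the file URLs
# returned by the files endpoint (see Thing._parse). PAGE_QP appears unused in this file -
# pagination follows the API's 'next' links instead (see Grouping.get).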

DOWNLOADER_COUNT = 1
RETRY_COUNT = 3

MAX_PATH_LENGTH = 250

VERSION = "0.10.5"

TIMESTAMP_FILE = "timestamp.txt"

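# One shared requests.Session so HTTP connections can be reused across API calls.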
SESSION = requests.Session()


class MLStripper(HTMLParser):
    """ Turns HTML markup into plain text
    """
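    # Used to flatten a thing's HTML description into plain text before it is
    # written out as readme.txt (see Thing._parse / Thing.download).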

    def error(self, message):
        raise ValueError(message)

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = StringIO()

    def handle_data(self, d):
        self.text.write(d)

    def get_data(self):
        return self.text.getvalue()

    @staticmethod
    def strip_tags(html):
        s = MLStripper()
        s.feed(html)
        return s.get_data()


@dataclass
class ThingLink:
    thing_id: str
    name: str
    api_link: str


@dataclass
class FileLink:
    name: str
    last_update: datetime.datetime
    link: str


@dataclass
class ImageLink:
    name: str
    link: str


class FileLinks:
    def __init__(self, initial_links=None):
        if initial_links is None:
            initial_links = []
        self.links = []
        self.last_update = None
        for link in initial_links:
            self.append(link)

    def __iter__(self):
        return iter(self.links)

    def __getitem__(self, item):
        return self.links[item]

    def __len__(self):
        return len(self.links)

    def append(self, link):
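        # self.last_update starts as None, and max() with None raises TypeError,
        # so the first appended link's timestamp is taken as-is.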
        try:
            self.last_update = max(self.last_update, link.last_update)
        except TypeError:
            self.last_update = link.last_update
        self.links.append(link)


class State(enum.Enum):
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()


def sanitise_url(url):
    """ Remove API keys from a URL
    """
    return re.sub(r'access_token=\w*',
                  'access_token=***',
                  url)


def strip_time(date_obj):
    """ Takes a datetime object and returns another with the time set to 00:00
    """
    return datetime.datetime.combine(date_obj.date(), datetime.time())


def rename_unique(dir_name, target_dir_name):
    """ Move a directory sideways to a new name, ensuring it is unique.
    """
    target_dir = target_dir_name
    inc = 0
    while os.path.exists(target_dir):
        target_dir = "{}_{}".format(target_dir_name, inc)
        inc += 1
    os.rename(dir_name, target_dir)
    return target_dir


def fail_dir(dir_name):
    """ When a download has failed, move it sideways.
    """
    return rename_unique(dir_name, "{}_failed".format(dir_name))


def truncate_name(file_name):
    """ Ensure the filename is not too long for, well, Windows basically.
    """
    path = os.path.abspath(file_name)
    if len(path) <= MAX_PATH_LENGTH:
        return path
    base, extension = os.path.splitext(path)
    inc = 0
    new_path = "{}_{}{}".format(base, inc, extension)
    while os.path.exists(new_path):
        inc += 1
        new_path = "{}_{}{}".format(base, inc, extension)
    return new_path


def slugify(value):
    """
    Normalise the string, remove characters that are invalid in filenames
    and convert it to lowercase.
    """
    logging.debug("Slugifying {}".format(value))
    value = unicodedata.normalize('NFKC', value).lower().strip()
    value = re.sub(r'[\\/<>:?*|"]', '', value)
    value = re.sub(r'\.*$', '', value)
    return value.strip()


class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """

    def __init__(self, thing_queue, download_directory, compress, api_key):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory
        self.compress = compress
        self.api_key = api_key

    def run(self):
        """ Actual download loop.
        """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                logging.info("Shutting down download queue")
                self.thing_queue.task_done()
                break
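            # The queue can hold either a bare thing-id string (from the CLI) or a
            # ThingLink; normalise both into a Thing before downloading.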
            thing = None
            if isinstance(thing_id, str):
                thing = Thing.from_thing_id(thing_id)
            if isinstance(thing_id, ThingLink):
                thing = Thing(thing_id)
            if not thing:
                logging.error("Don't know how to handle thing_id {}".format(thing_id))
            else:
                logging.info("Handling id {}".format(thing_id))
                thing.download(self.download_directory, self.compress, self.api_key)
            self.thing_queue.task_done()
        return


class Grouping:
    """ Holds details of a group of things for download
    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.
    """

    def __init__(self, quick, compress, api_key):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        self.compress = compress
        self.api_key = api_key
        # These should be set by child classes.
        self.url = None
        self.download_dir = None

    @property
    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(sanitise_url(self.url)))

        # follow next links until all items are found
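        # requests exposes the HTTP Link header via response.links, so each page's
        # 'next' URL is followed until the header is absent.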
        current_url = self.url
        while current_url is not None:
            logging.info("requesting:{}".format(sanitise_url(current_url)))
            current_req = SESSION.get(current_url)
            current_url = current_req.links.get('next', {}).get('url')
            if current_req.status_code != 200:
                logging.error(
                    "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url),
                                                                    current_req.text))
            else:
                current_json = current_req.json()
                for thing in current_json:
                    logging.debug(thing)
                    self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
        logging.info("Found {} things.".format(len(self.things)))
        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
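            # 'get' is a property, so merely referencing it fetches and caches the things.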
            self.get

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(len(self.things)))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            return_code = Thing(thing).download(self.download_dir, self.compress, self.api_key)
            if self.quick and return_code == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return


class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick, compress, api_key):
        Grouping.__init__(self, quick, compress, api_key)
        self.user = user
        self.name = name
        self.paginated = False
        # need to figure out the ID for the collection
        collection_url = API_USER_COLLECTIONS.format(user, api_key)
        try:
            current_req = SESSION.get(collection_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for collections for user {}: {}".format(
                self.user, error))
            return
        if current_req.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(collection_url),
                                                                current_req.text))
            return
        collection_list = current_req.json()
        try:
            # case insensitive to retain parity with previous behaviour
            collection = [x for x in collection_list if x['name'].casefold() == name.casefold()][0]
        except IndexError:
            logging.error("Unable to find collection {} for user {}".format(name, user))
            return
        self.collection_id = collection['id']
        self.url = API_COLLECTION_THINGS.format(self.collection_id, api_key)

        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))


class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory, quick, compress, api_key):
        Grouping.__init__(self, quick, compress, api_key)
        self.user = user
        self.url = API_USER_DESIGNS.format(user, api_key)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))


class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_link):
        self.thing_id = thing_link.thing_id
        self.name = thing_link.name
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.download_dir = None
        self.time_stamp = None
        self._file_links = FileLinks()
        self._image_links = []
        # Defaults, as _parse() may not set these if the API response omits the fields.
        self._license = None
        self._details = None

    @classmethod
    def from_thing_id(cls, thing_id):
        """
        Factory method that looks up a thing by ID and creates a Thing object for it
        :param thing_id: to look up
        :return: Thing or None
        """
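        # The placeholder name is filled in later by _parse() once the API has been queried.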
        return Thing(ThingLink(thing_id, "", ""))

    def _parse(self, base_dir, api_key):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        # First get the broad details
        url = API_THING_DETAILS.format(self.thing_id, api_key)
        try:
            current_req = SESSION.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        # Check for DMCA
        if current_req.status_code == 403:
            logging.error("Access to thing {} is forbidden".format(self.thing_id))
            return
        if current_req.status_code != 200:
            logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(url),
                                                                          current_req.text))
            return

        thing_json = current_req.json()
        try:
            self._license = thing_json['license']
        except KeyError:
            logging.warning("No license found for thing {}?".format(self.thing_id))

        details = None
        try:
            details = thing_json['details']
        except KeyError:
            logging.warning("No description found for thing {}?".format(self.thing_id))

        if details:
            try:
                self._details = MLStripper.strip_tags(details)
            except ValueError as e:
                logging.warning("Unable to strip HTML from readme: {}".format(e))
                self._details = details

        if not self.name:
            # Probably generated with factory method.
            try:
                self.name = thing_json['name']
            except KeyError:
                logging.warning("No name found for thing {}?".format(self.thing_id))
                self.name = self.thing_id

        # Now get the file details
        file_url = API_THING_FILES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(file_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error("Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(file_url),
                                                                        current_req.text))
            return

        link_list = current_req.json()

        if not link_list:
            logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(
                self.thing_id))

        for link in link_list:
            logging.debug("Parsing link: {}".format(sanitise_url(link['url'])))
            try:
                datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
                self._file_links.append(
                    FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(api_key)))
            except ValueError:
                logging.error(link['date'])

        # Finally get the image links
        image_url = API_THING_IMAGES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(image_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error(
                "Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(image_url),
                                                              current_req.text))
            return

        image_list = current_req.json()

        if not image_list:
            logging.warning(
                "No images found for thing {} - probably thingiverse being iffy as this seems unlikely".format(
                    self.thing_id))

        for image in image_list:
            logging.debug("parsing image: {}".format(image))
            name = None
            try:
                name = slugify(image['name'])
                # TODO: fallback to other types
                url = [x for x in image['sizes'] if x['type'] == 'display' and x['size'] == 'large'][0]['url']
            except KeyError:
                logging.warning("Missing image for {}".format(name))
                # Skip this image rather than appending a stale or undefined url.
                continue
            self._image_links.append(ImageLink(name, url))

        self.slug = "{} - {}".format(self.thing_id, slugify(self.name))
        self.download_dir = os.path.join(base_dir, self.slug)

        self._handle_old_directory(base_dir)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.name))
        latest, self.last_time = self._find_last_download(base_dir)

        if not latest:
            # Not yet downloaded
            self._parsed = True
            return

        logging.info("last downloaded version: {}".format(self.last_time))

        # OK, so we have a timestamp, let's see if there is anything new to get
        # First off, are we comparing an old download that threw away the timestamp?
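        # (strip_time() zeroes the time-of-day, so a timestamp equal to its stripped
        # form means only a date was recorded.)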
        ignore_time = self.last_time == strip_time(self.last_time)
        try:
            # TODO: Allow for comparison at the exact time
            files_last_update = self._file_links.last_update
            if ignore_time:
                logging.info("Dropping time from comparison stamp as old-style download dir")
                files_last_update = strip_time(files_last_update)

            if files_last_update > self.last_time:
                logging.info(
                    "Found new/updated files {}".format(self._file_links.last_update))
                self._needs_download = True
                self._parsed = True
                return
        except TypeError:
            logging.warning("No files found for {}.".format(self.thing_id))

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def _handle_old_directory(self, base_dir):
        """ Deal with any old directories from previous versions of the code.
        """
        old_dir = os.path.join(base_dir, slugify(self.name))
        if os.path.exists(old_dir):
            logging.warning("Found old style download_dir. Moving.")
            rename_unique(old_dir, self.download_dir)

    def _handle_outdated_directory(self):
        """ Move the current download directory sideways if the thing has changed.
        """
        if not os.path.exists(self.download_dir):
            # No old directory to move.
            return None
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
        if not os.path.exists(timestamp_file):
            # Old form of download directory
            target_dir_name = "{} - old".format(self.download_dir)
        else:
            target_dir_name = "{} - {}".format(self.download_dir, self.last_time.strftime(SAFE_DATETIME_FORMAT))
        return rename_unique(self.download_dir, target_dir_name)

    def _find_last_download(self, base_dir):
        """ Look for the most recent previous download (if any) of the thing.
        """
        logging.info("Looking for old things")

        # First the DL directory itself.
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)

        latest = None
        latest_time = None

        try:
            logging.debug("Checking for existing download in normal place.")
            with open(timestamp_file) as ts_fh:
                timestamp_text = ts_fh.read().strip()
            latest_time = datetime.datetime.strptime(timestamp_text, DEFAULT_DATETIME_FORMAT)
            latest = self.download_dir
        except FileNotFoundError:
            # No existing download directory. huh.
            pass
        except TypeError:
            logging.warning("Invalid timestamp file found in {}".format(self.download_dir))

        # TODO: Maybe look for old download directories.

        # Now look for 7z files
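        # Compressed runs are archived as '{slug} - {timestamp}.7z' (see download() below);
        # leading_length skips the slug and ' - ' so the timestamp can be parsed from the name.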
        candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
        # +3 to allow for ' - '
        leading_length = len(self.slug) + 3
        for path in candidates:
            candidate = os.path.basename(path)
            try:
                logging.debug("Examining '{}' - '{}'".format(candidate, candidate[leading_length:-3]))
                candidate_time = datetime.datetime.strptime(candidate[leading_length:-3], SAFE_DATETIME_FORMAT)
            except ValueError:
                logging.warning("There was an error finding the date in {}. Ignoring.".format(candidate))
                continue
            try:
                if candidate_time > latest_time:
                    latest_time = candidate_time
                    latest = candidate
            except TypeError:
                latest_time = candidate_time
                latest = candidate
        logging.info("Found last old thing: {} / {}".format(latest, latest_time))
        return latest, latest_time

    def download(self, base_dir, compress, api_key):
        """ Download all files for a given thing.
        Returns a State indicating whether the thing is now downloaded
        (not whether this particular call downloaded it!)
        """
        if not self._parsed:
            self._parse(base_dir, api_key)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            logging.info("{} - {} already downloaded - skipping.".format(self.thing_id, self.name))
            return State.ALREADY_DOWNLOADED

        if not self._file_links:
            logging.error(
                "{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.name))
            return State.FAILED

        # Have we already downloaded some things?
        renamed_dir = self._handle_outdated_directory()

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        self.time_stamp = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            self.time_stamp = new_file_links[0].last_update

            for file_link in new_file_links:
                self.time_stamp = max(self.time_stamp, file_link.last_update)
            logging.debug("New timestamp will be {}".format(self.time_stamp))
        else:
            self.time_stamp = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    self.time_stamp = max(self.time_stamp, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(self.time_stamp))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                fl_handle.write("{},{},{}\n".format(fl.link, fl.name, fl.last_update))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        if renamed_dir:
            for file_link in old_file_links:
                try:
                    old_file = os.path.join(renamed_dir, file_link.name)
                    new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
                    logging.debug("Copying {} to {}".format(old_file, new_file))
                    copyfile(old_file, new_file)
                except FileNotFoundError:
                    logging.warning(
                        "Unable to find {} in old archive, redownloading".format(file_link.name))
                    new_file_links.append(file_link)
                except TypeError:
                    # Not altogether sure how this could occur, possibly with some combination of the old file types
                    logging.warning(
                        "TypeError looking for {} in {}".format(file_link.name, renamed_dir))
                    new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = SESSION.get(file_link.link)
                if data_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(data_req.status_code,
                                                                                sanitise_url(file_link.link),
                                                                                data_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED

                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # People like images.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.join(image_dir, imagelink.name)
                image_req = SESSION.get(imagelink.link)
                if image_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(image_req.status_code,
                                                                                sanitise_url(imagelink.link),
                                                                                image_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED
                with open(truncate_name(filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(imagelink.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # Best get some licenses
        logging.info("writing license file")
        try:
            if self._license:
                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w',
                          encoding="utf-8") as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        logging.info("writing readme")
        try:
            if self._details:
                with open(truncate_name(os.path.join(self.download_dir, 'readme.txt')), 'w',
                          encoding="utf-8") as readme_handle:
                    readme_handle.write("{}\n".format(self._details))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        try:
            # Now write the timestamp
            with open(os.path.join(self.download_dir, TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(str(self.time_stamp))
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.name))
        if not compress:
            return State.OK

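        # The archive name embeds the timestamp in SAFE_DATETIME_FORMAT so that
        # _find_last_download() can recover it from the filename later.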
        thing_dir = "{} - {} - {}".format(self.thing_id,
                                          slugify(self.name),
                                          self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
        file_name = os.path.join(base_dir,
                                 "{}.7z".format(thing_dir))
        logging.debug("Compressing {} to {}".format(
            self.name,
            file_name))
        with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
            archive.writeall(self.download_dir, thing_dir)
        logging.debug("Compression of {} finished.".format(self.name))
        shutil.rmtree(self.download_dir)
        logging.debug("Removed temporary download dir of {}.".format(self.name))
        return State.OK


def do_batch(batch_file, download_dir, quick, compress, api_key):
    """ Read a file line by line, parsing each line as an instruction to this script."""
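    # Supported instructions, one per line:
    #   thing <thing_id>
    #   collection <user> <collection_name>
    #   user <user>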
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing.from_thing_id(command_arr[1]).download(download_dir, compress, api_key)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick, compress, api_key).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick, compress, api_key).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")



def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")
    parser.add_argument("-c", "--compress", action="store_true",
                        help="Compress files")
    parser.add_argument("-a", "--api-key",
                        help="API key for thingiverse")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space separated list of the name(s) of collection(s) to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

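    # The root logger stays at DEBUG; each handler applies its own threshold
    # (console uses --log-level, the optional log file always gets DEBUG).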
    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    if args.api_key:
        api_key = args.api_key
    else:
        try:
            with open("api.key") as fh:
                api_key = fh.read().strip()
        except Exception as e:
            logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
            logging.error("Exception: {}".format(e))
            return

    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory, args.compress, api_key) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick, args.compress, api_key).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick, args.compress, api_key).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick, args.compress, api_key)

    # Stop the downloader processes
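    # Each worker exits when it pulls a None sentinel off the queue (see Downloader.run).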
    for _ in downloaders:
        thing_queue.put(None)


if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()