#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""
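# Example invocations (illustrative; see the argparse setup in main() below for
# the full option list - the API key can also live in a local file called api.key):
#   python3 thingy_grabber.py -a <api-key> thing 12345
#   python3 thingy_grabber.py -a <api-key> user someuser
#   python3 thingy_grabber.py -a <api-key> collection someuser "some collection"
#   python3 thingy_grabber.py -a <api-key> batch batch.txt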

import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
import multiprocessing
import enum
import datetime
from shutil import copyfile
from dataclasses import dataclass
import py7zr
import glob
import shutil
from io import StringIO
from html.parser import HTMLParser

SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]

# I don't think this is exported by datetime
DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# Windows cannot handle : in filenames
SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'

API_BASE = "https://api.thingiverse.com"
ACCESS_QP = "access_token={}"
PAGE_QP = "page={}"
API_USER_DESIGNS = API_BASE + "/users/{}/things/?" + ACCESS_QP
API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP

# Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS
API_COLLECTION = API_BASE + "/collections/{}/?" + ACCESS_QP
API_COLLECTION_THINGS = API_BASE + "/collections/{}/things/?" + ACCESS_QP

API_THING_DETAILS = API_BASE + "/things/{}/?" + ACCESS_QP
API_THING_FILES = API_BASE + "/things/{}/files/?" + ACCESS_QP
API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP
API_THING_DOWNLOAD = "/download/?" + ACCESS_QP
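# The templates above are filled in with str.format, e.g. (with an illustrative id and key)
# API_THING_FILES.format(12345, "abc123") gives
# "https://api.thingiverse.com/things/12345/files/?access_token=abc123".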

DOWNLOADER_COUNT = 1
RETRY_COUNT = 3

MAX_PATH_LENGTH = 250

VERSION = "0.10.4"

TIMESTAMP_FILE = "timestamp.txt"

SESSION = requests.Session()


class MLStripper(HTMLParser):
    """ Turns HTML markup into plain text
    """

    def error(self, message):
        raise ValueError(message)

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = StringIO()

    def handle_data(self, d):
        self.text.write(d)

    def get_data(self):
        return self.text.getvalue()

    @staticmethod
    def strip_tags(html):
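        """ Return the plain-text content of an HTML fragment,
        e.g. strip_tags('<p>hi</p>') -> 'hi'.
        """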
        s = MLStripper()
        s.feed(html)
        return s.get_data()


@dataclass
class ThingLink:
    thing_id: str
    name: str
    api_link: str


@dataclass
class FileLink:
    name: str
    last_update: datetime.datetime
    link: str


@dataclass
class ImageLink:
    name: str
    link: str


class FileLinks:
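    """ A list-like collection of FileLink objects that also tracks the newest
    last_update timestamp seen across the links appended to it.
    """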
    def __init__(self, initial_links=None):
        if initial_links is None:
            initial_links = []
        self.links = []
        self.last_update = None
        for link in initial_links:
            self.append(link)

    def __iter__(self):
        return iter(self.links)

    def __getitem__(self, item):
        return self.links[item]

    def __len__(self):
        return len(self.links)

    def append(self, link):
        try:
            self.last_update = max(self.last_update, link.last_update)
        except TypeError:
            self.last_update = link.last_update
        self.links.append(link)


class State(enum.Enum):
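    """ Outcome of a Thing download; ALREADY_DOWNLOADED is what --quick uses to stop early. """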
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()


def sanitise_url(url):
    """ remove api keys from an url
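    e.g. "?access_token=abc123" becomes "?access_token=***".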
    """
    return re.sub(r'access_token=\w*',
                  'access_token=***',
                  url)


def strip_time(date_obj):
    """ Takes a datetime object and returns another with the time set to 00:00
    """
    return datetime.datetime.combine(date_obj.date(), datetime.time())


def rename_unique(dir_name, target_dir_name):
    """ Move a directory sideways to a new name, ensuring it is unique.
    """
    target_dir = target_dir_name
    inc = 0
    while os.path.exists(target_dir):
        target_dir = "{}_{}".format(target_dir_name, inc)
        inc += 1
    os.rename(dir_name, target_dir)
    return target_dir


def fail_dir(dir_name):
    """ When a download has failed, move it sideways.
    """
    return rename_unique(dir_name, "{}_failed".format(dir_name))


def truncate_name(file_name):
    """ Ensure the filename is not too long for, well windows basically.
    """
    path = os.path.abspath(file_name)
    if len(path) <= MAX_PATH_LENGTH:
        return path
    base, extension = os.path.splitext(path)
    inc = 0
    new_path = "{}_{}{}".format(base, inc, extension)
    while os.path.exists(new_path):
        new_path = "{}_{}{}".format(base, inc, extension)
        inc += 1
    return new_path


def slugify(value):
    """
    Normalise the string, remove characters that are invalid in filenames
    and convert it to lowercase.
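    e.g. slugify("My Thing: MK2") -> "my thing mk2" (illustrative).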
    """
    logging.debug("Slugifying {}".format(value))
    value = unicodedata.normalize('NFKC', value).lower().strip()
    value = re.sub(r'[\\/<>:?*|"]', '', value)
    value = re.sub(r'\.*$', '', value)
    return value.strip()


class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """

    def __init__(self, thing_queue, download_directory, compress, api_key):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory
        self.compress = compress
        self.api_key = api_key

    def run(self):
        """ actual download loop.
        """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                break
            thing = None
            if isinstance(thing_id, str):
                thing = Thing.from_thing_id(thing_id)
            if isinstance(thing_id, ThingLink):
                thing = Thing(thing_id)
            if not thing:
                logging.error("Don't know how to handle thing_id {}".format(thing_id))
            else:
                logging.info("Handling id {}".format(thing_id))
                thing.download(self.download_directory, self.compress, self.api_key)
            self.thing_queue.task_done()
        return


class Grouping:
    """ Holds details of a group of things for download
    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.
    """

    def __init__(self, quick, compress, api_key):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        self.compress = compress
        self.api_key = api_key
        # These should be set by child classes.
        self.url = None
        self.download_dir = None

    @property
    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(sanitise_url(self.url)))

        # self.url should already have been formatted as we don't need pagination
        logging.info("requesting:{}".format(sanitise_url(self.url)))
        current_req = SESSION.get(self.url)
        if current_req.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url),
                                                                current_req.text))
        else:
            current_json = current_req.json()
            for thing in current_json:
                logging.info(thing)
                self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
        logging.info("Found {} things.".format(len(self.things)))
        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
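            # NB: 'get' is a property, so just referencing it populates self.things.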
            self.get

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            return_code = Thing(thing).download(self.download_dir, self.compress, self.api_key)
            if self.quick and return_code == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return


class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick, compress, api_key):
        Grouping.__init__(self, quick, compress, api_key)
        self.user = user
        self.name = name
        self.paginated = False
        # need to figure out the ID for the collection
        collection_url = API_USER_COLLECTIONS.format(user, api_key)
        try:
            current_req = SESSION.get(collection_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for collections for user {}: {}".format(
                self.user, error))
            return
        if current_req.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(collection_url),
                                                                current_req.text))
            return
        collection_list = current_req.json()
        try:
            # case insensitive to retain parity with previous behaviour
            collection = [x for x in collection_list if x['name'].casefold() == name.casefold()][0]
        except IndexError:
            logging.error("Unable to find collection {} for user {}".format(name, user))
            return
        self.collection_id = collection['id']
        self.url = API_COLLECTION_THINGS.format(self.collection_id, api_key)

        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))


class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory, quick, compress, api_key):
        Grouping.__init__(self, quick, compress, api_key)
        self.user = user
        self.url = API_USER_DESIGNS.format(user, api_key)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))


class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_link):
        self.thing_id = thing_link.thing_id
        self.name = thing_link.name
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.download_dir = None
        self.time_stamp = None
        self._file_links = FileLinks()
        self._image_links = []

    @classmethod
    def from_thing_id(cls, thing_id):
        """
        Factory method that looks up a thing by ID and creates a Thing object for it
        :param thing_id: to look up
        :return: Thing
        """
        return Thing(ThingLink(thing_id, "", ""))

    def _parse(self, base_dir, api_key):
        """ Work out what, if anything needs to be done. """
        if self._parsed:
            return

        # First get the broad details
        url = API_THING_DETAILS.format(self.thing_id, api_key)
        try:
            current_req = SESSION.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        # Check for DMCA
        if current_req.status_code == 403:
            logging.error("Access to thing {} is forbidden".format(self.thing_id))
            return
        if current_req.status_code != 200:
            logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(url),
                                                                           current_req.text))
            return

        thing_json = current_req.json()
        try:
            self._license = thing_json['license']
        except KeyError:
            logging.warning("No license found for thing {}?".format(self.thing_id))

        details = None
        try:
            details = thing_json['details']
        except KeyError:
            logging.warning("No description found for thing {}?".format(self.thing_id))

        if details:
            try:
                self._details = MLStripper.strip_tags(details)
            except ValueError as e:
                logging.warning("Unable to strip HTML from readme: {}".format(e))
                self._details = details

        if not self.name:
            # Probably generated with factory method.
            try:
                self.name = thing_json['name']
            except KeyError:
                logging.warning("No name found for thing {}?".format(self.thing_id))
                self.name = self.thing_id

        # Now get the file details
        file_url = API_THING_FILES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(file_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error("Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(file_url),
                                                                        current_req.text))
            return

        link_list = current_req.json()

        if not link_list:
            logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(
                self.thing_id))

        for link in link_list:
            logging.debug("Parsing link: {}".format(sanitise_url(link['url'])))
            try:
                datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
                self._file_links.append(
                    FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(api_key)))
            except ValueError:
                logging.error(link['date'])

        # Finally get the image links
        image_url = API_THING_IMAGES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(image_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error(
                "Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(image_url),
                                                              current_req.text))
            return

        image_list = current_req.json()

        if not image_list:
            logging.warning(
                "No images found for thing {} - probably thingiverse being iffy as this seems unlikely".format(
                    self.thing_id))

        for image in image_list:
            logging.debug("parsing image: {}".format(image))
            name = None
            try:
                name = slugify(image['name'])
                # TODO: fallback to other types
                url = [x for x in image['sizes'] if x['type'] == 'display' and x['size'] == 'large'][0]['url']
            except KeyError:
                logging.warning("Missing image for {}".format(name))
                # Skip this image rather than appending an entry with no URL.
                continue
            self._image_links.append(ImageLink(name, url))

        self.slug = "{} - {}".format(self.thing_id, slugify(self.name))
        self.download_dir = os.path.join(base_dir, self.slug)

        self._handle_old_directory(base_dir)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.name))
        latest, self.last_time = self._find_last_download(base_dir)

        if not latest:
            # Not yet downloaded
            self._parsed = True
            return

        logging.info("last downloaded version: {}".format(self.last_time))

        # OK, so we have a timestamp, let's see if there is anything new to get
        # First off, are we comparing an old download that threw away the timestamp?
        ignore_time = self.last_time == strip_time(self.last_time)
        try:
            # TODO: Allow for comparison at the exact time
            files_last_update = self._file_links.last_update
            if ignore_time:
                logging.info("Dropping time from comparison stamp as old-style download dir")
                files_last_update = strip_time(files_last_update)

            if files_last_update > self.last_time:
                logging.info(
                    "Found new/updated files {}".format(self._file_links.last_update))
                self._needs_download = True
                self._parsed = True
                return
        except TypeError:
            logging.warning("No files found for {}.".format(self.thing_id))

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True
    def _handle_old_directory(self, base_dir):
        """ Deal with any old directories from previous versions of the code.
        """
        old_dir = os.path.join(base_dir, slugify(self.name))
        if os.path.exists(old_dir):
            logging.warning("Found old style download_dir. Moving.")
            rename_unique(old_dir, self.download_dir)

    def _handle_outdated_directory(self):
        """ Move the current download directory sideways if the thing has changed.
        """
        if not os.path.exists(self.download_dir):
            # No old directory to move.
            return None
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
        if not os.path.exists(timestamp_file):
            # Old form of download directory
            target_dir_name = "{} - old".format(self.download_dir)
        else:
            target_dir_name = "{} - {}".format(self.download_dir, self.last_time.strftime(SAFE_DATETIME_FORMAT))
        return rename_unique(self.download_dir, target_dir_name)

    def _find_last_download(self, base_dir):
        """ Look for the most recent previous download (if any) of the thing.
        """
        logging.info("Looking for old things")

        # First the DL directory itself.
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)

        latest = None
        latest_time = None

        try:
            logging.debug("Checking for existing download in normal place.")
            with open(timestamp_file) as ts_fh:
                timestamp_text = ts_fh.read().strip()
                latest_time = datetime.datetime.strptime(timestamp_text, DEFAULT_DATETIME_FORMAT)
                latest = self.download_dir
        except FileNotFoundError:
            # No existing download directory. huh.
            pass
        except TypeError:
            logging.warning("Invalid timestamp file found in {}".format(self.download_dir))

        # TODO: Maybe look for old download directories.

        # Now look for 7z files
        candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
        # +3 to allow for ' - '
        leading_length = len(self.slug) + 3
        for path in candidates:
            candidate = os.path.basename(path)
            try:
                logging.debug("Examining '{}' - '{}'".format(candidate, candidate[leading_length:-3]))
                candidate_time = datetime.datetime.strptime(candidate[leading_length:-3], SAFE_DATETIME_FORMAT)
            except ValueError:
                logging.warning("There was an error finding the date in {}. Ignoring.".format(candidate))
                continue
            try:
                if candidate_time > latest_time:
                    latest_time = candidate_time
                    latest = candidate
            except TypeError:
                latest_time = candidate_time
                latest = candidate
        logging.info("Found last old thing: {} / {}".format(latest, latest_time))
        return latest, latest_time

    def download(self, base_dir, compress, api_key):
        """ Download all files for a given thing.
        Returns a State: OK or ALREADY_DOWNLOADED if the thing is now downloaded
        (not necessarily because anything was fetched this time), FAILED otherwise.
        """
        if not self._parsed:
            self._parse(base_dir, api_key)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            logging.info("{} - {} already downloaded - skipping.".format(self.thing_id, self.name))
            return State.ALREADY_DOWNLOADED

        if not self._file_links:
            logging.error(
                "{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.name))
            return State.FAILED

        # Have we already downloaded some things?
        renamed_dir = self._handle_outdated_directory()

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        self.time_stamp = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            self.time_stamp = new_file_links[0].last_update

            for file_link in new_file_links:
                self.time_stamp = max(self.time_stamp, file_link.last_update)
            logging.debug("New timestamp will be {}".format(self.time_stamp))
        else:
            self.time_stamp = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    self.time_stamp = max(self.time_stamp, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(self.time_stamp))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
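        # Record one "link,name,last_update" line per known file.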
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                fl_handle.write("{},{},{}\n".format(fl.link, fl.name, fl.last_update))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        if renamed_dir:
            for file_link in old_file_links:
                try:
                    old_file = os.path.join(renamed_dir, file_link.name)
                    new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
                    logging.debug("Copying {} to {}".format(old_file, new_file))
                    copyfile(old_file, new_file)
                except FileNotFoundError:
                    logging.warning(
                        "Unable to find {} in old archive, redownloading".format(file_link.name))
                    new_file_links.append(file_link)
                except TypeError:
                    # Not altogether sure how this could occur, possibly with some combination of the old file types
                    logging.warning(
                        "TypeError looking for {} in {}".format(file_link.name, renamed_dir))
                    new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = SESSION.get(file_link.link)
                if data_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(data_req.status_code,
                                                                                sanitise_url(file_link.link),
                                                                                data_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED

                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # People like images.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.join(image_dir, imagelink.name)
                image_req = SESSION.get(imagelink.link)
                if image_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(image_req.status_code,
                                                                                sanitise_url(imagelink.link),
                                                                                image_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED
                with open(truncate_name(filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(imagelink.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # Best get some licenses
        logging.info("writing license file")
        try:
            if self._license:
                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w',
                          encoding="utf-8") as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        logging.info("writing readme")
        try:
            if self._details:
                with open(truncate_name(os.path.join(self.download_dir, 'readme.txt')), 'w',
                          encoding="utf-8") as readme_handle:
                    readme_handle.write("{}\n".format(self._details))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        try:
            # Now write the timestamp
            with open(os.path.join(self.download_dir, TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(str(self.time_stamp))
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.name))
        if not compress:
            return State.OK

        thing_dir = "{} - {} - {}".format(self.thing_id,
                                          slugify(self.name),
                                          self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
        file_name = os.path.join(base_dir,
                                 "{}.7z".format(thing_dir))
        logging.debug("Compressing {} to {}".format(
            self.name,
            file_name))
        with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
            archive.writeall(self.download_dir, thing_dir)
        logging.debug("Compression of {} finished.".format(self.name))
        shutil.rmtree(self.download_dir)
        logging.debug("Removed temporary download dir of {}.".format(self.name))
        return State.OK


def do_batch(batch_file, download_dir, quick, compress, api_key):
    """ Read a file line by line, parsing each as a set of calls to this script."""
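    # Recognised instructions, one per line (anything else is skipped with a warning):
    #   thing <thing_id>
    #   collection <owner> <collection_name>
    #   user <username>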
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing.from_thing_id(command_arr[1]).download(download_dir, compress, api_key)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick, compress, api_key).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick, compress, api_key).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")


def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")
    parser.add_argument("-c", "--compress", action="store_true",
                        help="Compress files")
    parser.add_argument("-a", "--api-key",
                        help="API key for thingiverse")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space separated list of the name(s) of the collection(s) to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    if args.api_key:
        api_key = args.api_key
    else:
        try:
            with open("api.key") as fh:
                api_key = fh.read().strip()
        except Exception as e:
            logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
            logging.error("Exception: {}".format(e))
            return

    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory, args.compress, api_key) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick, args.compress, api_key).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick, args.compress, api_key).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick, args.compress, api_key)

    # Stop the downloader processes
    for _ in downloaders:
        thing_queue.put(None)


if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()