#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

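# Typical invocations (see --help for the full set of options):
#   thingy_grabber.py -a <api-key> thing 12345
#   thingy_grabber.py -a <api-key> user some_designer
#   thingy_grabber.py -a <api-key> collection some_user some_collection
#   thingy_grabber.py -a <api-key> batch batch_file.txt
# The API key may instead be placed in a file named 'api.key' in the working directory.
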
import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
import multiprocessing
import enum
import datetime
from shutil import copyfile
from dataclasses import dataclass
import py7zr
import glob
import shutil

SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]

# I don't think this is exported by datetime
DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# Windows cannot handle : in filenames
SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'

API_BASE = "https://api.thingiverse.com"
ACCESS_QP = "access_token={}"
PAGE_QP = "page={}"
API_USER_DESIGNS = API_BASE + "/users/{}/things/?" + ACCESS_QP
API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP

# Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS
API_COLLECTION = API_BASE + "/collections/{}/?" + ACCESS_QP
API_COLLECTION_THINGS = API_BASE + "/collections/{}/things/?" + ACCESS_QP

API_THING_DETAILS = API_BASE + "/things/{}/?" + ACCESS_QP
API_THING_FILES = API_BASE + "/things/{}/files/?" + ACCESS_QP
API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP
API_THING_DOWNLOAD = "/download/?" + ACCESS_QP
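# The templates above are filled in with str.format(); for example,
# API_THING_DETAILS.format(1234, api_key) expands to
# "https://api.thingiverse.com/things/1234/?access_token=<key>".
# API_THING_DOWNLOAD is appended to a file's own URL rather than to API_BASE.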

DOWNLOADER_COUNT = 1
RETRY_COUNT = 3

MAX_PATH_LENGTH = 250

VERSION = "0.10.3"

TIMESTAMP_FILE = "timestamp.txt"

SESSION = requests.Session()


@dataclass
class ThingLink:
    thing_id: str
    name: str
    api_link: str


@dataclass
class FileLink:
    name: str
    last_update: datetime.datetime
    link: str


@dataclass
class ImageLink:
    name: str
    link: str


class FileLinks:
    def __init__(self, initial_links=None):
        if initial_links is None:
            initial_links = []
        self.links = []
        self.last_update = None
        for link in initial_links:
            self.append(link)

    def __iter__(self):
        return iter(self.links)

    def __getitem__(self, item):
        return self.links[item]

    def __len__(self):
        return len(self.links)

    def append(self, link):
        try:
            self.last_update = max(self.last_update, link.last_update)
        except TypeError:
            # last_update is still None, so this link's timestamp wins.
            self.last_update = link.last_update
        self.links.append(link)


class State(enum.Enum):
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()


def sanitise_url(url):
    """ Remove API keys from a URL.
    """
    return re.sub(r'access_token=\w*',
                  'access_token=***',
                  url)


def strip_time(date_obj):
    """ Takes a datetime object and returns another with the time set to 00:00.
    """
    return datetime.datetime.combine(date_obj.date(), datetime.time())


def rename_unique(dir_name, target_dir_name):
    """ Move a directory sideways to a new name, ensuring it is unique.
    """
    target_dir = target_dir_name
    inc = 0
    while os.path.exists(target_dir):
        target_dir = "{}_{}".format(target_dir_name, inc)
        inc += 1
    os.rename(dir_name, target_dir)
    return target_dir


def fail_dir(dir_name):
    """ When a download has failed, move it sideways.
    """
    return rename_unique(dir_name, "{}_failed".format(dir_name))


def truncate_name(file_name):
    """ Ensure the file name is not too long (mainly a problem on Windows).
    """
    path = os.path.abspath(file_name)
    if len(path) <= MAX_PATH_LENGTH:
        return path
    base, extension = os.path.splitext(path)
    inc = 0
    new_path = "{}_{}{}".format(base, inc, extension)
    while os.path.exists(new_path):
        inc += 1
        new_path = "{}_{}{}".format(base, inc, extension)
    return new_path


def slugify(value):
    """
    Normalise the string, remove characters that are invalid in filenames,
    and convert it to lowercase.
    """
    logging.debug("Slugifying {}".format(value))
    value = unicodedata.normalize('NFKC', value).lower().strip()
    value = re.sub(r'[\\/<>:?*|"]', '', value)
    value = re.sub(r'\.*$', '', value)
    return value.strip()


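# Downloads are performed by worker processes that consume thing IDs (or
# ThingLink objects) from a shared JoinableQueue; a None on the queue tells a
# worker to stop. main() starts DOWNLOADER_COUNT of these workers.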
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """

    def __init__(self, thing_queue, download_directory, compress, api_key):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory
        self.compress = compress
        self.api_key = api_key

    def run(self):
        """ actual download loop.
        """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                break
            thing = None
            if isinstance(thing_id, str):
                thing = Thing.from_thing_id(thing_id)
            if isinstance(thing_id, ThingLink):
                thing = Thing(thing_id)
            if not thing:
                logging.error("Don't know how to handle thing_id {}".format(thing_id))
            else:
                logging.info("Handling id {}".format(thing_id))
                thing.download(self.download_directory, self.compress, self.api_key)
            self.thing_queue.task_done()
        return


class Grouping:
    """ Holds details of a group of things for download
    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.
    """

    def __init__(self, quick, compress, api_key):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        self.compress = compress
        self.api_key = api_key
        # These should be set by child classes.
        self.url = None
        self.download_dir = None

    @property
    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(sanitise_url(self.url)))

        # self.url should already have been formatted as we don't need pagination
        logging.info("requesting: {}".format(sanitise_url(self.url)))
        current_req = SESSION.get(self.url)
        if current_req.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url),
                                                                current_req.text))
        else:
            current_json = current_req.json()
            for thing in current_json:
                logging.info(thing)
                self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
        logging.info("Found {} things.".format(len(self.things)))
        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            # 'get' is a property with side effects: it fetches and caches the thing list.
            self.get

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(len(self.things)))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            return_code = Thing(thing).download(self.download_dir, self.compress, self.api_key)
            if self.quick and return_code == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return


3522a3bf
OM
278class Collection(Grouping):
279 """ Holds details of a collection. """
dbdb1782 280
714415bd
OM
281 def __init__(self, user, name, directory, quick, compress, api_key):
282 Grouping.__init__(self, quick, compress, api_key)
3522a3bf
OM
283 self.user = user
284 self.name = name
e45ba963
OM
285 self.paginated = False
286 # need to figure out the the ID for the collection
714415bd 287 collection_url = API_USER_COLLECTIONS.format(user, api_key)
e45ba963
OM
288 try:
289 current_req = SESSION.get(collection_url)
290 except requests.exceptions.ConnectionError as error:
73695baf
OM
291 logging.error("Unable to connect for collections for user {}: {}".format(
292 self.user, error))
e45ba963
OM
293 return
294 if current_req.status_code != 200:
73695baf
OM
295 logging.error(
296 "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(collection_url),
297 current_req.text))
e45ba963
OM
298 return
299 collection_list = current_req.json()
300 try:
301 # case insensitive to retain parity with previous behaviour
302 collection = [x for x in collection_list if x['name'].casefold() == name.casefold()][0]
303 except IndexError:
304 logging.error("Unable to find collection {} for user {}".format(name, user))
305 return
306 self.collection_id = collection['id']
714415bd 307 self.url = API_COLLECTION_THINGS.format(self.collection_id, api_key)
e45ba963 308
d66f1f78 309 self.download_dir = os.path.join(directory,
3c82f75b 310 "{}-{}".format(slugify(self.user), slugify(self.name)))
3522a3bf 311
dbdb1782 312
class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory, quick, compress, api_key):
        Grouping.__init__(self, quick, compress, api_key)
        self.user = user
        self.url = API_USER_DESIGNS.format(user, api_key)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))


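# Each thing is downloaded into "<thing_id> - <slugified name>/" beneath the
# target directory, holding the model files plus filelist.txt, an images/
# subdirectory, license.txt, readme.txt and timestamp.txt. With compression
# enabled the directory is packed into a datestamped .7z archive and removed.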
class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_link):
        self.thing_id = thing_link.thing_id
        self.name = thing_link.name
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.download_dir = None
        self.time_stamp = None
        self._file_links = FileLinks()
        self._image_links = []

    @classmethod
    def from_thing_id(cls, thing_id):
        """
        Factory method that looks up a thing by ID and creates a Thing object for it
        :param thing_id: to look up
        :return: Thing or None
        """
        return cls(ThingLink(thing_id, "", ""))

    def _parse(self, base_dir, api_key):
        """ Work out what, if anything needs to be done. """
        if self._parsed:
            return

        # First get the broad details
        url = API_THING_DETAILS.format(self.thing_id, api_key)
        logging.debug("Requesting thing details: {}".format(sanitise_url(url)))
        try:
            current_req = SESSION.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        # Check for DMCA
        if current_req.status_code == 403:
            logging.error("Access to thing {} is forbidden".format(self.thing_id))
            return
        if current_req.status_code != 200:
            logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(url),
                                                                          current_req.text))
            return

        thing_json = current_req.json()
        try:
            self._license = thing_json['license']
        except KeyError:
            logging.warning("No license found for thing {}?".format(self.thing_id))

        # TODO: Get non-html version of this?
        try:
            self._details = thing_json['details']
        except KeyError:
            logging.warning("No description found for thing {}?".format(self.thing_id))

        if not self.name:
            # Probably generated with factory method.
            try:
                self.name = thing_json['name']
            except KeyError:
                logging.warning("No name found for thing {}?".format(self.thing_id))
                self.name = self.thing_id

        # Now get the file details
        file_url = API_THING_FILES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(file_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error("Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(file_url),
                                                                        current_req.text))
            return

        link_list = current_req.json()

        if not link_list:
            logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(
                self.thing_id))

        for link in link_list:
            logging.debug("Parsing link: {}".format(sanitise_url(link['url'])))
            try:
                datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
                self._file_links.append(
                    FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(api_key)))
            except ValueError:
                logging.error("Unparseable date {} for {}".format(link['date'], link['name']))

        # Finally get the image links
        image_url = API_THING_IMAGES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(image_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error(
                "Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(image_url),
                                                              current_req.text))
            return

        image_list = current_req.json()

        if not image_list:
            logging.warning(
                "No images found for thing {} - probably thingiverse being iffy as this seems unlikely".format(
                    self.thing_id))

        for image in image_list:
            logging.debug("parsing image: {}".format(image))
            name = None
            try:
                name = slugify(image['name'])
                # TODO: fallback to other types
                url = [x for x in image['sizes'] if x['type'] == 'display' and x['size'] == 'large'][0]['url']
            except (KeyError, IndexError):
                logging.warning("Missing image for {}".format(name))
                continue
            self._image_links.append(ImageLink(name, url))

        self.slug = "{} - {}".format(self.thing_id, slugify(self.name))
        self.download_dir = os.path.join(base_dir, self.slug)

        self._handle_old_directory(base_dir)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.name))
        latest, self.last_time = self._find_last_download(base_dir)

        if not latest:
            # Not yet downloaded
            self._parsed = True
            return

        logging.info("last downloaded version: {}".format(self.last_time))

        # OK, so we have a timestamp, lets see if there is anything new to get
        # First off, are we comparing an old download that threw away the timestamp?
        ignore_time = self.last_time == strip_time(self.last_time)
        try:
            # TODO: Allow for comparison at the exact time
            files_last_update = self._file_links.last_update
            if ignore_time:
                logging.info("Dropping time from comparison stamp as old-style download dir")
                files_last_update = strip_time(files_last_update)

            if files_last_update > self.last_time:
                logging.info(
                    "Found new/updated files {}".format(self._file_links.last_update))
                self._needs_download = True
                self._parsed = True
                return
        except TypeError:
            logging.warning("No files found for {}.".format(self.thing_id))

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def _handle_old_directory(self, base_dir):
        """ Deal with any old directories from previous versions of the code.
        """
        old_dir = os.path.join(base_dir, slugify(self.name))
        if os.path.exists(old_dir):
            logging.warning("Found old style download_dir. Moving.")
            rename_unique(old_dir, self.download_dir)

    def _handle_outdated_directory(self):
        """ Move the current download directory sideways if the thing has changed.
        """
        if not os.path.exists(self.download_dir):
            # No old directory to move.
            return None
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
        if not os.path.exists(timestamp_file):
            # Old form of download directory
            target_dir_name = "{} - old".format(self.download_dir)
        else:
            target_dir_name = "{} - {}".format(self.download_dir, self.last_time.strftime(SAFE_DATETIME_FORMAT))
        return rename_unique(self.download_dir, target_dir_name)

    def _find_last_download(self, base_dir):
        """ Look for the most recent previous download (if any) of the thing.
        """
        logging.info("Looking for old things")

        # First the DL directory itself.
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)

        latest = None
        latest_time = None

        try:
            logging.debug("Checking for existing download in normal place.")
            with open(timestamp_file) as ts_fh:
                timestamp_text = ts_fh.read().strip()
            latest_time = datetime.datetime.strptime(timestamp_text, DEFAULT_DATETIME_FORMAT)
            latest = self.download_dir
        except FileNotFoundError:
            # No existing download directory. huh.
            pass
        except TypeError:
            logging.warning("Invalid timestamp file found in {}".format(self.download_dir))

        # TODO: Maybe look for old download directories.

        # Now look for 7z files
        candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
        # +3 to allow for ' - '
        leading_length = len(self.slug) + 3
        for path in candidates:
            candidate = os.path.basename(path)
            try:
                logging.debug("Examining '{}' - '{}'".format(candidate, candidate[leading_length:-3]))
                candidate_time = datetime.datetime.strptime(candidate[leading_length:-3], SAFE_DATETIME_FORMAT)
            except ValueError:
                logging.warning("There was an error finding the date in {}. Ignoring.".format(candidate))
                continue
            try:
                if candidate_time > latest_time:
                    latest_time = candidate_time
                    latest = candidate
            except TypeError:
                latest_time = candidate_time
                latest = candidate
        logging.info("Found last old thing: {} / {}".format(latest, latest_time))
        return latest, latest_time

    def download(self, base_dir, compress, api_key):
        """ Download all files for a given thing.
            Returns a State indicating whether the thing is now downloaded
            (not whether this call actually downloaded anything).
        """
        if not self._parsed:
            self._parse(base_dir, api_key)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            logging.info("{} - {} already downloaded - skipping.".format(self.thing_id, self.name))
            return State.ALREADY_DOWNLOADED

        if not self._file_links:
            logging.error(
                "{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.name))
            return State.FAILED

        # Have we already downloaded some things?
        renamed_dir = self._handle_outdated_directory()

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        self.time_stamp = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            self.time_stamp = new_file_links[0].last_update

            for file_link in new_file_links:
                self.time_stamp = max(self.time_stamp, file_link.last_update)
            logging.debug("New timestamp will be {}".format(self.time_stamp))
        else:
            self.time_stamp = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    self.time_stamp = max(self.time_stamp, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(self.time_stamp))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        logging.debug("Writing file list to {}".format(filelist_file))
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                fl_handle.write("{},{},{}\n".format(fl.link, fl.name, fl.last_update))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        if renamed_dir:
            for file_link in old_file_links:
                try:
                    old_file = os.path.join(renamed_dir, file_link.name)
                    new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
                    logging.debug("Copying {} to {}".format(old_file, new_file))
                    copyfile(old_file, new_file)
                except FileNotFoundError:
                    logging.warning(
                        "Unable to find {} in old archive, redownloading".format(file_link.name))
                    new_file_links.append(file_link)
                except TypeError:
                    # Not altogether sure how this could occur, possibly with some combination of the old file types
                    logging.warning(
                        "TypeError looking for {} in {}".format(file_link.name, renamed_dir))
                    new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = SESSION.get(file_link.link)
                if data_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(data_req.status_code,
                                                                                sanitise_url(file_link.link),
                                                                                data_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED

                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # People like images.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.join(image_dir, imagelink.name)
                image_req = SESSION.get(imagelink.link)
                if image_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(image_req.status_code,
                                                                                sanitise_url(imagelink.link),
                                                                                image_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED
                with open(truncate_name(filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(imagelink.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # Best get some licenses
        logging.info("writing license file")
        try:
            if self._license:
                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w',
                          encoding="utf-8") as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        logging.info("writing readme")
        try:
            if self._details:
                with open(truncate_name(os.path.join(self.download_dir, 'readme.txt')), 'w',
                          encoding="utf-8") as readme_handle:
                    readme_handle.write("{}\n".format(self._details))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        try:
            # Now write the timestamp
            with open(os.path.join(self.download_dir, TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(str(self.time_stamp))
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.name))
        if not compress:
            return State.OK

        thing_dir = "{} - {} - {}".format(self.thing_id,
                                          slugify(self.name),
                                          self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
        file_name = os.path.join(base_dir,
                                 "{}.7z".format(thing_dir))
        logging.debug("Compressing {} to {}".format(
            self.name,
            file_name))
        with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
            archive.writeall(self.download_dir, thing_dir)
        logging.debug("Compression of {} finished.".format(self.name))
        shutil.rmtree(self.download_dir)
        logging.debug("Removed temporary download dir of {}.".format(self.name))
        return State.OK


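# A batch file holds one whitespace-separated instruction per line, mirroring
# the command-line subcommands, for example:
#   thing 12345
#   collection some_user some_collection
#   user some_designer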
def do_batch(batch_file, download_dir, quick, compress, api_key):
    """ Read a file in line by line, parsing each as a set of calls to this script."""
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing.from_thing_id(command_arr[1]).download(download_dir, compress, api_key)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick, compress, api_key).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick, compress, api_key).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")


def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")
    parser.add_argument("-c", "--compress", action="store_true",
                        help="Compress files")
    parser.add_argument("-a", "--api-key",
                        help="API key for thingiverse")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space separated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    if args.api_key:
        api_key = args.api_key
    else:
        try:
            with open("api.key") as fh:
                api_key = fh.read().strip()
        except Exception as e:
            logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
            logging.error("Exception: {}".format(e))
            return

    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory, args.compress, api_key) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick, args.compress, api_key).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick, args.compress, api_key).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick, args.compress, api_key)

    # Stop the downloader processes
    for _ in downloaders:
        thing_queue.put(None)


if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()