fix single thing downloads
[clinton/thingy_grabber.git] / thingy_grabber.py
CommitLineData
975060c9
OM
1#!/usr/bin/env python3
2"""
3Thingiverse bulk downloader
4"""
5
6import re
4a98996b 7import sys
975060c9
OM
8import os
9import argparse
10import unicodedata
11import requests
fa2f3251 12import logging
6a777954 13import multiprocessing
7b84ba6d 14import enum
fb28c59b 15import datetime
3c82f75b 16from shutil import copyfile
b497d705 17from dataclasses import dataclass
9828dabe 18import py7zr
8ed15058
OM
19import glob
20import shutil
975060c9 21
ae598d73
OM
# Compression filters handed to py7zr.SevenZipFile when a finished download
# is archived.
SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]

# I don't think this is exported by datetime
# Format of the dates in the API file listing and in the timestamp file.
DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# Windows cannot handle : in filenames
# Same information as above, used for directory and archive names.
SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'

# URL templates. The "{}" placeholders are filled with str.format(); where
# ACCESS_QP is appended the last placeholder is the API key.
API_BASE = "https://api.thingiverse.com"
ACCESS_QP = "access_token={}"
PAGE_QP = "page={}"
# NOTE: unlike the other endpoints this template has no access_token placeholder.
API_USER_DESIGNS = API_BASE + "/users/{}/things/"
API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP

# Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS
API_COLLECTION = API_BASE + "/collections/{}/?" + ACCESS_QP
API_COLLECTION_THINGS = API_BASE + "/collections/{}/things/?" + ACCESS_QP

API_THING_DETAILS = API_BASE + "/things/{}/?" + ACCESS_QP
API_THING_FILES = API_BASE + "/things/{}/files/?" + ACCESS_QP
API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP
# Suffix appended to a file's own URL (from the file listing) to download it.
API_THING_DOWNLOAD = "/download/?" + ACCESS_QP

# Set by main() from --api-key or the "api.key" file.
API_KEY = None

# Number of Downloader processes main() starts.
DOWNLOADER_COUNT = 1
# NOTE(review): not referenced anywhere in the visible code - confirm whether still needed.
RETRY_COUNT = 3

# Longest absolute path truncate_name() accepts without modification.
MAX_PATH_LENGTH = 250

VERSION = "0.10.2"

# Name of the file inside a download directory recording the newest file date.
TIMESTAMP_FILE = "timestamp.txt"

# One HTTP session shared by every request in the module.
SESSION = requests.Session()
b497d705 56
73695baf 57
e45ba963
OM
@dataclass
class ThingLink:
    """ Reference to a thing: its ID, display name and API URL. """
    thing_id: str
    name: str
    api_link: str
b497d705 63
73695baf 64
b497d705
OM
@dataclass
class FileLink:
    """ One downloadable file of a thing: name, last update time and URL. """
    name: str
    last_update: datetime.datetime
    link: str
70
73695baf 71
e45ba963
OM
@dataclass
class ImageLink:
    """ One image of a thing: the file name to save it under and its URL. """
    name: str
    link: str
76
73695baf 77
class FileLinks:
    """ List-like container of file links that tracks the newest update time.

    ``last_update`` holds the largest ``last_update`` of the links appended
    so far, or None while the container is empty.
    """

    def __init__(self, initial_links=None):
        self.links = []
        self.last_update = None
        # Route every initial link through append() so last_update is tracked.
        for entry in (initial_links if initial_links is not None else []):
            self.append(entry)

    def __iter__(self):
        return iter(self.links)

    def __getitem__(self, item):
        return self.links[item]

    def __len__(self):
        return len(self.links)

    def append(self, link):
        """ Add a link and fold its last_update into the running maximum. """
        try:
            newest = max(self.last_update, link.last_update)
        except TypeError:
            # Values could not be compared (e.g. nothing stored yet):
            # take the new link's stamp as it is.
            newest = link.last_update
        self.last_update = newest
        self.links.append(link)
8ed15058 102
b497d705 103
7b84ba6d
OM
class State(enum.Enum):
    """ Outcome of a download attempt for a single thing. """
    # Files were fetched (and archived when compression was requested).
    OK = enum.auto()
    # Parsing or downloading went wrong.
    FAILED = enum.auto()
    # Nothing newer than the existing download was found.
    ALREADY_DOWNLOADED = enum.auto()
108
73695baf 109
e45ba963
OM
def sanitise_url(url):
    """ Return the url with any access_token value replaced by '***',
        so that it can be logged without exposing the API key.
    """
    token_pattern = re.compile(r'access_token=\w*')
    return token_pattern.sub('access_token=***', url)
116
73695baf 117
e45ba963
OM
def strip_time(date_obj):
    """ Return a new datetime for midnight (00:00) on the day of date_obj.
    """
    day = date_obj.date()
    midnight = datetime.time()
    return datetime.datetime.combine(day, midnight)
122
73695baf 123
8ed15058
OM
def rename_unique(dir_name, target_dir_name):
    """ Rename dir_name to target_dir_name, or - when that already exists -
        to the first free name of the form "<target_dir_name>_<n>" (n = 0, 1, ...).
        Returns the name that was actually used.
    """

    def _candidates():
        # The plain target first, then numbered alternatives.
        yield target_dir_name
        counter = 0
        while True:
            yield "{}_{}".format(target_dir_name, counter)
            counter += 1

    for destination in _candidates():
        if not os.path.exists(destination):
            break
    os.rename(dir_name, destination)
    return destination
134
135
def fail_dir(dir_name):
    """ Mark a failed download by renaming its directory to "<dir_name>_failed"
        (made unique by rename_unique). Returns the new name.
    """
    failed_name = "{}_failed".format(dir_name)
    return rename_unique(dir_name, failed_name)
65bd8b43
OM
140
141
def truncate_name(file_name, max_length=None):
    """ Ensure the filename is not too long for, well windows basically.

    :param file_name: path (relative or absolute) of the file to be written.
    :param max_length: longest absolute path allowed; defaults to
        MAX_PATH_LENGTH.
    :return: the absolute path when it already fits. Otherwise an absolute
        path in the same directory and with the same extension, whose stem is
        cut down and given a "_<n>" suffix so that it fits within max_length
        and does not name an existing file.
    """
    if max_length is None:
        max_length = MAX_PATH_LENGTH
    path = os.path.abspath(file_name)
    if len(path) <= max_length:
        return path
    base, extension = os.path.splitext(path)
    # Never cut into the directory part: keep the directory, the separator
    # and at least one character of the file name, even if that means the
    # result is still over the limit.
    min_keep = len(os.path.dirname(path)) + 2
    inc = 0
    while True:
        suffix = "_{}".format(inc)
        keep = max(max_length - len(suffix) - len(extension), min_keep)
        new_path = "{}{}{}".format(base[:keep], suffix, extension)
        if not os.path.exists(new_path):
            return new_path
        inc += 1
155
156
975060c9
OM
def slugify(value):
    """
    Turn a string into something usable as a file name: NFKC-normalise it,
    lowercase it, trim surrounding whitespace, drop the characters
    \\ / < > : ? * | " and remove any trailing dots.
    """
    logging.debug("Sluggyfying {}".format(value))
    cleaned = unicodedata.normalize('NFKC', value)
    cleaned = cleaned.lower().strip()
    # First pattern: characters not allowed in file names; second: trailing dots.
    for pattern in (r'[\\/<>:?*|"]', r'\.*$'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned
975060c9 167
b497d705 168
6a777954
OM
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.

    Runs as a separate process that takes work items off a queue until it
    receives None. A work item is either a thing ID (str) or a ThingLink.
    """

    def __init__(self, thing_queue, download_directory, compress, api_key):
        """
        :param thing_queue: queue of work items; must offer get() and task_done()
        :param download_directory: directory the things are downloaded into
        :param compress: passed through to Thing.download()
        :param api_key: thingiverse API key, passed through to Thing.download()
        """
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory
        self.compress = compress
        self.api_key = api_key

    def run(self):
        """ actual download loop.
        """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                break
            try:
                self._handle(thing_id)
            except Exception:
                # Worker boundary: one broken thing must not take the whole
                # worker (and with it the rest of the queue) down.
                logging.exception("Unexpected error while handling {}".format(thing_id))
            finally:
                # Always acknowledge the item, otherwise a join() on the
                # queue would never return.
                self.thing_queue.task_done()
        return

    def _handle(self, thing_id):
        """ Build a Thing from one queue item and download it. """
        thing = None
        if isinstance(thing_id, str):
            thing = Thing.from_thing_id(thing_id)
        elif isinstance(thing_id, ThingLink):
            thing = Thing(thing_id)
        if not thing:
            logging.error("Don't know how to handle thing_id {}".format(thing_id))
            return
        logging.info("Handling id {}".format(thing_id))
        thing.download(self.download_directory, self.compress, self.api_key)
203
7b84ba6d 204
class Grouping:
    """ Holds details of a group of things for download
    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.
    """

    def __init__(self, quick, compress):
        """
        :param quick: stop downloading at the first thing that is already
            downloaded
        :param compress: passed through to Thing.download()
        """
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        self.compress = compress
        # These should be set by child classes.
        self.url = None
        self.download_dir = None

    # NOTE: this has to be a plain method - download() calls it as self.get().
    def get(self):
        """ retrieve the things of the grouping.

        :return: list of ThingLink; empty when the request failed.
        :raises ValueError: when no URL has been set by the child class.
        """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(sanitise_url(self.url)))

        # self.url should already have been formatted as we don't need pagination
        logging.info("requesting:{}".format(sanitise_url(self.url)))
        try:
            current_req = SESSION.get(self.url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect to {}: {}".format(sanitise_url(self.url), error))
            return self.things
        if current_req.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url),
                                                                current_req.text))
        else:
            current_json = current_req.json()
            for thing in current_json:
                logging.info(thing)
                self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
            logging.info("Found {} things.".format(len(self.things)))
        self.total = len(self.things)
        return self.things

    def download(self):
        """ Downloads all the files in a collection

        :raises ValueError: when no download directory (or URL) has been set.
        """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(len(self.things)))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            # Thing.download() needs the API key as its third argument.
            return_code = Thing(thing).download(self.download_dir, self.compress, API_KEY)
            if self.quick and return_code == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return
975060c9 276
73695baf 277
3522a3bf
OM
class Collection(Grouping):
    """ The things in one named collection of a user.

    The constructor asks the API for the user's collections to find the ID of
    the named one. When that fails an error is logged and the object is left
    without url and download_dir.
    """

    def __init__(self, user, name, directory, quick, compress):
        super().__init__(quick, compress)
        self.user, self.name = user, name
        self.paginated = False
        # The things of a collection are addressed by collection ID, so list
        # the user's collections and pick ours out by name.
        lookup_url = API_USER_COLLECTIONS.format(user, API_KEY)
        try:
            response = SESSION.get(lookup_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for collections for user {}: {}".format(
                self.user, error))
            return
        if response.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(response.status_code, sanitise_url(lookup_url),
                                                                response.text))
            return
        # case insensitive to retain parity with previous behaviour
        matches = [entry for entry in response.json() if entry['name'].casefold() == name.casefold()]
        if not matches:
            logging.error("Unable to find collection {} for user {}".format(name, user))
            return
        self.collection_id = matches[0]['id']
        self.url = API_COLLECTION_THINGS.format(self.collection_id, API_KEY)

        folder = "{}-{}".format(slugify(self.user), slugify(self.name))
        self.download_dir = os.path.join(directory, folder)
3522a3bf 311
dbdb1782 312
3522a3bf
OM
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory, quick, compress):
        """
        :param user: name of the user whose designs are wanted
        :param directory: directory in which "<user> designs" is created
        :param quick: see Grouping
        :param compress: see Grouping
        """
        Grouping.__init__(self, quick, compress)
        self.user = user
        # API_USER_DESIGNS has no access_token placeholder, so the token is
        # appended here - every other endpoint is called with one.
        self.url = API_USER_DESIGNS.format(user) + "?" + ACCESS_QP.format(API_KEY)
        self.paginated = True
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
975060c9 323
dbdb1782 324
3c82f75b
OM
class Thing:
    """ An individual design on thingiverse.

    Created from a ThingLink (or from a bare ID via from_thing_id). Nothing is
    requested from the API until download() is called.
    """

    def __init__(self, thing_link):
        self.thing_id = thing_link.thing_id
        self.name = thing_link.name
        # Timestamp of the previous download, if one is found while parsing.
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.download_dir = None
        self.slug = None
        # Timestamp that will be recorded for the download in progress.
        self.time_stamp = None
        # Filled in by _parse(); stay None when the API does not supply them.
        self._license = None
        self._details = None
        self._file_links = FileLinks()
        self._image_links = []

    @classmethod
    def from_thing_id(cls, thing_id):
        """
        Factory method that creates a Thing object from just an ID. The name
        is left empty and filled in from the API when the thing is parsed.
        :param thing_id: to look up
        :return: Thing
        """
        return cls(ThingLink(thing_id, "", ""))

    def _get_json(self, url):
        """ GET url and return the decoded JSON body.
        Returns None (after logging the reason) when the request fails.
        """
        try:
            current_req = SESSION.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return None
        # Check for DMCA
        if current_req.status_code == 403:
            logging.error("Access to thing {} is forbidden".format(self.thing_id))
            return None
        if current_req.status_code != 200:
            logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(url),
                                                                          current_req.text))
            return None
        try:
            return current_req.json()
        except ValueError as error:
            logging.error("Invalid JSON from url {}: {}".format(sanitise_url(url), error))
            return None

    def _parse(self, base_dir, api_key):
        """ Work out what, if anything needs to be done.

        Fetches details, file list and image list of the thing, looks for a
        previous download below base_dir and sets _needs_download. _parsed is
        only set when all of this succeeded.
        """
        if self._parsed:
            return

        # First get the broad details
        url = API_THING_DETAILS.format(self.thing_id, api_key)
        # The url carries the API key, so only ever log the sanitised form.
        logging.debug("Querying {}".format(sanitise_url(url)))
        thing_json = self._get_json(url)
        if thing_json is None:
            return

        try:
            self._license = thing_json['license']
        except KeyError:
            logging.warning("No license found for thing {}?".format(self.thing_id))

        # TODO: Get non-html version of this?
        try:
            self._details = thing_json['details']
        except KeyError:
            logging.warning("No description found for thing {}?".format(self.thing_id))

        if not self.name:
            # Probably generated with factory method.
            try:
                self.name = thing_json['name']
            except KeyError:
                logging.warning("No name found for thing {}?".format(self.thing_id))
                self.name = str(self.thing_id)

        # Now get the file details
        link_list = self._get_json(API_THING_FILES.format(self.thing_id, api_key))
        if link_list is None:
            return

        if not link_list:
            logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(
                self.thing_id))

        for link in link_list:
            logging.debug("Parsing link: {}".format(sanitise_url(link['url'])))
            try:
                datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
            except ValueError:
                logging.error("Unable to parse date {} of file {}".format(link['date'], link['name']))
                continue
            self._file_links.append(
                FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(api_key)))

        # Finally get the image links
        image_list = self._get_json(API_THING_IMAGES.format(self.thing_id, api_key))
        if image_list is None:
            return

        if not image_list:
            logging.warning(
                "No images found for thing {} - probably thingiverse being iffy as this seems unlikely".format(
                    self.thing_id))

        for image in image_list:
            logging.debug("parsing image: {}".format(image))
            name = None
            try:
                name = slugify(image['name'])
                # TODO: fallback to other types
                image_link = [x for x in image['sizes'] if x['type'] == 'display' and x['size'] == 'large'][0]['url']
            except (KeyError, IndexError):
                # No usable entry for this image: skip it rather than record
                # a link that does not belong to it.
                logging.warning("Missing image for {}".format(name))
                continue
            self._image_links.append(ImageLink(name, image_link))

        self.slug = "{} - {}".format(self.thing_id, slugify(self.name))
        self.download_dir = os.path.join(base_dir, self.slug)

        self._handle_old_directory(base_dir)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.name))
        latest, self.last_time = self._find_last_download(base_dir)

        if not latest:
            # Not yet downloaded
            self._parsed = True
            return

        logging.info("last downloaded version: {}".format(self.last_time))

        # OK, so we have a timestamp, lets see if there is anything new to get
        files_last_update = self._file_links.last_update
        if files_last_update is None:
            logging.warning("No files found for {}.".format(self.thing_id))
            self._needs_download = False
            self._parsed = True
            return

        # First off, are we comparing an old download that threw away the timestamp?
        # TODO: Allow for comparison at the exact time
        if self.last_time == strip_time(self.last_time):
            logging.info("Dropping time from comparison stamp as old-style download dir")
            files_last_update = strip_time(files_last_update)

        self._needs_download = files_last_update > self.last_time
        if self._needs_download:
            logging.info(
                "Found new/updated files {}".format(self._file_links.last_update))
        self._parsed = True

    def _handle_old_directory(self, base_dir):
        """ Deal with any old directories from previous versions of the code.
        """
        old_slug = slugify(self.name)
        if not old_slug:
            # An empty slug would make old_dir equal to base_dir itself.
            return
        old_dir = os.path.join(base_dir, old_slug)
        if os.path.exists(old_dir):
            logging.warning("Found old style download_dir. Moving.")
            rename_unique(old_dir, self.download_dir)

    def _handle_outdated_directory(self):
        """ Move the current download directory sideways if the thing has changed.
        Returns the new name of the directory, or None if there was none.
        """
        if not os.path.exists(self.download_dir):
            # No old directory to move.
            return None
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
        if not os.path.exists(timestamp_file) or self.last_time is None:
            # Old form of download directory (or an unreadable timestamp)
            target_dir_name = "{} - old".format(self.download_dir)
        else:
            target_dir_name = "{} - {}".format(self.download_dir, self.last_time.strftime(SAFE_DATETIME_FORMAT))
        return rename_unique(self.download_dir, target_dir_name)

    def _find_last_download(self, base_dir):
        """ Look for the most recent previous download (if any) of the thing.
        Returns (location, timestamp) of it, or (None, None).
        """
        logging.info("Looking for old things")

        # First the DL directory itself.
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)

        latest = None
        latest_time = None

        try:
            logging.debug("Checking for existing download in normal place.")
            with open(timestamp_file) as ts_fh:
                timestamp_text = ts_fh.read().strip()
            latest_time = datetime.datetime.strptime(timestamp_text, DEFAULT_DATETIME_FORMAT)
            latest = self.download_dir
        except FileNotFoundError:
            # No existing download directory. huh.
            pass
        except (TypeError, ValueError):
            # strptime raises ValueError for text that does not match the format.
            logging.warning("Invalid timestamp file found in {}".format(self.download_dir))

        # TODO:  Maybe look for old download directories.

        # Now look for 7z files
        candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
        # +3 to allow for ' - '
        leading_length = len(self.slug) + 3
        for path in candidates:
            candidate = os.path.basename(path)
            # Strip "<slug> - " from the front and ".7z" from the end.
            stamp_text = candidate[leading_length:-3]
            try:
                logging.debug("Examining '{}' - '{}'".format(candidate, stamp_text))
                candidate_time = datetime.datetime.strptime(stamp_text, SAFE_DATETIME_FORMAT)
            except ValueError:
                logging.warning("There was an error finding the date in {}. Ignoring.".format(candidate))
                continue
            if latest_time is None or candidate_time > latest_time:
                latest_time = candidate_time
                latest = candidate
        logging.info("Found last old thing: {} / {}".format(latest, latest_time))
        return latest, latest_time

    def _split_file_links(self):
        """ Split the file links into (new, old) relative to the previous
        download and set self.time_stamp to the newest date involved.
        """
        new_file_links = []
        old_file_links = []
        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = list(self._file_links)
            self.time_stamp = max(file_link.last_update for file_link in new_file_links)
            logging.debug("New timestamp will be {}".format(self.time_stamp))
        else:
            self.time_stamp = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    self.time_stamp = max(self.time_stamp, file_link.last_update)
                else:
                    old_file_links.append(file_link)
        logging.debug("new timestamp {}".format(self.time_stamp))
        return new_file_links, old_file_links

    def _copy_unchanged_files(self, renamed_dir, old_file_links):
        """ Copy unchanged files over from the previous download directory.
        Returns the links that could not be copied and must be downloaded.
        """
        missing = []
        for file_link in old_file_links:
            try:
                old_file = os.path.join(renamed_dir, file_link.name)
                new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(file_link.name))
                missing.append(file_link)
            except TypeError:
                # Not altogether sure how this could occur, possibly with some combination of the old file types
                logging.warning(
                    "Typeerror looking for {} in {}".format(file_link.name, renamed_dir))
                missing.append(file_link)
        return missing

    def _fetch_to_file(self, link, file_name):
        """ Download link into file_name. Returns False (after logging) when
        the server does not answer with status 200.
        """
        data_req = SESSION.get(link)
        if data_req.status_code != 200:
            logging.error("Unexpected status code {} for {}: {}".format(data_req.status_code,
                                                                        sanitise_url(link),
                                                                        data_req.text))
            return False
        with open(file_name, 'wb') as handle:
            handle.write(data_req.content)
        return True

    def _download_files(self, file_links):
        """ Download the given file links into download_dir. Returns success. """
        for file_link in file_links:
            try:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, sanitise_url(file_link.link), file_name))
                if not self._fetch_to_file(file_link.link, file_name):
                    return False
            except Exception as exception:
                # The message may quote the url, which carries the API key.
                logging.error("Failed to download {} - {}".format(file_link.name, sanitise_url(str(exception))))
                return False
        return True

    def _download_images(self):
        """ Download all images into download_dir/images. Returns success. """
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        # Name what is being worked on so the error message is always valid,
        # also when creating the directory itself fails.
        current = image_dir
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                current = imagelink.name
                filename = truncate_name(os.path.join(image_dir, imagelink.name))
                if not self._fetch_to_file(imagelink.link, filename):
                    return False
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(current, sanitise_url(str(exception))))
            return False
        return True

    def _write_text_file(self, file_name, content, description):
        """ Write content to file_name in download_dir; failure only warns. """
        if not content:
            return
        try:
            with open(truncate_name(os.path.join(self.download_dir, file_name)), 'w',
                      encoding="utf-8") as handle:
                handle.write("{}\n".format(content))
        except IOError as exception:
            logging.warning("Failed to write {}! {}".format(description, exception))

    def download(self, base_dir, compress, api_key):
        """ Download all files for a given thing.

        :param base_dir: directory in which the thing's directory (or archive)
            is created
        :param compress: archive the download as .7z and remove the directory
        :param api_key: thingiverse API key
        :return: State.OK when downloaded, State.ALREADY_DOWNLOADED when
            nothing new was found, State.FAILED otherwise.
        """
        if not self._parsed:
            self._parse(base_dir, api_key)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            logging.info("{} - {} already downloaded - skipping.".format(self.thing_id, self.name))
            return State.ALREADY_DOWNLOADED

        if not self._file_links:
            logging.error(
                "{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.name))
            return State.FAILED

        # Have we already downloaded some things?
        renamed_dir = self._handle_outdated_directory()

        # Get the list of files to download
        new_file_links, old_file_links = self._split_file_links()

        if old_file_links and not renamed_dir:
            # The previous download is not available as a directory (e.g. it
            # only exists as an archive), so there is nothing to copy from.
            logging.info("No previous directory to copy {} unchanged files from, downloading them again."
                         .format(len(old_file_links)))
            new_file_links.extend(old_file_links)
            old_file_links = []

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                # The links carry the API key - do not persist it on disk.
                fl_handle.write("{},{},{}\n".format(sanitise_url(fl.link), fl.name, fl.last_update))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        if renamed_dir:
            new_file_links.extend(self._copy_unchanged_files(renamed_dir, old_file_links))

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        if not self._download_files(new_file_links):
            fail_dir(self.download_dir)
            return State.FAILED

        # People like images.
        if not self._download_images():
            fail_dir(self.download_dir)
            return State.FAILED

        # Best get some licenses
        logging.info("writing license file")
        self._write_text_file('license.txt', self._license, "license")

        logging.info("writing readme")
        self._write_text_file('readme.txt', self._details, "readme")

        try:
            # Now write the timestamp, in the format _find_last_download reads.
            with open(os.path.join(self.download_dir, TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(self.time_stamp.strftime(DEFAULT_DATETIME_FORMAT))
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.name))
        if not compress:
            return State.OK

        thing_dir = "{} - {} - {}".format(self.thing_id,
                                          slugify(self.name),
                                          self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
        file_name = os.path.join(base_dir,
                                 "{}.7z".format(thing_dir))
        logging.debug("Compressing {} to {}".format(
            self.name,
            file_name))
        with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
            archive.writeall(self.download_dir, thing_dir)
        logging.debug("Compression of {} finished.".format(self.name))
        shutil.rmtree(self.download_dir)
        logging.debug("Removed temporary download dir of {}.".format(self.name))
        return State.OK
975060c9 731
dbdb1782 732
def do_batch(batch_file, download_dir, quick, compress):
    """ Read a file in line by line, parsing each as a set of calls to this script.

    Recognised instructions, one per line:
        thing <thing id>
        collection <user> <collection name>
        user <user>
    Empty lines are skipped. Anything else, including an instruction with
    too few arguments, is logged and skipped.
    """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            command, arguments = command_arr[0], command_arr[1:]
            if command == "thing" and len(arguments) >= 1:
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                # Thing.download() needs the API key as its third argument.
                Thing.from_thing_id(arguments[0]).download(download_dir, compress, API_KEY)
                continue
            if command == "collection" and len(arguments) >= 2:
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(arguments[0], arguments[1],
                           download_dir, quick, compress).download()
                continue
            if command == "user" and len(arguments) >= 1:
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(arguments[0], download_dir, quick, compress).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
760
dbdb1782 761
975060c9
OM
def main():
    """ Entry point for script being run as a command.

    Parses the command line, sets up logging and the API key, starts the
    downloader process(es) and runs the requested subcommand.
    """
    global API_KEY

    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")
    parser.add_argument("-c", "--compress", action="store_true",
                        help="Compress files")
    parser.add_argument("-a", "--api-key",
                        help="API key for thingiverse")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space seperated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space seperated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space seperated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if args.subcommand == "version":
        # Needs neither an API key nor the downloader processes.
        print("thingy_grabber.py version {}".format(VERSION))
        return
    if not args.directory:
        args.directory = os.getcwd()

    # Set logging up first so that everything below is reported through it.
    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())
    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    if args.api_key:
        API_KEY = args.api_key
    else:
        try:
            with open("api.key") as fh:
                API_KEY = fh.read().strip()
        except Exception as e:
            logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
            logging.error("Exception: {}".format(e))
            return

    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory, args.compress, API_KEY) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick, args.compress).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick, args.compress).download()
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick, args.compress)

    # Stop the downloader processes: one None per process ends its loop.
    for _ in downloaders:
        thing_queue.put(None)
    # Wait for the queued work to be finished before returning.
    for downloader in downloaders:
        downloader.join()
975060c9 856
d194b140 857
# Only run when executed as a script, not when imported.
if __name__ == "__main__":
    # Required for multiprocessing when the program has been frozen into a
    # Windows executable; see the multiprocessing documentation.
    multiprocessing.freeze_support()
    main()