fix individual thing grabs
[clinton/thingy_grabber.git] / thingy_grabber.py
CommitLineData
975060c9
OM
1#!/usr/bin/env python3
2"""
3Thingiverse bulk downloader
4"""
5
6import re
4a98996b 7import sys
975060c9
OM
8import os
9import argparse
10import unicodedata
11import requests
fa2f3251 12import logging
6a777954 13import multiprocessing
7b84ba6d 14import enum
fb28c59b 15import datetime
3c82f75b 16from shutil import copyfile
b497d705 17from dataclasses import dataclass
9828dabe 18import py7zr
8ed15058
OM
19import glob
20import shutil
975060c9 21
ae598d73
OM
# Filter chain handed to py7zr when a finished download is archived.
SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]

# I don't think this is exported by datetime
DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# Windows cannot handle : in filenames
SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'

# --- Thingiverse API endpoints -------------------------------------------
# Each template is completed with str.format(); ACCESS_QP carries the API key.
API_BASE = "https://api.thingiverse.com"
ACCESS_QP = "access_token={}"
PAGE_QP = "page={}"
API_USER_DESIGNS = API_BASE + "/users/{}/things/"
API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP

# Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS
API_COLLECTION = API_BASE + "/collections/{}/?" + ACCESS_QP
API_COLLECTION_THINGS = API_BASE + "/collections/{}/things/?" + ACCESS_QP

API_THING_DETAILS = API_BASE + "/things/{}/?" + ACCESS_QP
API_THING_FILES = API_BASE + "/things/{}/files/?" + ACCESS_QP
API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP
# Appended to a file's own URL (not to API_BASE) to obtain the download link.
API_THING_DOWNLOAD = "/download/?" + ACCESS_QP

# Filled in by main() from the command line or the 'api.key' file.
API_KEY = None

# --- Tunables ------------------------------------------------------------
DOWNLOADER_COUNT = 1
RETRY_COUNT = 3

# Longest absolute path truncate_name() will let through unchanged.
MAX_PATH_LENGTH = 250

VERSION = "0.10.3"

# Name of the marker file recording the newest file date of a download.
TIMESTAMP_FILE = "timestamp.txt"

# One shared HTTP session for every request made by this module.
SESSION = requests.Session()
b497d705 56
73695baf 57
e45ba963
OM
@dataclass
class ThingLink:
    """ Minimal reference to a thing: its id, display name and API URL. """
    # Identifier of the thing on thingiverse.
    thing_id: str
    # Human readable name; may be empty when only the id is known.
    name: str
    # URL of the thing as reported by the API; may be empty.
    api_link: str
b497d705 63
73695baf 64
b497d705
OM
@dataclass
class FileLink:
    """ One downloadable file belonging to a thing. """
    # File name as it will be written to disk.
    name: str
    # When the file was last changed upstream.
    last_update: datetime.datetime
    # URL the file content is fetched from.
    link: str
70
73695baf 71
e45ba963
OM
@dataclass
class ImageLink:
    """ One image belonging to a thing. """
    # File name the image will be stored under.
    name: str
    # URL the image is fetched from.
    link: str
76
73695baf 77
class FileLinks:
    """ Ordered collection of file links that tracks the newest update time.

    ``last_update`` is None until the first link has been appended.
    """

    def __init__(self, initial_links=None):
        self.links = []
        self.last_update = None
        # Route every initial link through append() so last_update is kept current.
        for entry in ([] if initial_links is None else initial_links):
            self.append(entry)

    def __iter__(self):
        return iter(self.links)

    def __getitem__(self, item):
        return self.links[item]

    def __len__(self):
        return len(self.links)

    def append(self, link):
        """ Add a link, advancing last_update when the link is newer. """
        try:
            if link.last_update > self.last_update:
                self.last_update = link.last_update
        except TypeError:
            # The two stamps are not comparable (e.g. nothing recorded yet):
            # simply adopt the new link's stamp.
            self.last_update = link.last_update
        self.links.append(link)
8ed15058 102
b497d705 103
7b84ba6d
OM
class State(enum.Enum):
    """ Outcome of an attempt to download a thing. """
    # The thing was downloaded.
    OK = enum.auto()
    # The download could not be completed.
    FAILED = enum.auto()
    # Nothing new to fetch; an up to date copy is already present.
    ALREADY_DOWNLOADED = enum.auto()
108
73695baf 109
e45ba963
OM
def sanitise_url(url):
    """ Return url with the value of any access_token parameter masked,
        so that it can be logged without exposing the API key.
    """
    token_pattern = re.compile(r'access_token=\w*')
    return token_pattern.sub('access_token=***', url)
116
73695baf 117
e45ba963
OM
def strip_time(date_obj):
    """ Return a new naive datetime for the same calendar day as date_obj,
        with the time of day reset to 00:00.
    """
    day = date_obj.date()
    return datetime.datetime(day.year, day.month, day.day)
122
73695baf 123
8ed15058
OM
def rename_unique(dir_name, target_dir_name):
    """ Rename dir_name to target_dir_name without clobbering anything.

    If the target already exists, "_0", "_1", ... is appended until a free
    name is found. Returns the name that was actually used.
    """
    destination = target_dir_name
    attempt = 0
    while True:
        if not os.path.exists(destination):
            break
        destination = f"{target_dir_name}_{attempt}"
        attempt += 1
    os.rename(dir_name, destination)
    return destination
134
135
def fail_dir(dir_name):
    """ Park the directory of a failed download under "<dir_name>_failed"
        (made unique by rename_unique) and return the new name.
    """
    failed_name = f"{dir_name}_failed"
    return rename_unique(dir_name, failed_name)
65bd8b43
OM
140
141
def truncate_name(file_name, max_length=None):
    """ Ensure the filename is not too long for, well windows basically.

    :param file_name: path (relative or absolute) of the file to be written.
    :param max_length: longest absolute path allowed; defaults to
        MAX_PATH_LENGTH when not given.
    :return: the absolute path unchanged when it already fits, otherwise an
        unused path in the same directory whose file name has been shortened
        and given a "_<n>" suffix in front of the extension.
    """
    if max_length is None:
        max_length = MAX_PATH_LENGTH
    path = os.path.abspath(file_name)
    if len(path) <= max_length:
        return path

    base, extension = os.path.splitext(path)
    directory, stem = os.path.split(base)
    # Characters taken up by the directory part including its separator.
    prefix_length = len(os.path.join(directory, ""))
    inc = 0
    while True:
        suffix = "_{}{}".format(inc, extension)
        # Only the file name is shortened, never the directory; always keep
        # at least one character of it even if the limit cannot be met.
        room = max(max_length - prefix_length - len(suffix), 1)
        new_path = os.path.join(directory, stem[:room] + suffix)
        if not os.path.exists(new_path):
            return new_path
        inc += 1
155
156
975060c9
OM
def slugify(value):
    """
    Normalise string, removes characters that are invalid in filenames
    and converts string to lowercase.

    The result never ends in a dot or whitespace, neither of which Windows
    accepts at the end of a file or directory name.
    """
    logging.debug("Sluggyfying {}".format(value))
    value = unicodedata.normalize('NFKC', value).lower().strip()
    value = re.sub(r'[\\/<>:?*|"]', '', value)
    # Removing characters above can leave dots and spaces interleaved at the
    # end (e.g. "name. ?" -> "name. "), so strip both in a single pass.
    value = re.sub(r'[\s.]+$', '', value)
    return value.strip()
975060c9 167
b497d705 168
6a777954
OM
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.

    Runs as a separate process that consumes items from a joinable queue
    until it receives None. Queue items may be thing ids (str) or ThingLink
    objects.
    """

    def __init__(self, thing_queue, download_directory, compress, api_key):
        """
        :param thing_queue: queue to read work items from; must offer get()
            and task_done().
        :param download_directory: directory things are downloaded into.
        :param compress: whether finished downloads should be archived.
        :param api_key: API key handed to each download.
        """
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory
        self.compress = compress
        self.api_key = api_key

    def run(self):
        """ actual download loop.
        """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                break
            try:
                self._handle(thing_id)
            except Exception:
                # Worker boundary: one broken thing must not take the whole
                # downloader (and everything still queued) down with it.
                logging.exception("Unhandled error while handling thing_id {}".format(thing_id))
            finally:
                self.thing_queue.task_done()

    def _handle(self, thing_id):
        """ Turn one queue item into a Thing and download it. """
        thing = None
        if isinstance(thing_id, str):
            thing = Thing.from_thing_id(thing_id)
        elif isinstance(thing_id, ThingLink):
            thing = Thing(thing_id)
        if thing is None:
            logging.error("Don't know how to handle thing_id {}".format(thing_id))
            return
        logging.info("Handling id {}".format(thing_id))
        thing.download(self.download_directory, self.compress, self.api_key)
203
7b84ba6d 204
class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self, quick, compress):
        """
        :param quick: stop as soon as an already downloaded thing is met.
        :param compress: passed on to every Thing.download() call.
        """
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        self.compress = compress
        # These should be set by child classes.
        self.url = None
        self.download_dir = None

    @property
    def get(self):
        """ retrieve the things of the grouping.

        NOTE: this is a property, so it is read as ``grouping.get`` (no
        call). The first read performs the HTTP request; later reads return
        the cached list.

        :return: list of ThingLink (empty when the request failed).
        :raises ValueError: if no URL has been set by the child class.
        """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(sanitise_url(self.url)))

        # self.url should already have been formatted as we don't need pagination
        logging.info("requesting:{}".format(sanitise_url(self.url)))
        try:
            current_req = SESSION.get(self.url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect to {}: {}".format(sanitise_url(self.url), error))
            return self.things
        if current_req.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url),
                                                                current_req.text))
        else:
            current_json = current_req.json()
            for thing in current_json:
                logging.info(thing)
                self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
            logging.info("Found {} things.".format(len(self.things)))
        # Keep the count reported by download() in step with what was found.
        self.total = len(self.things)
        return self.things

    def download(self):
        """ Downloads all the files in a collection

        :raises ValueError: if the object has no URL or no download_dir.
        """
        if not self.things:
            # Reading the property triggers the fetch; calling its result
            # (a list) would raise TypeError.
            self.total = len(self.get)

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            # Thing.download() needs the API key as its third argument.
            return_code = Thing(thing).download(self.download_dir, self.compress, API_KEY)
            if self.quick and return_code == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return
975060c9 276
73695baf 277
3522a3bf
OM
class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick, compress):
        """ Look up the named collection of a user.

        The collection id is resolved via the API during construction. If
        that fails the error is logged and the object is left without url
        and download_dir.
        """
        super().__init__(quick, compress)
        self.user = user
        self.name = name
        self.paginated = False

        # need to figure out the the ID for the collection
        collection_url = API_USER_COLLECTIONS.format(user, API_KEY)
        try:
            response = SESSION.get(collection_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for collections for user {}: {}".format(
                self.user, error))
            return

        if response.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(response.status_code, sanitise_url(collection_url),
                                                                response.text))
            return

        # case insensitive to retain parity with previous behaviour
        matches = [entry for entry in response.json()
                   if entry['name'].casefold() == name.casefold()]
        if not matches:
            logging.error("Unable to find collection {} for user {}".format(name, user))
            return

        # Several collections may share a name; the first one wins.
        self.collection_id = matches[0]['id']
        self.url = API_COLLECTION_THINGS.format(self.collection_id, API_KEY)

        folder_name = "{}-{}".format(slugify(self.user), slugify(self.name))
        self.download_dir = os.path.join(directory, folder_name)
3522a3bf 311
dbdb1782 312
3522a3bf
OM
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory, quick, compress):
        """
        :param user: name of the user whose designs are wanted.
        :param directory: parent directory for the download directory.
        :param quick: see Grouping.
        :param compress: see Grouping.
        """
        Grouping.__init__(self, quick, compress)
        self.user = user
        # str.format() ignores surplus positional arguments, so this works
        # whether or not the template itself carries the token placeholder.
        url = API_USER_DESIGNS.format(user, API_KEY)
        if "access_token=" not in url:
            # Every other endpoint in this module is called with the access
            # token; without it this request is presumably rejected.
            separator = "&" if "?" in url else "?"
            url = url + separator + ACCESS_QP.format(API_KEY)
        self.url = url
        self.paginated = True
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
975060c9 323
dbdb1782 324
3c82f75b
OM
class Thing:
    """ An individual design on thingiverse.

    Typical use is ``Thing(link).download(base_dir, compress, api_key)``.
    The download directory is "<base_dir>/<thing id> - <slugified name>".
    """

    def __init__(self, thing_link):
        """
        :param thing_link: ThingLink giving at least the thing id; an empty
            name is filled in from the API when the thing is parsed.
        """
        self.thing_id = thing_link.thing_id
        self.name = thing_link.name
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.download_dir = None
        self.time_stamp = None
        self._file_links = FileLinks()
        self._image_links = []
        # Defined up front so download() can test them even when the API
        # response carries no license or description.
        self._license = None
        self._details = None
        self.slug = None

    @classmethod
    def from_thing_id(cls, thing_id):
        """
        Factory method that creates a Thing object from a bare ID. No lookup
        happens here; the name is filled in when the thing is parsed.
        :param thing_id: to look up
        :return: Thing
        """
        return cls(ThingLink(thing_id, "", ""))

    def _parse(self, base_dir, api_key):
        """ Work out what, if anything needs to be done.

        Queries the details, files and images of the thing and compares the
        newest file date with the last download found under base_dir.
        On success self._parsed is True and self._needs_download says
        whether there is anything new; on any failure the problem is logged
        and self._parsed stays False.
        """
        if self._parsed:
            return

        # A previous, failed attempt may have left partial results behind.
        self._file_links = FileLinks()
        self._image_links = []

        # First get the broad details
        url = API_THING_DETAILS.format(self.thing_id, api_key)
        # The URL contains the API key: mask it, and keep it out of the
        # error level.
        logging.debug("Querying {}".format(sanitise_url(url)))
        try:
            current_req = SESSION.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        # Check for DMCA
        if current_req.status_code == 403:
            logging.error("Access to thing {} is forbidden".format(self.thing_id))
            return
        if current_req.status_code != 200:
            logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(url),
                                                                          current_req.text))
            return

        thing_json = current_req.json()
        try:
            self._license = thing_json['license']
        except KeyError:
            logging.warning("No license found for thing {}?".format(self.thing_id))
            self._license = None

        # TODO: Get non-html version of this?
        try:
            self._details = thing_json['details']
        except KeyError:
            logging.warning("No description found for thing {}?".format(self.thing_id))
            self._details = None

        if not self.name:
            # Probably generated with factory method.
            try:
                self.name = thing_json['name']
            except KeyError:
                logging.warning("No name found for thing {}?".format(self.thing_id))
                self.name = self.thing_id

        # Now get the file details
        file_url = API_THING_FILES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(file_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error("Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(file_url),
                                                                        current_req.text))
            return

        link_list = current_req.json()

        if not link_list:
            logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(
                self.thing_id))

        for link in link_list:
            logging.debug("Parsing link: {}".format(sanitise_url(link['url'])))
            try:
                datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
            except ValueError:
                # Unparseable date: the file is skipped.
                logging.error(link['date'])
                continue
            self._file_links.append(
                FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(api_key)))

        # Finally get the image links
        image_url = API_THING_IMAGES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(image_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error(
                "Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(image_url),
                                                              current_req.text))
            return

        image_list = current_req.json()

        if not image_list:
            logging.warning(
                "No images found for thing {} - probably thingiverse being iffy as this seems unlikely".format(
                    self.thing_id))

        for image in image_list:
            logging.debug("parsing image: {}".format(image))
            name = None
            try:
                name = slugify(image['name'])
                # TODO: fallback to other types
                image_link = [x for x in image['sizes'] if x['type'] == 'display' and x['size'] == 'large'][0]['url']
            except (KeyError, IndexError):
                # No usable link for this image: skip it rather than record
                # a link that belongs to something else.
                logging.warning("Missing image for {}".format(name))
                continue
            self._image_links.append(ImageLink(name, image_link))

        self.slug = "{} - {}".format(self.thing_id, slugify(self.name))
        self.download_dir = os.path.join(base_dir, self.slug)

        self._handle_old_directory(base_dir)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.name))
        latest, self.last_time = self._find_last_download(base_dir)

        if not latest:
            # Not yet downloaded
            self._parsed = True
            return

        logging.info("last downloaded version: {}".format(self.last_time))

        # OK, so we have a timestamp, lets see if there is anything new to get
        # First off, are we comparing an old download that threw away the timestamp?
        ignore_time = self.last_time == strip_time(self.last_time)
        # TODO: Allow for comparison at the exact time
        files_last_update = self._file_links.last_update
        if files_last_update is None:
            logging.warning("No files found for {}.".format(self.thing_id))
        else:
            if ignore_time:
                logging.info("Dropping time from comparison stamp as old-style download dir")
                files_last_update = strip_time(files_last_update)

            if files_last_update > self.last_time:
                logging.info(
                    "Found new/updated files {}".format(self._file_links.last_update))
                self._needs_download = True
                self._parsed = True
                return

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def _handle_old_directory(self, base_dir):
        """ Deal with any old directories from previous versions of the code.

        A directory named after the slugified name alone (no thing id) is
        moved to the current download directory name.
        """
        old_dir = os.path.join(base_dir, slugify(self.name))
        if os.path.exists(old_dir):
            logging.warning("Found old style download_dir. Moving.")
            rename_unique(old_dir, self.download_dir)

    def _handle_outdated_directory(self):
        """ Move the current download directory sideways if the thing has changed.

        :return: the new name of the moved directory, or None when there was
            no directory to move.
        """
        if not os.path.exists(self.download_dir):
            # No old directory to move.
            return None
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
        if not os.path.exists(timestamp_file) or self.last_time is None:
            # Old form of download directory, or one whose timestamp could
            # not be read.
            target_dir_name = "{} - old".format(self.download_dir)
        else:
            target_dir_name = "{} - {}".format(self.download_dir, self.last_time.strftime(SAFE_DATETIME_FORMAT))
        return rename_unique(self.download_dir, target_dir_name)

    def _find_last_download(self, base_dir):
        """ Look for the most recent previous download (if any) of the thing.

        Checks the timestamp file of the download directory and the dates
        embedded in the names of matching .7z archives in base_dir.

        :return: (latest, latest_time); both None when nothing was found.
        """
        logging.info("Looking for old things")

        # First the DL directory itself.
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)

        latest = None
        latest_time = None

        try:
            logging.debug("Checking for existing download in normal place.")
            with open(timestamp_file) as ts_fh:
                timestamp_text = ts_fh.read().strip()
            latest_time = datetime.datetime.strptime(timestamp_text, DEFAULT_DATETIME_FORMAT)
            latest = self.download_dir
        except FileNotFoundError:
            # No existing download directory. huh.
            pass
        except (TypeError, ValueError):
            # strptime raises ValueError when the text does not match.
            logging.warning("Invalid timestamp file found in {}".format(self.download_dir))

        # TODO: Maybe look for old download directories.

        # Now look for 7z files
        candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
        # +3 to allow for ' - '
        leading_length = len(self.slug) + 3
        for path in candidates:
            candidate = os.path.basename(path)
            # What is left after the slug and before ".7z" should be the date.
            date_text = candidate[leading_length:-3]
            try:
                logging.debug("Examining '{}' - '{}'".format(candidate, date_text))
                candidate_time = datetime.datetime.strptime(date_text, SAFE_DATETIME_FORMAT)
            except ValueError:
                logging.warning("There was an error finding the date in {}. Ignoring.".format(candidate))
                continue
            if latest_time is None or candidate_time > latest_time:
                latest_time = candidate_time
                latest = candidate
        logging.info("Found last old thing: {} / {}".format(latest, latest_time))
        return latest, latest_time

    def download(self, base_dir, compress, api_key):
        """ Download all files for a given thing.

        :param base_dir: directory the thing's own directory is created in.
        :param compress: archive the result as .7z and remove the directory.
        :param api_key: API key used for the API queries.
        :return: State.OK when the thing was downloaded,
            State.ALREADY_DOWNLOADED when nothing new was found and
            State.FAILED otherwise.
        """
        if not self._parsed:
            self._parse(base_dir, api_key)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            logging.info("{} - {} already downloaded - skipping.".format(self.thing_id, self.name))
            return State.ALREADY_DOWNLOADED

        if not self._file_links:
            logging.error(
                "{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.name))
            return State.FAILED

        # Have we already downloaded some things?
        renamed_dir = self._handle_outdated_directory()

        # Get the list of files to download
        new_file_links = []
        old_file_links = []
        self.time_stamp = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            # Copy, so later appends cannot touch self._file_links.
            new_file_links = list(self._file_links)
            self.time_stamp = max(file_link.last_update for file_link in new_file_links)
            logging.debug("New timestamp will be {}".format(self.time_stamp))
        else:
            self.time_stamp = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    self.time_stamp = max(self.time_stamp, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(self.time_stamp))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        logging.debug("\nd:{}\nf:{}".format(self.download_dir, filelist_file))
        # NOTE(review): fl.link includes the API key, so it ends up in
        # filelist.txt (and in the archive) - confirm this is intended.
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                fl_handle.write("{},{},{}\n".format(fl.link, fl.name, fl.last_update))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        if renamed_dir:
            for file_link in old_file_links:
                try:
                    old_file = os.path.join(renamed_dir, file_link.name)
                    new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
                    logging.debug("Copying {} to {}".format(old_file, new_file))
                    copyfile(old_file, new_file)
                except FileNotFoundError:
                    logging.warning(
                        "Unable to find {} in old archive, redownloading".format(file_link.name))
                    new_file_links.append(file_link)
                except TypeError:
                    # Not altogether sure how this could occur, possibly with some combination of the old file types
                    logging.warning(
                        "Typeerror looking for {} in {}".format(file_link.name, renamed_dir))
                    new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        current_name = None
        try:
            for file_link in new_file_links:
                current_name = file_link.name
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, sanitise_url(file_link.link), file_name))
                data_req = SESSION.get(file_link.link)
                if data_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(data_req.status_code,
                                                                                sanitise_url(file_link.link),
                                                                                data_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED

                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(current_name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # People like images.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        # Tracked separately so the error message below is always defined,
        # even when creating the directory itself fails.
        current_name = image_dir
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                current_name = imagelink.name
                filename = os.path.join(image_dir, imagelink.name)
                image_req = SESSION.get(imagelink.link)
                if image_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(image_req.status_code,
                                                                                sanitise_url(imagelink.link),
                                                                                image_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED
                with open(truncate_name(filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(current_name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # Best get some licenses
        logging.info("writing license file")
        try:
            if self._license:
                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w',
                          encoding="utf-8") as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        logging.info("writing readme")
        try:
            if self._details:
                with open(truncate_name(os.path.join(self.download_dir, 'readme.txt')), 'w',
                          encoding="utf-8") as readme_handle:
                    readme_handle.write("{}\n".format(self._details))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        try:
            # Now write the timestamp, in the format _find_last_download()
            # reads it back with.
            with open(os.path.join(self.download_dir, TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(self.time_stamp.strftime(DEFAULT_DATETIME_FORMAT))
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.name))
        if not compress:
            return State.OK

        thing_dir = "{} - {} - {}".format(self.thing_id,
                                          slugify(self.name),
                                          self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
        file_name = os.path.join(base_dir,
                                 "{}.7z".format(thing_dir))
        logging.debug("Compressing {} to {}".format(
            self.name,
            file_name))
        with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
            archive.writeall(self.download_dir, thing_dir)
        logging.debug("Compression of {} finished.".format(self.name))
        shutil.rmtree(self.download_dir)
        logging.debug("Removed temporary download dir of {}.".format(self.name))
        return State.OK
975060c9 732
dbdb1782 733
def do_batch(batch_file, download_dir, quick, compress):
    """ Read a file in line by line, parsing each as a set of calls to this script.

    Recognised instructions, one per line, whitespace separated:
        thing <thing id>
        collection <user> <collection name>
        user <user>
    Empty lines are skipped; anything else, including an instruction that
    lacks its arguments, is logged and skipped.

    :param batch_file: name of the file to read.
    :param download_dir: directory to download into.
    :param quick: passed on to Collection / Designs.
    :param compress: whether to archive finished downloads.
    """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command, *arguments = line.split()
            if command == "thing" and len(arguments) >= 1:
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                # Thing.download() needs the API key as its third argument.
                Thing.from_thing_id(arguments[0]).download(download_dir, compress, API_KEY)
                continue
            if command == "collection" and len(arguments) >= 2:
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(arguments[0], arguments[1],
                           download_dir, quick, compress).download()
                continue
            if command == "user" and len(arguments) >= 1:
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(arguments[0], download_dir, quick, compress).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
761
dbdb1782 762
975060c9
OM
def main():
    """ Entry point for script being run as a command.

    Parses the command line, sets up logging and the API key, starts the
    downloader process(es) and dispatches on the chosen subcommand.
    Exits with status 1 when no subcommand is given.
    """
    global API_KEY

    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")
    parser.add_argument("-c", "--compress", action="store_true",
                        help="Compress files")
    parser.add_argument("-a", "--api-key",
                        help="API key for thingiverse")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space seperated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space seperated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space seperated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if args.subcommand == "version":
        # Needs neither an API key nor any downloader process.
        print("thingy_grabber.py version {}".format(VERSION))
        return
    if not args.directory:
        args.directory = os.getcwd()

    # Attach the handlers first so that every message below, including the
    # missing API key error, goes through them.
    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())
    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    if args.api_key:
        API_KEY = args.api_key
    else:
        try:
            with open("api.key") as fh:
                API_KEY = fh.read().strip()
        except Exception as e:
            logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
            logging.error("Exception: {}".format(e))
            return

    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory, args.compress, API_KEY) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick, args.compress).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick, args.compress).download()
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick, args.compress)

    # Stop the downloader processes: one None per worker, then wait for
    # them to drain the queue and exit.
    for _ in downloaders:
        thing_queue.put(None)
    for downloader in downloaders:
        downloader.join()
975060c9 857
d194b140 858
if __name__ == "__main__":
    # Must run before any process is started when the program has been
    # frozen into a Windows executable; it has no effect otherwise.
    multiprocessing.freeze_support()
    main()