Make readmes text files
[clinton/thingy_grabber.git] / thingy_grabber.py
CommitLineData
975060c9
OM
1#!/usr/bin/env python3
2"""
3Thingiverse bulk downloader
4"""
5
6import re
4a98996b 7import sys
975060c9
OM
8import os
9import argparse
10import unicodedata
11import requests
fa2f3251 12import logging
6a777954 13import multiprocessing
7b84ba6d 14import enum
fb28c59b 15import datetime
3c82f75b 16from shutil import copyfile
b497d705 17from dataclasses import dataclass
9828dabe 18import py7zr
8ed15058
OM
19import glob
20import shutil
e1306099
OM
21from io import StringIO
22from html.parser import HTMLParser
975060c9 23
ae598d73
OM
24SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]
25
8ed15058
OM
26# I don't think this is exported by datetime
27DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
3ac180ed
OM
28# Windows cannot handle : in filenames
29SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'
8ed15058 30
73695baf
OM
31API_BASE = "https://api.thingiverse.com"
32ACCESS_QP = "access_token={}"
33PAGE_QP = "page={}"
714415bd 34API_USER_DESIGNS = API_BASE + "/users/{}/things/?" + ACCESS_QP
e45ba963 35API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP
975060c9 36
e45ba963
OM
37# Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS
38API_COLLECTION = API_BASE + "/collections/{}/?" + ACCESS_QP
39API_COLLECTION_THINGS = API_BASE + "/collections/{}/things/?" + ACCESS_QP
40
41API_THING_DETAILS = API_BASE + "/things/{}/?" + ACCESS_QP
42API_THING_FILES = API_BASE + "/things/{}/files/?" + ACCESS_QP
43API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP
10f0238d 44API_THING_DOWNLOAD = "/download/?" + ACCESS_QP
e45ba963 45
6a777954 46DOWNLOADER_COUNT = 1
7b84ba6d 47RETRY_COUNT = 3
6a777954 48
65bd8b43
OM
49MAX_PATH_LENGTH = 250
50
e1306099 51VERSION = "0.10.4"
dbdb1782 52
8ed15058 53TIMESTAMP_FILE = "timestamp.txt"
b497d705 54
e45ba963 55SESSION = requests.Session()
b497d705 56
73695baf 57
e1306099
OM
class MLStripper(HTMLParser):
    """ Turns HTML markup into plain text
    """

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        # Accumulates the text content of everything fed to the parser.
        self.text = StringIO()

    def error(self, message):
        # Surface parser errors as ValueError so callers can catch them.
        raise ValueError(message)

    def handle_data(self, d):
        # Called by HTMLParser for each run of text between tags.
        self.text.write(d)

    def get_data(self):
        """ Return all text collected so far as one string. """
        return self.text.getvalue()

    @staticmethod
    def strip_tags(html):
        """ Strip the markup from `html`, returning just its text content. """
        stripper = MLStripper()
        stripper.feed(html)
        return stripper.get_data()
83
e45ba963
OM
@dataclass
class ThingLink:
    """ Lightweight reference to a single thing as returned by the API. """
    thing_id: str
    name: str
    api_link: str
b497d705 89
73695baf 90
b497d705
OM
@dataclass
class FileLink:
    """ A single downloadable file belonging to a thing. """
    name: str
    last_update: datetime.datetime
    link: str
96
73695baf 97
e45ba963
OM
@dataclass
class ImageLink:
    """ A single image belonging to a thing. """
    name: str
    link: str
102
73695baf 103
class FileLinks:
    """ A list-like container of FileLink objects that also tracks the
    newest last_update stamp seen across everything appended.
    """

    def __init__(self, initial_links=None):
        self.links = []
        self.last_update = None
        for entry in (initial_links or []):
            self.append(entry)

    def __iter__(self):
        return iter(self.links)

    def __getitem__(self, index):
        return self.links[index]

    def __len__(self):
        return len(self.links)

    def append(self, link):
        """ Add a link, folding its last_update into the running maximum. """
        # max() raises TypeError while last_update is still None (or the
        # link's stamp is None); in that case just take the new value.
        try:
            self.last_update = max(self.last_update, link.last_update)
        except TypeError:
            self.last_update = link.last_update
        self.links.append(link)
8ed15058 128
b497d705 129
7b84ba6d
OM
class State(enum.Enum):
    """ Outcome of attempting to download a thing. """
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()
134
73695baf 135
e45ba963
OM
def sanitise_url(url):
    """ remove api keys from an url
    """
    masked = re.sub(r'access_token=\w*', 'access_token=***', url)
    return masked
142
73695baf 143
e45ba963
OM
def strip_time(date_obj):
    """ Takes a datetime object and returns another with the time set to 00:00
    """
    midnight = datetime.time()
    return datetime.datetime.combine(date_obj.date(), midnight)
148
73695baf 149
8ed15058
OM
def rename_unique(dir_name, target_dir_name):
    """ Move a directory sideways to a new name, ensuring it is unique.

    If target_dir_name is taken, an incrementing _N suffix is appended
    until a free name is found. Returns the name actually used.
    """
    target_dir = target_dir_name
    inc = 0
    while os.path.exists(target_dir):
        # Keep probing target_dir_name_0, target_dir_name_1, ... until free.
        target_dir = "{}_{}".format(target_dir_name, inc)
        inc += 1
    os.rename(dir_name, target_dir)
    return target_dir
160
161
def fail_dir(dir_name):
    """ When a download has failed, move it sideways.
    """
    failed_name = "{}_failed".format(dir_name)
    return rename_unique(dir_name, failed_name)
65bd8b43
OM
166
167
def truncate_name(file_name):
    """ Ensure the filename is not too long for, well, windows basically.

    Returns the absolute path unchanged when it already fits within
    MAX_PATH_LENGTH. Otherwise the base name is cut down so that the
    path (plus a short uniquifying "_N" suffix and the extension) fits,
    and the first such path that does not already exist is returned.
    """
    path = os.path.abspath(file_name)
    if len(path) <= MAX_PATH_LENGTH:
        return path
    base, extension = os.path.splitext(path)
    # BUG FIX: previously the over-long base was kept as-is, so the
    # returned path was still too long. Cut it down, leaving room for
    # the extension and a "_NN" uniquifying suffix.
    base = base[:MAX_PATH_LENGTH - len(extension) - 4]
    inc = 0
    new_path = "{}_{}{}".format(base, inc, extension)
    while os.path.exists(new_path):
        # BUG FIX: increment before reformatting; the old loop re-tested
        # the same candidate once per pass.
        inc += 1
        new_path = "{}_{}{}".format(base, inc, extension)
    return new_path
181
182
975060c9
OM
def slugify(value):
    """
    Normalise a string for use as a filename: removes characters that
    are invalid in filenames and converts the string to lowercase.
    """
    logging.debug("Sluggyfying {}".format(value))
    # Unicode-normalise, lowercase and trim surrounding whitespace.
    normalised = unicodedata.normalize('NFKC', value).lower().strip()
    # Drop characters that are illegal in (Windows) filenames.
    without_invalid = re.sub(r'[\\/<>:?*|"]', '', normalised)
    # Windows also dislikes trailing dots.
    without_trailing_dots = re.sub(r'\.*$', '', without_invalid)
    return without_trailing_dots.strip()
975060c9 193
b497d705 194
6a777954
OM
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.

    Pulls work items off a JoinableQueue until it receives the None
    sentinel, downloading each thing as it goes.
    """

    def __init__(self, thing_queue, download_directory, compress, api_key):
        """
        :param thing_queue: JoinableQueue of thing ids (str) or ThingLinks
        :param download_directory: base directory to download into
        :param compress: whether to 7z-compress completed downloads
        :param api_key: thingiverse API key passed through to downloads
        """
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory
        self.compress = compress
        self.api_key = api_key

    def run(self):
        """ actual download loop.
        """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                # None is the shutdown sentinel posted by main().
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                break
            # Queue items are either raw id strings or ThingLink objects.
            thing = None
            if isinstance(thing_id, str):
                thing = Thing.from_thing_id(thing_id)
            if isinstance(thing_id, ThingLink):
                thing = Thing(thing_id)
            if not thing:
                logging.error("Don't know how to handle thing_id {}".format(thing_id))
            else:
                logging.info("Handling id {}".format(thing_id))
                thing.download(self.download_directory, self.compress, self.api_key)
            # Exactly one task_done() per get() so queue.join() can complete.
            self.thing_queue.task_done()
        return
229
7b84ba6d 230
class Grouping:
    """ Holds details of a group of things for download
    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.
    """

    def __init__(self, quick, compress, api_key):
        """
        :param quick: stop downloading when a known datestamp is hit
        :param compress: compress each completed download into a 7z archive
        :param api_key: thingiverse API key used for all requests
        """
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        self.compress = compress
        self.api_key = api_key
        # These should be set by child classes.
        self.url = None
        self.download_dir = None

    @property
    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(sanitise_url(self.url)))

        # self.url should already have been formatted as we don't need pagination
        logging.info("requesting:{}".format(sanitise_url(self.url)))
        current_req = SESSION.get(self.url)
        if current_req.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url),
                                                                current_req.text))
        else:
            current_json = current_req.json()
            for thing in current_json:
                logging.info(thing)
                self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
        # BUG FIX: self.total was never updated, so download() always logged
        # "Downloading 0 thing(s)." - keep it in sync with what we found.
        self.total = len(self.things)
        logging.info("Found {} things.".format(len(self.things)))
        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            # NB: "get" is a property - evaluating it performs the fetch.
            self.get

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            return_code = Thing(thing).download(self.download_dir, self.compress, self.api_key)
            if self.quick and return_code == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return
975060c9 304
73695baf 305
3522a3bf
OM
class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick, compress, api_key):
        """ Look up the named collection for the user and prime the
        grouping with its things URL.

        On any failure (connection error, bad status, unknown collection)
        the constructor logs and returns early, leaving self.url unset so
        Grouping.get will refuse to run.

        :param user: owner of the collection
        :param name: collection name (matched case-insensitively)
        :param directory: parent directory for the download directory
        :param quick: see Grouping
        :param compress: see Grouping
        :param api_key: thingiverse API key
        """
        Grouping.__init__(self, quick, compress, api_key)
        self.user = user
        self.name = name
        self.paginated = False
        # need to figure out the ID for the collection
        collection_url = API_USER_COLLECTIONS.format(user, api_key)
        try:
            current_req = SESSION.get(collection_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for collections for user {}: {}".format(
                self.user, error))
            return
        if current_req.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(collection_url),
                                                                current_req.text))
            return
        collection_list = current_req.json()
        try:
            # case insensitive to retain parity with previous behaviour
            collection = [x for x in collection_list if x['name'].casefold() == name.casefold()][0]
        except IndexError:
            logging.error("Unable to find collection {} for user {}".format(name, user))
            return
        self.collection_id = collection['id']
        self.url = API_COLLECTION_THINGS.format(self.collection_id, api_key)

        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
3522a3bf 339
dbdb1782 340
3522a3bf
OM
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory, quick, compress, api_key):
        """
        :param user: the user whose designs to fetch
        :param directory: parent directory for the download directory
        :param quick: see Grouping
        :param compress: see Grouping
        :param api_key: thingiverse API key
        """
        Grouping.__init__(self, quick, compress, api_key)
        self.user = user
        self.url = API_USER_DESIGNS.format(user, api_key)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
975060c9 350
dbdb1782 351
3c82f75b
OM
class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_link):
        """
        :param thing_link: ThingLink carrying the id, name and api link
        """
        self.thing_id = thing_link.thing_id
        self.name = thing_link.name
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.download_dir = None
        self.time_stamp = None
        self._file_links = FileLinks()
        self._image_links = []

    @classmethod
    def from_thing_id(cls, thing_id):
        """
        Factory method that looks up a thing by ID and creates a Thing object for it
        :param thing_id: to look up
        :return: Thing or None
        """
        return Thing(ThingLink(thing_id, "", ""))

    def _parse(self, base_dir, api_key):
        """ Work out what, if anything needs to be done.

        Fetches thing details, file links and image links from the API,
        then compares the newest file stamp against any previous download
        to decide whether a new download is needed. On any failure it
        returns early without setting self._parsed, which download()
        treats as a parse failure.
        """
        if self._parsed:
            return

        # First get the broad details
        url = API_THING_DETAILS.format(self.thing_id, api_key)
        try:
            current_req = SESSION.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        # Check for DMCA
        if current_req.status_code == 403:
            logging.error("Access to thing {} is forbidden".format(self.thing_id))
            return
        if current_req.status_code != 200:
            logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(url),
                                                                          current_req.text))
            return

        thing_json = current_req.json()
        try:
            self._license = thing_json['license']
        except KeyError:
            logging.warning("No license found for thing {}?".format(self.thing_id))

        details = None
        try:
            details = thing_json['details']
        except KeyError:
            logging.warning("No description found for thing {}?".format(self.thing_id))

        if details:
            # Strip the HTML from the description; fall back to the raw
            # markup if the parser chokes.
            try:
                self._details = MLStripper.strip_tags(details)
            except ValueError as e:
                logging.warning("Unable to strip HTML from readme: {}".format(e))
                self._details = details

        if not self.name:
            # Probably generated with factory method.
            try:
                self.name = thing_json['name']
            except KeyError:
                logging.warning("No name found for thing {}?".format(self.thing_id))
                self.name = self.thing_id

        # Now get the file details
        file_url = API_THING_FILES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(file_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error("Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(file_url),
                                                                        current_req.text))
            return

        link_list = current_req.json()

        if not link_list:
            logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(
                self.thing_id))

        for link in link_list:
            logging.debug("Parsing link: {}".format(sanitise_url(link['url'])))
            try:
                datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
                self._file_links.append(
                    FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(api_key)))
            except ValueError:
                # Unparseable datestamp - log it and skip the file.
                logging.error(link['date'])

        # Finally get the image links
        image_url = API_THING_IMAGES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(image_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error(
                "Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(image_url),
                                                              current_req.text))
            return

        image_list = current_req.json()

        if not image_list:
            logging.warning(
                "No images found for thing {} - probably thingiverse being iffy as this seems unlikely".format(
                    self.thing_id))

        for image in image_list:
            logging.debug("parsing image: {}".format(image))
            name = None
            try:
                name = slugify(image['name'])
                # TODO: fallback to other types
                url = [x for x in image['sizes'] if x['type'] == 'display' and x['size'] == 'large'][0]['url']
            except KeyError:
                logging.warning("Missing image for {}".format(name))
            # NOTE(review): if the very first image raises KeyError above,
            # `url` is unbound here and this raises NameError - confirm.
            self._image_links.append(ImageLink(name, url))

        self.slug = "{} - {}".format(self.thing_id, slugify(self.name))
        self.download_dir = os.path.join(base_dir, self.slug)

        self._handle_old_directory(base_dir)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.name))
        latest, self.last_time = self._find_last_download(base_dir)

        if not latest:
            # Not yet downloaded
            self._parsed = True
            return

        logging.info("last downloaded version: {}".format(self.last_time))

        # OK, so we have a timestamp, lets see if there is anything new to get
        # First off, are we comparing an old download that threw away the timestamp?
        ignore_time = self.last_time == strip_time(self.last_time)
        try:
            # TODO: Allow for comparison at the exact time
            files_last_update = self._file_links.last_update
            if ignore_time:
                logging.info("Dropping time from comparison stamp as old-style download dir")
                files_last_update = strip_time(files_last_update)

            if files_last_update > self.last_time:
                logging.info(
                    "Found new/updated files {}".format(self._file_links.last_update))
                self._needs_download = True
                self._parsed = True
                return
        except TypeError:
            # files_last_update is None when no files were parsed.
            logging.warning("No files found for {}.".format(self.thing_id))

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def _handle_old_directory(self, base_dir):
        """ Deal with any old directories from previous versions of the code.
        """
        # Older versions used just the slugified name without the thing id.
        old_dir = os.path.join(base_dir, slugify(self.name))
        if os.path.exists(old_dir):
            logging.warning("Found old style download_dir. Moving.")
            rename_unique(old_dir, self.download_dir)

    def _handle_outdated_directory(self):
        """ Move the current download directory sideways if the thing has changed.

        Returns the name the directory was moved to, or None if there was
        nothing to move.
        """
        if not os.path.exists(self.download_dir):
            # No old directory to move.
            return None
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
        if not os.path.exists(timestamp_file):
            # Old form of download directory
            target_dir_name = "{} - old".format(self.download_dir)
        else:
            target_dir_name = "{} - {}".format(self.download_dir, self.last_time.strftime(SAFE_DATETIME_FORMAT))
        return rename_unique(self.download_dir, target_dir_name)

    def _find_last_download(self, base_dir):
        """ Look for the most recent previous download (if any) of the thing.

        Checks the expected download directory's timestamp file first,
        then any compressed "<id> - <name> - <stamp>.7z" archives.
        Returns a (latest, latest_time) pair, both None if nothing found.
        """
        logging.info("Looking for old things")

        # First the DL directory itself.
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)

        latest = None
        latest_time = None

        try:
            logging.debug("Checking for existing download in normal place.")
            with open(timestamp_file) as ts_fh:
                timestamp_text = ts_fh.read().strip()
                latest_time = datetime.datetime.strptime(timestamp_text, DEFAULT_DATETIME_FORMAT)
                latest = self.download_dir
        except FileNotFoundError:
            # No existing download directory. huh.
            pass
        except TypeError:
            logging.warning("Invalid timestamp file found in {}".format(self.download_dir))

        # TODO: Maybe look for old download directories.

        # Now look for 7z files
        candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
        # +3 to allow for ' - '
        leading_length = len(self.slug) + 3
        for path in candidates:
            candidate = os.path.basename(path)
            try:
                # Slice off the slug prefix and the ".7z" suffix to get the stamp.
                logging.debug("Examining '{}' - '{}'".format(candidate, candidate[leading_length:-3]))
                candidate_time = datetime.datetime.strptime(candidate[leading_length:-3], SAFE_DATETIME_FORMAT)
            except ValueError:
                logging.warning("There was an error finding the date in {}. Ignoring.".format(candidate))
                continue
            try:
                if candidate_time > latest_time:
                    latest_time = candidate_time
                    latest = candidate
            except TypeError:
                # latest_time still None - first valid candidate wins.
                latest_time = candidate_time
                latest = candidate
        logging.info("Found last old thing: {} / {}".format(latest, latest_time))
        return latest, latest_time

    def download(self, base_dir, compress, api_key):
        """ Download all files for a given thing.

        Returns a State value: OK when the thing is now downloaded,
        ALREADY_DOWNLOADED when nothing new was found, FAILED otherwise.
        """
        if not self._parsed:
            self._parse(base_dir, api_key)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            logging.info("{} - {} already downloaded - skipping.".format(self.thing_id, self.name))
            return State.ALREADY_DOWNLOADED

        if not self._file_links:
            logging.error(
                "{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.name))
            return State.FAILED

        # Have we already downloaded some things?
        renamed_dir = self._handle_outdated_directory()

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        self.time_stamp = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            self.time_stamp = new_file_links[0].last_update

            for file_link in new_file_links:
                self.time_stamp = max(self.time_stamp, file_link.last_update)
            logging.debug("New timestamp will be {}".format(self.time_stamp))
        else:
            # Partition into files newer than the last download (fetch)
            # and unchanged ones (copy from the renamed directory).
            self.time_stamp = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    self.time_stamp = max(self.time_stamp, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(self.time_stamp))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                fl_handle.write("{},{},{}\n".format(fl.link, fl.name, fl.last_update))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        if renamed_dir:
            for file_link in old_file_links:
                try:
                    old_file = os.path.join(renamed_dir, file_link.name)
                    new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
                    logging.debug("Copying {} to {}".format(old_file, new_file))
                    copyfile(old_file, new_file)
                except FileNotFoundError:
                    logging.warning(
                        "Unable to find {} in old archive, redownloading".format(file_link.name))
                    new_file_links.append(file_link)
                except TypeError:
                    # Not altogether sure how this could occur, possibly with some combination of the old file types
                    logging.warning(
                        "Typeerror looking for {} in {}".format(file_link.name, renamed_dir))
                    new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = SESSION.get(file_link.link)
                if data_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(data_req.status_code,
                                                                                sanitise_url(file_link.link),
                                                                                data_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED

                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # People like images.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.join(image_dir, imagelink.name)
                image_req = SESSION.get(imagelink.link)
                if image_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(image_req.status_code,
                                                                                sanitise_url(imagelink.link),
                                                                                image_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED
                with open(truncate_name(filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(imagelink.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # Best get some licenses
        logging.info("writing license file")
        try:
            if self._license:
                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w',
                          encoding="utf-8") as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        logging.info("writing readme")
        try:
            if self._details:
                with open(truncate_name(os.path.join(self.download_dir, 'readme.txt')), 'w',
                          encoding="utf-8") as readme_handle:
                    readme_handle.write("{}\n".format(self._details))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        try:
            # Now write the timestamp
            with open(os.path.join(self.download_dir, TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(self.time_stamp.__str__())
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.name))
        if not compress:
            return State.OK

        # Compress the whole download directory into a dated 7z archive
        # and remove the uncompressed copy.
        thing_dir = "{} - {} - {}".format(self.thing_id,
                                          slugify(self.name),
                                          self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
        file_name = os.path.join(base_dir,
                                 "{}.7z".format(thing_dir))
        logging.debug("Compressing {} to {}".format(
            self.name,
            file_name))
        with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
            archive.writeall(self.download_dir, thing_dir)
        logging.debug("Compression of {} finished.".format(self.name))
        shutil.rmtree(self.download_dir)
        logging.debug("Removed temporary download dir of {}.".format(self.name))
        return State.OK
975060c9 766
dbdb1782 767
def do_batch(batch_file, download_dir, quick, compress, api_key=None):
    """ Read a file in line by line, parsing each as a set of calls to this script.

    :param batch_file: name of the instruction file to read
    :param download_dir: directory to download things into
    :param quick: stop at the first already-downloaded thing
    :param compress: compress completed downloads
    :param api_key: thingiverse API key forwarded to the downloads
        (new, optional - previously the calls below omitted it and raised
        TypeError against the current Thing/Collection/Designs signatures)
    """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing.from_thing_id(command_arr[1]).download(download_dir, compress, api_key)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick, compress, api_key).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick, compress, api_key).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
795
dbdb1782 796
975060c9
OM
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")
    parser.add_argument("-c", "--compress", action="store_true",
                        help="Compress files")
    parser.add_argument("-a", "--api-key",
                        help="API key for thingiverse")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space seperated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space seperated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space seperated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        # Default to downloading into the current working directory.
        args.directory = os.getcwd()

    # Root logger at DEBUG; handlers filter to their own levels.
    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())
    # NOTE(review): `formatter` is only attached to the file handler below,
    # so console output uses the default format - confirm that is intended.

    # API key comes from the command line or an "api.key" file in the cwd.
    if args.api_key:
        api_key = args.api_key
    else:
        try:
            with open("api.key") as fh:
                api_key = fh.read().strip()
        except Exception as e:
            logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
            logging.error("Exception: {}".format(e))
            return

    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory, args.compress, api_key) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    # Only the "thing" subcommand uses the worker queue; collections and
    # user designs are downloaded synchronously here.
    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick, args.compress, api_key).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick, args.compress, api_key).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        # NOTE(review): api_key is not forwarded to do_batch here - confirm
        # batch mode works against the current download signatures.
        do_batch(args.batch_file, args.directory, args.quick, args.compress)

    # Stop the downloader processes
    # One None sentinel per worker tells each downloader to exit its loop.
    for _ in downloaders:
        thing_queue.put(None)
975060c9 891
d194b140 892
if __name__ == "__main__":
    # Required for multiprocessing in frozen Windows builds; no-op elsewhere.
    multiprocessing.freeze_support()
    main()