Commit | Line | Data |
---|---|---|
975060c9 OM |
1 | #!/usr/bin/env python3 |
2 | """ | |
3 | Thingiverse bulk downloader | |
4 | """ | |
5 | ||
6 | import re | |
4a98996b | 7 | import sys |
975060c9 OM |
8 | import os |
9 | import argparse | |
10 | import unicodedata | |
11 | import requests | |
fa2f3251 | 12 | import logging |
6a777954 | 13 | import multiprocessing |
7b84ba6d | 14 | import enum |
fb28c59b | 15 | import datetime |
3c82f75b | 16 | from shutil import copyfile |
b497d705 | 17 | from dataclasses import dataclass |
d194b140 | 18 | import atexit |
9828dabe | 19 | import py7zr |
8ed15058 OM |
20 | import glob |
21 | import shutil | |
975060c9 | 22 | |
ae598d73 OM |
23 | SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}] |
24 | ||
8ed15058 OM |
25 | # I don't think this is exported by datetime |
26 | DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S' | |
3ac180ed OM |
27 | # Windows cannot handle : in filenames |
28 | SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S' | |
8ed15058 | 29 | |
e45ba963 OM |
30 | API_BASE="https://api.thingiverse.com" |
31 | ACCESS_QP="access_token={}" | |
32 | PAGE_QP="page={}" | |
33 | API_USER_DESIGNS = API_BASE + "/users/{}/things/" | |
34 | API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP | |
975060c9 | 35 | |
e45ba963 OM |
36 | # Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS |
37 | API_COLLECTION = API_BASE + "/collections/{}/?" + ACCESS_QP | |
38 | API_COLLECTION_THINGS = API_BASE + "/collections/{}/things/?" + ACCESS_QP | |
39 | ||
40 | API_THING_DETAILS = API_BASE + "/things/{}/?" + ACCESS_QP | |
41 | API_THING_FILES = API_BASE + "/things/{}/files/?" + ACCESS_QP | |
42 | API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP | |
43 | ||
44 | API_KEY = None | |
dd8c35f4 | 45 | |
6a777954 | 46 | DOWNLOADER_COUNT = 1 |
7b84ba6d | 47 | RETRY_COUNT = 3 |
6a777954 | 48 | |
65bd8b43 OM |
49 | MAX_PATH_LENGTH = 250 |
50 | ||
e45ba963 | 51 | VERSION = "0.10.0" |
dbdb1782 | 52 | |
8ed15058 | 53 | TIMESTAMP_FILE = "timestamp.txt" |
b497d705 | 54 | |
e45ba963 | 55 | SESSION = requests.Session() |
b497d705 | 56 | |
e45ba963 OM |
@dataclass
class ThingLink:
    """Lightweight reference to a single thing as returned by the listing APIs."""
    thing_id: str
    name: str
    api_link: str
b497d705 OM |
62 | |
@dataclass
class FileLink:
    """One downloadable file attached to a thing."""
    name: str
    # Parsed from the API's 'date' field with DEFAULT_DATETIME_FORMAT.
    last_update: datetime.datetime
    link: str
68 | ||
e45ba963 OM |
@dataclass
class ImageLink:
    """One image attached to a thing (slugified name plus direct URL)."""
    name: str
    link: str
73 | ||
ae598d73 OM |
class FileLinks:
    """Ordered collection of FileLink objects that also tracks the newest
    last_update timestamp seen across all appended links.
    """

    def __init__(self, initial_links=None):
        """Create the collection, optionally seeding it from an iterable.

        BUG FIX: the original default was a shared mutable list (``[]``);
        use the ``None`` sentinel idiom instead.
        """
        self.links = []
        self.last_update = None
        for link in (initial_links if initial_links is not None else []):
            self.append(link)

    def __iter__(self):
        return iter(self.links)

    def __getitem__(self, item):
        return self.links[item]

    def __len__(self):
        return len(self.links)

    def append(self, link):
        """Add a link, keeping last_update at the maximum seen so far."""
        try:
            self.last_update = max(self.last_update, link.last_update)
        except TypeError:
            # First link: self.last_update was still None.
            self.last_update = link.last_update
        self.links.append(link)
8ed15058 | 96 | |
b497d705 | 97 | |
7b84ba6d OM |
class State(enum.Enum):
    """Outcome of an attempt to download a single thing."""
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()
102 | ||
e45ba963 OM |
def sanitise_url(url):
    """Mask any access_token query value in *url* so it is safe to log."""
    masked = re.sub(r'access_token=\w*', 'access_token=***', url)
    return masked
109 | ||
def strip_time(date_obj):
    """Return a copy of *date_obj* with the time-of-day reset to midnight."""
    midnight = datetime.time()
    return datetime.datetime.combine(date_obj.date(), midnight)
114 | ||
8ed15058 OM |
def rename_unique(dir_name, target_dir_name):
    """Move *dir_name* to *target_dir_name*, appending ``_N`` until the
    destination does not already exist. Returns the final destination path.
    """
    candidate = target_dir_name
    suffix = 0
    while True:
        if not os.path.exists(candidate):
            break
        candidate = "{}_{}".format(target_dir_name, suffix)
        suffix += 1
    os.rename(dir_name, candidate)
    return candidate
125 | ||
126 | ||
def fail_dir(dir_name):
    """ When a download has failed, move it sideways. """
    failed_name = "{}_failed".format(dir_name)
    return rename_unique(dir_name, failed_name)
65bd8b43 OM |
131 | |
132 | ||
def truncate_name(file_name):
    """ Ensure the filename is not too long for, well windows basically.

    Returns an absolute path no longer than MAX_PATH_LENGTH, shortening the
    stem (not the extension) when needed and adding a ``_N`` suffix to keep
    the result unique.

    BUG FIX: the original computed how much to cut (``to_cut``) but never
    actually shortened the path, and its uniqueness loop re-tested the same
    candidate name before incrementing the counter.
    """
    path = os.path.abspath(file_name)
    if len(path) <= MAX_PATH_LENGTH:
        return path
    base, extension = os.path.splitext(path)
    # Leave room for the extension plus a "_NN" uniqueness suffix.
    available = MAX_PATH_LENGTH - len(extension) - 3
    base = base[:max(available, 1)]
    new_path = "{}{}".format(base, extension)
    inc = 0
    while os.path.exists(new_path):
        new_path = "{}_{}{}".format(base, inc, extension)
        inc += 1
    return new_path
147 | ||
148 | ||
dd8c35f4 OM |
def strip_ws(value):
    """ Remove whitespace from a string, replacing each run with '-'.

    BUG FIX: the original referenced ``NO_WHITESPACE_REGEX``, which is not
    defined anywhere in this file, so every call raised NameError. The
    pattern is inlined here; ``[\\s]+`` matches the presumed original
    definition — TODO confirm against project history.
    """
    return str(re.sub(r'[\s]+', '-', value))
975060c9 | 152 | |
dbdb1782 | 153 | |
975060c9 OM |
def slugify(value):
    """Lowercase a string and strip characters that are unsafe in filenames
    (plus any trailing dots).
    """
    logging.debug("Sluggyfying {}".format(value))
    cleaned = unicodedata.normalize('NFKC', value).lower().strip()
    cleaned = re.sub(r'[\\/<>:\?\*\|"]', '', cleaned)
    return re.sub(r'\.*$', '', cleaned)
975060c9 | 164 | |
b497d705 | 165 | |
6a777954 OM |
class Downloader(multiprocessing.Process):
    """
    Worker process that pulls thing ids off a queue and downloads each one,
    stopping when it sees a ``None`` sentinel.
    """

    def __init__(self, thing_queue, download_directory, compress):
        multiprocessing.Process.__init__(self)
        self.thing_queue = thing_queue
        self.download_directory = download_directory
        self.compress = compress

    def run(self):
        """Consume the queue until the shutdown sentinel arrives."""
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                # Sentinel: acknowledge it and stop this worker.
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                break
            logging.info("Handling id {}".format(thing_id))
            Thing(thing_id).download(self.download_directory, self.compress)
            self.thing_queue.task_done()
        return
191 | ||
7b84ba6d | 192 | |
6a777954 OM |
193 | |
194 | ||
dbdb1782 | 195 | |
3522a3bf | 196 | class Grouping: |
d66f1f78 | 197 | """ Holds details of a group of things for download |
3c82f75b OM |
198 | This is effectively (although not actually) an abstract class |
199 | - use Collection or Designs instead. | |
200 | """ | |
dbdb1782 | 201 | |
ae598d73 | 202 | def __init__(self, quick, compress): |
975060c9 OM |
203 | self.things = [] |
204 | self.total = 0 | |
205 | self.req_id = None | |
206 | self.last_page = 0 | |
207 | self.per_page = None | |
7b84ba6d OM |
208 | # Should we stop downloading when we hit a known datestamp? |
209 | self.quick = quick | |
ae598d73 | 210 | self.compress = compress |
948bd56f | 211 | # These should be set by child classes. |
3522a3bf OM |
212 | self.url = None |
213 | self.download_dir = None | |
975060c9 | 214 | |
3522a3bf OM |
215 | def get(self): |
216 | """ retrieve the things of the grouping. """ | |
975060c9 OM |
217 | if self.things: |
218 | # We've already done it. | |
219 | return self.things | |
220 | ||
3522a3bf OM |
221 | # Check for initialisation: |
222 | if not self.url: | |
fa2f3251 | 223 | logging.error("No URL set - object not initialised properly?") |
3522a3bf OM |
224 | raise ValueError("No URL set - object not initialised properly?") |
225 | ||
226 | # Get the internal details of the grouping. | |
e45ba963 OM |
227 | logging.debug("Querying {}".format(sanitise_url(self.url))) |
228 | page = 0 | |
229 | # TODO:: Must be a way to refactor this cleanly | |
230 | if self.paginated: | |
231 | # Slightly nasty, but afaik python lacks a clean way to do partial string formatting. | |
232 | page_url = self.url + "?" + ACCESS_QP + "&" + PAGE_QP | |
233 | while True: | |
234 | page += 1 | |
235 | current_url = page_url.format(API_KEY, page) | |
236 | logging.info("requesting:{}".format(sanitise_url(current_url))) | |
237 | current_req = SESSION.get(current_url) | |
238 | if current_req.status_code != 200: | |
239 | logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(current_url), current_req.text)) | |
240 | break | |
241 | current_json = current_req.json() | |
242 | if not current_json: | |
243 | # No more! | |
244 | break | |
245 | for thing in current_json: | |
246 | self.things.append(ThingLink(thing['id'], thing['name'], thing['url'])) | |
247 | else: | |
248 | # self.url should already have been formatted as we don't need pagination | |
249 | logging.info("requesting:{}".format(sanitise_url(self.url))) | |
250 | current_req = SESSION.get(self.url) | |
251 | if current_req.status_code != 200: | |
252 | logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(current_url), current_req.text)) | |
253 | else: | |
254 | current_json = current_req.json() | |
255 | for thing in current_json: | |
256 | logging.info(thing) | |
257 | self.things.append(ThingLink(thing['id'], thing['name'], thing['url'])) | |
258 | logging.info("Found {} things.".format(len(self.things))) | |
975060c9 OM |
259 | return self.things |
260 | ||
261 | def download(self): | |
262 | """ Downloads all the files in a collection """ | |
263 | if not self.things: | |
3522a3bf OM |
264 | self.get() |
265 | ||
266 | if not self.download_dir: | |
dbdb1782 OM |
267 | raise ValueError( |
268 | "No download_dir set - invalidly initialised object?") | |
3522a3bf | 269 | |
975060c9 | 270 | base_dir = os.getcwd() |
975060c9 | 271 | try: |
3522a3bf | 272 | os.mkdir(self.download_dir) |
975060c9 | 273 | except FileExistsError: |
fa2f3251 | 274 | logging.info("Target directory {} already exists. Assuming a resume." |
dbdb1782 | 275 | .format(self.download_dir)) |
fa2f3251 | 276 | logging.info("Downloading {} thing(s).".format(self.total)) |
dbdb1782 | 277 | for idx, thing in enumerate(self.things): |
fb28c59b | 278 | logging.info("Downloading thing {} - {}".format(idx, thing)) |
ae598d73 | 279 | RC = Thing(thing).download(self.download_dir, self.compress) |
7b84ba6d OM |
280 | if self.quick and RC==State.ALREADY_DOWNLOADED: |
281 | logging.info("Caught up, stopping.") | |
282 | return | |
975060c9 | 283 | |
3522a3bf OM |
class Collection(Grouping):
    """ Holds details of a collection.

    Resolves the collection name to an id via the user's collection list,
    then configures the Grouping for the non-paginated things endpoint.
    On any lookup failure this returns early with self.url unset, so a later
    get()/download() raises ValueError (pre-existing behaviour, kept).
    """

    def __init__(self, user, name, directory, quick, compress):
        Grouping.__init__(self, quick, compress)
        self.user = user
        self.name = name
        self.paginated = False
        # need to figure out the the ID for the collection
        collection_url = API_USER_COLLECTIONS.format(user, API_KEY)
        try:
            current_req = SESSION.get(collection_url)
        except requests.exceptions.ConnectionError as error:
            # BUG FIX: the original logged self.thing_id here, an attribute
            # Collection never has, raising AttributeError in the error path.
            logging.error("Unable to connect for collection {}: {}".format(
                name, error))
            return
        if current_req.status_code != 200:
            logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(collection_url), current_req.text))
            return
        collection_list = current_req.json()
        try:
            # case insensitive to retain parity with previous behaviour
            collection = [x for x in collection_list if x['name'].casefold() == name.casefold()][0]
        except IndexError:
            logging.error("Unable to find collection {} for user {}".format(name, user))
            return
        self.collection_id = collection['id']
        self.url = API_COLLECTION_THINGS.format(self.collection_id, API_KEY)

        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
3522a3bf | 315 | |
dbdb1782 | 316 | |
3522a3bf OM |
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory, quick, compress):
        Grouping.__init__(self, quick, compress)
        self.user = user
        self.url = API_USER_DESIGNS.format(user)
        self.paginated = True
        # One directory per user, e.g. "<user> designs".
        dir_name = "{} designs".format(slugify(self.user))
        self.download_dir = os.path.join(directory, dir_name)
975060c9 | 327 | |
dbdb1782 | 328 | |
3c82f75b OM |
class Thing:
    """ An individual design on thingiverse.

    Lifecycle: construct from a ThingLink, then download() which lazily
    calls _parse() to fetch metadata/file/image listings and decide whether
    anything new needs fetching.
    """

    def __init__(self, thing_link):
        self.thing_id = thing_link.thing_id
        self.name = thing_link.name
        self.api_link = thing_link.api_link
        # Timestamp of the most recent previous download found on disk.
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.download_dir = None
        # Timestamp recorded for the download performed by this run.
        self.time_stamp = None
        self._file_links = FileLinks()
        self._image_links = []

    def _parse(self, base_dir):
        """ Work out what, if anything needs to be done.

        Fetches thing details, file list and image list from the API and
        compares the newest file timestamp against any previous download.
        Leaves self._parsed False on any fetch failure, which download()
        treats as a hard error.
        """
        if self._parsed:
            return

        # First get the broad details
        url = API_THING_DETAILS.format(self.thing_id, API_KEY)
        try:
            current_req = SESSION.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        # Check for DMCA
        if current_req.status_code == 403:
            logging.error("Access to thing {} is forbidden".format(self.thing_id))
            return
        if current_req.status_code != 200:
            logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(url), current_req.text))
            return

        thing_json = current_req.json()
        try:
            self._license = thing_json['license']
        except KeyError:
            logging.warning("No license found for thing {}?".format(self.thing_id))

        # TODO: Get non-html version of this?
        try:
            self._details = thing_json['details']
        except KeyError:
            logging.warning("No description found for thing {}?".format(self.thing_id))

        # Now get the file details
        file_url = API_THING_FILES.format(self.thing_id, API_KEY)

        try:
            current_req = SESSION.get(file_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error("Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(file_url), current_req.text))
            return

        link_list = current_req.json()

        if not link_list:
            logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(self.thing_id))

        for link in link_list:
            logging.debug("Parsing link: {}".format(sanitise_url(link['url'])))
            try:
                datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
                self._file_links.append(FileLink(link['name'], datestamp, link['url']))
            except ValueError:
                # Unparseable date - log it and skip the file.
                logging.error(link['date'])

        # Finally get the image links
        image_url = API_THING_IMAGES.format(self.thing_id, API_KEY)

        try:
            current_req = SESSION.get(image_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error("Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(image_url), current_req.text))
            return

        image_list = current_req.json()

        if not image_list:
            logging.warning("No images found for thing {} - probably thingiverse being iffy as this seems unlikely".format(self.thing_id))

        for image in image_list:
            logging.debug("parsing image: {}".format(image))
            name = None
            try:
                name = slugify(image['name'])
                # TODO: fallback to other types
                url = [x for x in image['sizes'] if x['type'] == 'display' and x['size'] == 'large'][0]['url']
            except (KeyError, IndexError):
                # BUG FIX: the original fell through and appended a stale url
                # from the previous iteration (or hit NameError on the first);
                # skip the broken image instead. IndexError is also caught for
                # the case where no 'display'/'large' size exists.
                logging.warning("Missing image for {}".format(name))
                continue
            self._image_links.append(ImageLink(name, url))

        self.slug = "{} - {}".format(self.thing_id, slugify(self.name))
        self.download_dir = os.path.join(base_dir, self.slug)

        self._handle_old_directory(base_dir)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.name))
        latest, self.last_time = self._find_last_download(base_dir)

        if not latest:
            # Not yet downloaded
            self._parsed = True
            return

        logging.info("last downloaded version: {}".format(self.last_time))

        # OK, so we have a timestamp, lets see if there is anything new to get
        # First off, are we comparing an old download that threw away the timestamp?
        ignore_time = self.last_time == strip_time(self.last_time)
        try:
            # TODO: Allow for comparison at the exact time
            files_last_update = self._file_links.last_update
            if ignore_time:
                logging.info("Dropping time from comparison stamp as old-style download dir")
                files_last_update = strip_time(files_last_update)

            if files_last_update > self.last_time:
                logging.info(
                    "Found new/updated files {}".format(self._file_links.last_update))
                self._needs_download = True
                self._parsed = True
                return
        except TypeError:
            # last_update is None when the API returned no files.
            logging.warning("No files found for {}.".format(self.thing_id))

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def _handle_old_directory(self, base_dir):
        """ Deal with any old directories from previous versions of the code.
        """
        old_dir = os.path.join(base_dir, slugify(self.name))
        if os.path.exists(old_dir):
            logging.warning("Found old style download_dir. Moving.")
            rename_unique(old_dir, self.download_dir)

    def _handle_outdated_directory(self, base_dir):
        """ Move the current download directory sideways if the thing has changed.

        Returns the renamed path (so unchanged files can be copied back from
        it) or None when there was nothing to move.
        """
        if not os.path.exists(self.download_dir):
            # No old directory to move.
            return None
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
        if not os.path.exists(timestamp_file):
            # Old form of download directory
            target_dir_name = "{} - old".format(self.download_dir)
        else:
            target_dir_name = "{} - {}".format(self.download_dir, self.last_time.strftime(SAFE_DATETIME_FORMAT))
        return rename_unique(self.download_dir, target_dir_name)

    def _find_last_download(self, base_dir):
        """ Look for the most recent previous download (if any) of the thing.

        Checks the normal download directory's timestamp file and any
        "<thing_id>*.7z" archives in base_dir; returns (location, datetime)
        or (None, None).
        """
        logging.info("Looking for old things")

        # First the DL directory itself.
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)

        latest = None
        latest_time = None

        try:
            logging.debug("Checking for existing download in normal place.")
            with open(timestamp_file) as ts_fh:
                timestamp_text = ts_fh.read().strip()
            latest_time = datetime.datetime.strptime(timestamp_text, DEFAULT_DATETIME_FORMAT)
            latest = self.download_dir
        except FileNotFoundError:
            # No existing download directory. huh.
            pass
        except TypeError:
            logging.warning("Invalid timestamp file found in {}".format(self.download_dir))

        # TODO: Maybe look for old download directories.

        # Now look for 7z files
        candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
        # +3 to allow for ' - '
        leading_length = len(self.slug) + 3
        for path in candidates:
            candidate = os.path.basename(path)
            try:
                logging.debug("Examining '{}' - '{}'".format(candidate, candidate[leading_length:-3]))
                candidate_time = datetime.datetime.strptime(candidate[leading_length:-3], SAFE_DATETIME_FORMAT)
            except ValueError:
                logging.warning("There was an error finding the date in {}. Ignoring.".format(candidate))
                continue
            try:
                if candidate_time > latest_time:
                    latest_time = candidate_time
                    latest = candidate
            except TypeError:
                # latest_time still None: first valid candidate wins.
                latest_time = candidate_time
                latest = candidate
        logging.info("Found last old thing: {} / {}".format(latest, latest_time))
        return (latest, latest_time)

    def download(self, base_dir, compress):
        """ Download all files for a given thing.
            Returns State.OK iff the thing is now downloaded (not iff it downloads the thing!)
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            logging.info("{} - {} already downloaded - skipping.".format(self.thing_id, self.name))
            return State.ALREADY_DOWNLOADED

        if not self._file_links:
            logging.error("{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.name))
            return State.FAILED

        # Have we already downloaded some things?
        renamed_dir = self._handle_outdated_directory(base_dir)

        # Get the list of files to download
        new_file_links = []
        old_file_links = []
        self.time_stamp = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            self.time_stamp = new_file_links[0].last_update

            for file_link in new_file_links:
                self.time_stamp = max(self.time_stamp, file_link.last_update)
            logging.debug("New timestamp will be {}".format(self.time_stamp))
        else:
            self.time_stamp = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    self.time_stamp = max(self.time_stamp, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(self.time_stamp))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        url_suffix = "/?" + ACCESS_QP.format(API_KEY)
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                fl_handle.write("{},{},{}\n".format(fl.link, fl.name, fl.last_update))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(renamed_dir, file_link.name)
            new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                # BUG FIX: FileLink is a dataclass; the original indexed it
                # with file_link["title"], raising TypeError here.
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(file_link.name))
                new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = SESSION.get(file_link.link + url_suffix)
                if data_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(data_req.status_code, sanitise_url(file_link.link), data_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED

                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # People like images.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.join(image_dir, imagelink.name)
                image_req = SESSION.get(imagelink.link)
                if image_req.status_code != 200:
                    # BUG FIX: the original logged file_link.link here - a
                    # leftover variable from the file loop above.
                    logging.error("Unexpected status code {} for {}: {}".format(image_req.status_code, sanitise_url(imagelink.link), image_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED
                with open(truncate_name(filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(imagelink.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # Best get some licenses
        logging.info("writing license file")
        try:
            if self._license:
                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w', encoding="utf-8") as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        logging.info("writing readme")
        try:
            if self._details:
                with open(truncate_name(os.path.join(self.download_dir, 'readme.txt')), 'w', encoding="utf-8") as readme_handle:
                    readme_handle.write("{}\n".format(self._details))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        try:
            # Now write the timestamp
            with open(os.path.join(self.download_dir, TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(str(self.time_stamp))
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.name))
        if not compress:
            return State.OK

        # Optionally pack the whole download into a dated .7z and remove the
        # plain directory.
        thing_dir = "{} - {} - {}".format(self.thing_id,
                                          slugify(self.name),
                                          self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
        file_name = os.path.join(base_dir,
                                 "{}.7z".format(thing_dir))
        logging.debug("Compressing {} to {}".format(
            self.name,
            file_name))
        with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
            archive.writeall(self.download_dir, thing_dir)
        logging.debug("Compression of {} finished.".format(self.name))
        shutil.rmtree(self.download_dir)
        logging.debug("Removed temporary download dir of {}.".format(self.name))
        return State.OK
975060c9 | 708 | |
dbdb1782 | 709 | |
ae598d73 OM |
710 | |
711 | ||
def do_batch(batch_file, download_dir, quick, compress):
    """ Read a file in line by line, parsing each as a set of calls to this script."""
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                # BUG FIX: Thing() requires a ThingLink (it reads .thing_id /
                # .name / .api_link); the original passed the bare id string.
                Thing(ThingLink(command_arr[1], command_arr[1], "")).download(download_dir, compress)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick, compress).download()
                continue
            if command_arr[0] == "user":
                # BUG FIX: this branch previously logged "collection instruction".
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick, compress).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
739 | ||
dbdb1782 | 740 | |
975060c9 OM |
def main():
    """ Entry point for script being run as a command.

    Parses the command line, configures logging, resolves the Thingiverse
    API key, starts the downloader worker processes and dispatches to the
    requested subcommand (collection / thing / user / batch / version).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")
    parser.add_argument("-c", "--compress", action="store_true",
                        help="Compress files")
    parser.add_argument("-a", "--api-key",
                        help="API key for thingiverse")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space seperated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space seperated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space seperated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    # Root logger records everything; individual handlers filter by level.
    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())
    # Attach handlers *before* the API-key resolution below so that its
    # error messages go through the configured handlers (previously they
    # were emitted before any handler was registered).
    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Resolve the Thingiverse API key: the command-line flag wins,
    # otherwise fall back to an 'api.key' file in the current directory.
    global API_KEY
    if args.api_key:
        API_KEY = args.api_key
    else:
        try:
            with open("api.key") as fh:
                API_KEY = fh.read().strip()
        except Exception as e:
            logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
            logging.error("Exception: {}".format(e))
            return

    # Start downloader worker processes fed from a shared joinable queue.
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory, args.compress)
                   for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory,
                       args.quick, args.compress).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick, args.compress).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick, args.compress)

    # Send each downloader a None sentinel to stop it, then wait for the
    # worker processes to exit so shutdown is explicit rather than relying
    # on interpreter teardown.
    for downloader in downloaders:
        thing_queue.put(None)
    for downloader in downloaders:
        downloader.join()
975060c9 | 838 | |
d194b140 | 839 | |
0930777e OM |
if __name__ == "__main__":
    # Required on Windows when the script is packaged as a frozen
    # executable that uses multiprocessing; a no-op everywhere else.
    multiprocessing.freeze_support()
    main()