Commit | Line | Data |
---|---|---|
975060c9 OM |
1 | #!/usr/bin/env python3 |
2 | """ | |
3 | Thingiverse bulk downloader | |
4 | """ | |
5 | ||
6 | import re | |
4a98996b | 7 | import sys |
975060c9 OM |
8 | import os |
9 | import argparse | |
10 | import unicodedata | |
11 | import requests | |
fa2f3251 | 12 | import logging |
6a777954 | 13 | import multiprocessing |
7b84ba6d | 14 | import enum |
fb28c59b | 15 | import datetime |
3c82f75b | 16 | from shutil import copyfile |
b497d705 | 17 | from dataclasses import dataclass |
d194b140 | 18 | import atexit |
9828dabe | 19 | import py7zr |
8ed15058 OM |
20 | import glob |
21 | import shutil | |
975060c9 | 22 | |
# Compression settings for the 7z archives produced when --compress is used.
SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]

# I don't think this is exported by datetime
DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# Windows cannot handle : in filenames
SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'

# Thingiverse REST API endpoint templates; {} placeholders are filled with
# user names, thing/collection ids and the access token as appropriate.
API_BASE="https://api.thingiverse.com"
ACCESS_QP="access_token={}"
PAGE_QP="page={}"
API_USER_DESIGNS = API_BASE + "/users/{}/things/"
API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP

# Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS
API_COLLECTION = API_BASE + "/collections/{}/?" + ACCESS_QP
API_COLLECTION_THINGS = API_BASE + "/collections/{}/things/?" + ACCESS_QP

API_THING_DETAILS = API_BASE + "/things/{}/?" + ACCESS_QP
API_THING_FILES = API_BASE + "/things/{}/files/?" + ACCESS_QP
API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP
# Suffix appended to a file url to get the actual download.
API_THING_DOWNLOAD = "/download/?" + ACCESS_QP

# Populated at runtime (presumably from the -a/--api-key option - confirm in main()).
API_KEY = None

DOWNLOADER_COUNT = 1  # number of parallel Downloader processes
RETRY_COUNT = 3       # NOTE(review): not referenced in this chunk - confirm it is used

# Truncation threshold - Windows paths break beyond ~260 chars.
MAX_PATH_LENGTH = 250

VERSION = "0.10.2"

# Marker file written into each download dir with the last-update timestamp.
TIMESTAMP_FILE = "timestamp.txt"

# Single shared HTTP session, reused for connection pooling.
SESSION = requests.Session()
b497d705 | 57 | |
@dataclass
class ThingLink:
    """ Lightweight reference to a thing as returned by the listing APIs. """
    thing_id: str
    name: str
    api_link: str
b497d705 OM |
63 | |
@dataclass
class FileLink:
    """ A single downloadable file attached to a thing. """
    name: str
    last_update: datetime.datetime  # server-side modification time of the file
    link: str  # download URL (has the access token appended when built)
69 | ||
@dataclass
class ImageLink:
    """ A single image attached to a thing. """
    name: str
    link: str
74 | ||
class FileLinks:
    """ An ordered collection of FileLink objects that also tracks the most
    recent last_update across all of its members.
    """

    def __init__(self, initial_links=None):
        # BUG FIX: the old signature used a mutable default ([]), which is
        # shared between calls. None is the safe "start empty" sentinel.
        self.links = []
        self.last_update = None
        for link in initial_links or []:
            self.append(link)

    def __iter__(self):
        return iter(self.links)

    def __getitem__(self, item):
        return self.links[item]

    def __len__(self):
        return len(self.links)

    def append(self, link):
        """ Add a link, bumping last_update if this link is newer. """
        try:
            self.last_update = max(self.last_update, link.last_update)
        except TypeError:
            # last_update is still None - this is the first link.
            self.last_update = link.last_update
        self.links.append(link)
8ed15058 | 97 | |
b497d705 | 98 | |
class State(enum.Enum):
    """ Result of attempting to download a thing. """
    OK = enum.auto()                  # downloaded successfully this run
    FAILED = enum.auto()              # download attempted but failed
    ALREADY_DOWNLOADED = enum.auto()  # nothing new to fetch
103 | ||
def sanitise_url(url):
    """ Return *url* with any access_token query value masked out,
    so logs never leak the API key.
    """
    return re.sub(r'access_token=\w*', 'access_token=***', url)
110 | ||
def strip_time(date_obj):
    """ Return a copy of *date_obj* with the time-of-day zeroed to 00:00. """
    midnight = datetime.time()
    return datetime.datetime.combine(date_obj.date(), midnight)
115 | ||
def rename_unique(dir_name, target_dir_name):
    """ Move a directory sideways to a new name, ensuring it is unique.

    If target_dir_name already exists, numeric suffixes (_0, _1, ...) are
    tried until a free name is found. Returns the name actually used.
    """
    suffix = 0
    candidate = target_dir_name
    while os.path.exists(candidate):
        candidate = "{}_{}".format(target_dir_name, suffix)
        suffix += 1
    os.rename(dir_name, candidate)
    return candidate
126 | ||
127 | ||
def fail_dir(dir_name):
    """ When a download has failed, move it sideways. """
    failed_name = "{}_failed".format(dir_name)
    return rename_unique(dir_name, failed_name)
65bd8b43 OM |
132 | |
133 | ||
def truncate_name(file_name):
    """ Ensure the filename is not too long for, well windows basically.

    Paths over MAX_PATH_LENGTH are shortened by cutting the tail of the
    base name (keeping the extension), then disambiguated with a numeric
    suffix if the shortened name already exists.
    """
    path = os.path.abspath(file_name)
    if len(path) <= MAX_PATH_LENGTH:
        return path
    base, extension = os.path.splitext(path)
    # BUG FIX: the old code computed to_cut but never applied it, so long
    # paths were returned untruncated. Reserve 3 chars for a "_N" suffix.
    to_cut = len(path) - (MAX_PATH_LENGTH - 3)
    base = base[:-to_cut]
    new_path = "{}{}".format(base, extension)
    inc = 0
    while os.path.exists(new_path):
        # BUG FIX: increment before regenerating, so the first collision
        # does not re-test the identical candidate name.
        new_path = "{}_{}{}".format(base, inc, extension)
        inc += 1
    return new_path
148 | ||
149 | ||
def strip_ws(value):
    """ Replace each run of whitespace in a string with '-'. """
    # BUG FIX: the previous body referenced NO_WHITESPACE_REGEX, which is
    # no longer defined anywhere in the module, so every call raised
    # NameError. Inline the pattern (the re module caches compilations).
    return re.sub(r'\s+', '-', str(value))
975060c9 | 153 | |
dbdb1782 | 154 | |
def slugify(value):
    """
    Normalise a string for use as a filename: NFKC-normalise and lowercase,
    strip surrounding whitespace, drop characters that are invalid on
    common filesystems, and remove any trailing dots.
    """
    logging.debug("Sluggyfying {}".format(value))
    normalised = unicodedata.normalize('NFKC', value).lower().strip()
    normalised = re.sub(r'[\\/<>:\?\*\|"]', '', normalised)
    return re.sub(r'\.*$', '', normalised)
975060c9 | 165 | |
b497d705 | 166 | |
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """

    def __init__(self, thing_queue, download_directory, compress):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory
        self.compress = compress

    def run(self):
        """ Consume thing ids from the queue until a None sentinel arrives. """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                # Sentinel: acknowledge it and stop this worker.
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                return
            logging.info("Handling id {}".format(thing_id))
            Thing(thing_id).download(self.download_directory, self.compress)
            self.thing_queue.task_done()
192 | ||
7b84ba6d | 193 | |
6a777954 OM |
194 | |
195 | ||
dbdb1782 | 196 | |
class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self, quick, compress):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        self.compress = compress
        # These should be set by child classes.
        self.url = None
        self.download_dir = None

    def get(self):
        """ retrieve the things of the grouping.

        Raises ValueError if the subclass never set self.url.
        """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(sanitise_url(self.url)))
        page = 0
        # TODO:: Must be a way to refactor this cleanly
        if self.paginated:
            # Slightly nasty, but afaik python lacks a clean way to do partial string formatting.
            page_url = self.url + "?" + ACCESS_QP + "&" + PAGE_QP
            while True:
                page += 1
                current_url = page_url.format(API_KEY, page)
                logging.info("requesting:{}".format(sanitise_url(current_url)))
                current_req = SESSION.get(current_url)
                if current_req.status_code != 200:
                    logging.error("Got unexpected code {} from url {}: {}".format(
                        current_req.status_code, sanitise_url(current_url), current_req.text))
                    break
                current_json = current_req.json()
                if not current_json:
                    # No more!
                    break
                for thing in current_json:
                    self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
        else:
            # self.url should already have been formatted as we don't need pagination
            logging.info("requesting:{}".format(sanitise_url(self.url)))
            current_req = SESSION.get(self.url)
            if current_req.status_code != 200:
                # BUG FIX: this branch used to reference current_url, which is
                # never bound here, so the error path raised NameError.
                logging.error("Got unexpected code {} from url {}: {}".format(
                    current_req.status_code, sanitise_url(self.url), current_req.text))
            else:
                current_json = current_req.json()
                for thing in current_json:
                    logging.info(thing)
                    self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
        logging.info("Found {} things.".format(len(self.things)))
        # BUG FIX: keep total in step with what was found; it was never
        # updated before, so download() always logged "0 thing(s)".
        self.total = len(self.things)
        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            RC = Thing(thing).download(self.download_dir, self.compress)
            if self.quick and RC == State.ALREADY_DOWNLOADED:
                # In quick mode, assume date ordering: once we hit a thing
                # that is already current, everything older is too.
                logging.info("Caught up, stopping.")
                return
975060c9 | 284 | |
class Collection(Grouping):
    """ Holds details of a collection.

    Resolves the collection id for (user, name) up front; on any failure the
    object is left without url/download_dir so download() raises cleanly.
    """

    def __init__(self, user, name, directory, quick, compress):
        Grouping.__init__(self, quick, compress)
        self.user = user
        self.name = name
        self.paginated = False
        # need to figure out the the ID for the collection
        collection_url = API_USER_COLLECTIONS.format(user, API_KEY)
        try:
            current_req = SESSION.get(collection_url)
        except requests.exceptions.ConnectionError as error:
            # BUG FIX: the old handler referenced self.thing_id, an attribute
            # Collection never defines, so the error path itself raised
            # AttributeError instead of logging.
            logging.error("Unable to connect for collection {} of user {}: {}".format(
                name, user, error))
            return
        if current_req.status_code != 200:
            logging.error("Got unexpected code {} from url {}: {}".format(
                current_req.status_code, sanitise_url(collection_url), current_req.text))
            return
        collection_list = current_req.json()
        try:
            # case insensitive to retain parity with previous behaviour
            collection = [x for x in collection_list if x['name'].casefold() == name.casefold()][0]
        except IndexError:
            logging.error("Unable to find collection {} for user {}".format(name, user))
            return
        self.collection_id = collection['id']
        self.url = API_COLLECTION_THINGS.format(self.collection_id, API_KEY)

        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
3522a3bf | 316 | |
dbdb1782 | 317 | |
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory, quick, compress):
        Grouping.__init__(self, quick, compress)
        self.user = user
        self.paginated = True
        self.url = API_USER_DESIGNS.format(user)
        # Everything for this user lands in "<user> designs" under directory.
        target_name = "{} designs".format(slugify(self.user))
        self.download_dir = os.path.join(directory, target_name)
975060c9 | 328 | |
dbdb1782 | 329 | |
class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_link):
        self.thing_id = thing_link.thing_id
        self.name = thing_link.name
        self.api_link = thing_link.api_link
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.download_dir = None
        self.time_stamp = None
        self._file_links = FileLinks()
        self._image_links = []
        # BUG FIX: initialise these so download() cannot hit AttributeError
        # when the API response omitted the keys or _parse bailed out early.
        self._license = None
        self._details = None

    def _parse(self, base_dir):
        """ Work out what, if anything needs to be done.

        Fetches details, file list and image list from the API, then compares
        against any previous download under base_dir. Leaves _parsed False on
        any fetch failure so download() can abort.
        """
        if self._parsed:
            return

        # First get the broad details
        url = API_THING_DETAILS.format(self.thing_id, API_KEY)
        try:
            current_req = SESSION.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        # Check for DMCA
        if current_req.status_code == 403:
            logging.error("Access to thing {} is forbidden".format(self.thing_id))
            return
        if current_req.status_code != 200:
            logging.error("Got unexpected code {} from url {}: {}".format(
                current_req.status_code, sanitise_url(url), current_req.text))
            return

        thing_json = current_req.json()
        # BUG FIX: .get() leaves the attribute as None when missing; the old
        # try/except KeyError never assigned it at all, breaking the
        # "if self._license:" check later in download().
        self._license = thing_json.get('license')
        if self._license is None:
            logging.warning("No license found for thing {}?".format(self.thing_id))

        # TODO: Get non-html version of this?
        self._details = thing_json.get('details')
        if self._details is None:
            logging.warning("No description found for thing {}?".format(self.thing_id))

        # Now get the file details
        file_url = API_THING_FILES.format(self.thing_id, API_KEY)

        try:
            current_req = SESSION.get(file_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error("Unexpected status code {} for {}: {}".format(
                current_req.status_code, sanitise_url(file_url), current_req.text))
            return

        link_list = current_req.json()

        if not link_list:
            logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(self.thing_id))

        for link in link_list:
            logging.debug("Parsing link: {}".format(sanitise_url(link['url'])))
            try:
                datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
                self._file_links.append(
                    FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(API_KEY)))
            except ValueError:
                logging.error(link['date'])

        # Finally get the image links
        image_url = API_THING_IMAGES.format(self.thing_id, API_KEY)

        try:
            current_req = SESSION.get(image_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error("Unexpected status code {} for {}: {}".format(
                current_req.status_code, sanitise_url(image_url), current_req.text))
            return

        image_list = current_req.json()

        if not image_list:
            logging.warning("No images found for thing {} - probably thingiverse being iffy as this seems unlikely".format(self.thing_id))

        for image in image_list:
            logging.debug("parsing image: {}".format(image))
            name = None
            try:
                name = slugify(image['name'])
                # TODO: fallback to other types
                url = [x for x in image['sizes'] if x['type'] == 'display' and x['size'] == 'large'][0]['url']
            except (KeyError, IndexError):
                # BUG FIX: skip this image rather than falling through and
                # appending an unbound/stale url from a previous iteration.
                # IndexError covers a missing 'display'/'large' size.
                logging.warning("Missing image for {}".format(name))
                continue
            self._image_links.append(ImageLink(name, url))

        self.slug = "{} - {}".format(self.thing_id, slugify(self.name))
        self.download_dir = os.path.join(base_dir, self.slug)

        self._handle_old_directory(base_dir)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.name))
        latest, self.last_time = self._find_last_download(base_dir)

        if not latest:
            # Not yet downloaded
            self._parsed = True
            return

        logging.info("last downloaded version: {}".format(self.last_time))

        # OK, so we have a timestamp, lets see if there is anything new to get
        # First off, are we comparing an old download that threw away the timestamp?
        ignore_time = self.last_time == strip_time(self.last_time)
        try:
            # TODO: Allow for comparison at the exact time
            files_last_update = self._file_links.last_update
            if ignore_time:
                logging.info("Dropping time from comparison stamp as old-style download dir")
                files_last_update = strip_time(files_last_update)

            if files_last_update > self.last_time:
                logging.info(
                    "Found new/updated files {}".format(self._file_links.last_update))
                self._needs_download = True
                self._parsed = True
                return
        except TypeError:
            # last_update was None - the thing has no files at all.
            logging.warning("No files found for {}.".format(self.thing_id))

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def _handle_old_directory(self, base_dir):
        """ Deal with any old directories from previous versions of the code.
        """
        old_dir = os.path.join(base_dir, slugify(self.name))
        if os.path.exists(old_dir):
            logging.warning("Found old style download_dir. Moving.")
            rename_unique(old_dir, self.download_dir)

    def _handle_outdated_directory(self, base_dir):
        """ Move the current download directory sideways if the thing has changed.

        Returns the name the old directory was moved to, or None if there
        was nothing to move.
        """
        if not os.path.exists(self.download_dir):
            # No old directory to move.
            return None
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
        if not os.path.exists(timestamp_file):
            # Old form of download directory
            target_dir_name = "{} - old".format(self.download_dir)
        else:
            target_dir_name = "{} - {}".format(self.download_dir, self.last_time.strftime(SAFE_DATETIME_FORMAT))
        return rename_unique(self.download_dir, target_dir_name)

    def _find_last_download(self, base_dir):
        """ Look for the most recent previous download (if any) of the thing.

        Returns a (location, timestamp) tuple, both None if nothing is found.
        """
        logging.info("Looking for old things")

        # First the DL directory itself.
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)

        latest = None
        latest_time = None

        try:
            logging.debug("Checking for existing download in normal place.")
            with open(timestamp_file) as ts_fh:
                timestamp_text = ts_fh.read().strip()
            latest_time = datetime.datetime.strptime(timestamp_text, DEFAULT_DATETIME_FORMAT)
            latest = self.download_dir
        except FileNotFoundError:
            # No existing download directory. huh.
            pass
        except TypeError:
            logging.warning("Invalid timestamp file found in {}".format(self.download_dir))

        # TODO: Maybe look for old download directories.

        # Now look for 7z files
        candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
        # +3 to allow for ' - '
        leading_length = len(self.slug) + 3
        for path in candidates:
            candidate = os.path.basename(path)
            try:
                logging.debug("Examining '{}' - '{}'".format(candidate, candidate[leading_length:-3]))
                candidate_time = datetime.datetime.strptime(candidate[leading_length:-3], SAFE_DATETIME_FORMAT)
            except ValueError:
                logging.warning("There was an error finding the date in {}. Ignoring.".format(candidate))
                continue
            try:
                if candidate_time > latest_time:
                    latest_time = candidate_time
                    latest = candidate
            except TypeError:
                # latest_time was still None - first candidate wins.
                latest_time = candidate_time
                latest = candidate
        logging.info("Found last old thing: {} / {}".format(latest, latest_time))
        return (latest, latest_time)

    def download(self, base_dir, compress):
        """ Download all files for a given thing.
            Returns a State - OK/ALREADY_DOWNLOADED iff the thing is now
            downloaded (not iff it downloads the thing!)
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            logging.info("{} - {} already downloaded - skipping.".format(self.thing_id, self.name))
            return State.ALREADY_DOWNLOADED

        if not self._file_links:
            logging.error("{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.name))
            return State.FAILED

        # Have we already downloaded some things?
        renamed_dir = self._handle_outdated_directory(base_dir)

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        self.time_stamp = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            self.time_stamp = new_file_links[0].last_update

            for file_link in new_file_links:
                self.time_stamp = max(self.time_stamp, file_link.last_update)
            logging.debug("New timestamp will be {}".format(self.time_stamp))
        else:
            self.time_stamp = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    self.time_stamp = max(self.time_stamp, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(self.time_stamp))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                fl_handle.write("{},{},{}\n".format(fl.link, fl.name, fl.last_update))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        if renamed_dir:
            for file_link in old_file_links:
                try:
                    old_file = os.path.join(renamed_dir, file_link.name)
                    new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
                    logging.debug("Copying {} to {}".format(old_file, new_file))
                    copyfile(old_file, new_file)
                except FileNotFoundError:
                    logging.warning(
                        "Unable to find {} in old archive, redownloading".format(file_link.name))
                    new_file_links.append(file_link)
                except TypeError:
                    # Not altogether sure how this could occur, possibly with some combination of the old file types
                    logging.warning(
                        "Typeerror looking for {} in {}".format(file_link.name, renamed_dir))
                    new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = SESSION.get(file_link.link)
                if data_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(
                        data_req.status_code, sanitise_url(file_link.link), data_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED

                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # People like images.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.join(image_dir, imagelink.name)
                image_req = SESSION.get(imagelink.link)
                if image_req.status_code != 200:
                    # BUG FIX: log the image URL, not file_link.link (which is
                    # whatever the last *file* loop iteration left behind).
                    logging.error("Unexpected status code {} for {}: {}".format(
                        image_req.status_code, sanitise_url(imagelink.link), image_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED
                with open(truncate_name(filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(imagelink.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # Best get some licenses
        logging.info("writing license file")
        try:
            if self._license:
                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w', encoding="utf-8") as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        logging.info("writing readme")
        try:
            if self._details:
                with open(truncate_name(os.path.join(self.download_dir, 'readme.txt')), 'w', encoding="utf-8") as readme_handle:
                    readme_handle.write("{}\n".format(self._details))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        try:
            # Now write the timestamp
            with open(os.path.join(self.download_dir, TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(str(self.time_stamp))
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.name))
        if not compress:
            return State.OK

        # Pack the finished download into a dated 7z archive and remove the
        # temporary directory.
        thing_dir = "{} - {} - {}".format(self.thing_id,
                                          slugify(self.name),
                                          self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
        file_name = os.path.join(base_dir,
                                 "{}.7z".format(thing_dir))
        logging.debug("Compressing {} to {}".format(
            self.name,
            file_name))
        with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
            archive.writeall(self.download_dir, thing_dir)
        logging.debug("Compression of {} finished.".format(self.name))
        shutil.rmtree(self.download_dir)
        logging.debug("Removed temporary download dir of {}.".format(self.name))
        return State.OK
975060c9 | 715 | |
dbdb1782 | 716 | |
def do_batch(batch_file, download_dir, quick, compress):
    """ Read a file in line by line, parsing each as a set of calls to this script."""
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                # BUG FIX: Thing() now takes a ThingLink, not a bare id string
                # (it reads .thing_id/.name/.api_link); synthesise a minimal
                # link with the id standing in for the name.
                Thing(ThingLink(command_arr[1], command_arr[1], "")).download(download_dir, compress)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick, compress).download()
                continue
            if command_arr[0] == "user":
                # BUG FIX: this debug line mislabelled user lines as
                # "collection" instructions (copy-paste error).
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick, compress).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
746 | ||
dbdb1782 | 747 | |
975060c9 OM |
def main():
    """ Entry point for script being run as a command.

    Parses the command line, configures logging, resolves the Thingiverse
    API key, starts the Downloader worker process(es) and then dispatches
    to the chosen subcommand (collection / thing / user / batch / version).
    """
    # --- Command-line definition -------------------------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")
    parser.add_argument("-c", "--compress", action="store_true",
                        help="Compress files")
    parser.add_argument("-a", "--api-key",
                        help="API key for thingiverse")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space seperated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space seperated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space seperated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        # No subcommand given: show usage and exit with an error status.
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        # Default the download target to the current working directory.
        args.directory = os.getcwd()

    # --- Logging configuration ---------------------------------------------
    # Root logger captures everything (DEBUG); per-handler levels filter output.
    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())
    # NOTE(review): `formatter` is only attached to the file handler below, so
    # console output uses logging's default format - confirm that is intended.

    # --- API key resolution: command line first, then an 'api.key' file -----
    global API_KEY
    if args.api_key:
        API_KEY=args.api_key
    else:
        try:
            with open("api.key") as fh:
                API_KEY=fh.read().strip()
        except Exception as e:
            # No usable key: report and bail out before starting any workers.
            # (The console handler is not attached yet at this point, so these
            # errors go through logging's last-resort stderr handler.)
            logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
            logging.error("Exception: {}".format(e))
            return

    logger.addHandler(console_handler)
    if args.log_file:
        # The log file always records at DEBUG regardless of console level.
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)


    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory, args.compress) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()


    # --- Subcommand dispatch ------------------------------------------------
    # Only 'thing' work goes through the worker queue; the other subcommands
    # run synchronously in this process.  (Workers are started even for
    # queue-less subcommands such as 'version'.)
    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick, args.compress).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick, args.compress).download()
    if args.subcommand == "version":
        # VERSION is defined elsewhere in this file.
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick, args.compress)

    # Stop the downloader processes
    # One None sentinel per worker tells each Downloader to shut down.
    # NOTE(review): main() returns without joining the queue or the worker
    # processes - presumably the workers keep the program alive until they
    # drain the queue and see their sentinel; confirm against Downloader.
    for downloader in downloaders:
        thing_queue.put(None)
975060c9 | 845 | |
d194b140 | 846 | |
0930777e OM |
if __name__ == "__main__":
    # Needed so frozen Windows executables can spawn the multiprocessing
    # Downloader workers cleanly; a no-op when running as a normal script.
    multiprocessing.freeze_support()
    main()