Commit | Line | Data |
---|---|---|
975060c9 OM |
1 | #!/usr/bin/env python3 |
2 | """ | |
3 | Thingiverse bulk downloader | |
4 | """ | |
5 | ||
6 | import re | |
4a98996b | 7 | import sys |
975060c9 OM |
8 | import os |
9 | import argparse | |
10 | import unicodedata | |
11 | import requests | |
fa2f3251 | 12 | import logging |
6a777954 | 13 | import multiprocessing |
7b84ba6d | 14 | import enum |
fb28c59b | 15 | import datetime |
3c82f75b | 16 | from shutil import copyfile |
b497d705 | 17 | from dataclasses import dataclass |
9828dabe | 18 | import py7zr |
8ed15058 OM |
19 | import glob |
20 | import shutil | |
975060c9 | 21 | |
# Compression settings for the 7z archives produced when compression is requested.
SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]

# I don't think this is exported by datetime
DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# Windows cannot handle : in filenames
SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'

# Thingiverse REST API endpoints.  The *_QP fragments are query-parameter
# templates that get .format()-ed with the access token / page number.
API_BASE = "https://api.thingiverse.com"
ACCESS_QP = "access_token={}"
PAGE_QP = "page={}"

API_USER_DESIGNS = API_BASE + "/users/{}/things/"
API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP

# Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS
API_COLLECTION = API_BASE + "/collections/{}/?" + ACCESS_QP
API_COLLECTION_THINGS = API_BASE + "/collections/{}/things/?" + ACCESS_QP

API_THING_DETAILS = API_BASE + "/things/{}/?" + ACCESS_QP
API_THING_FILES = API_BASE + "/things/{}/files/?" + ACCESS_QP
API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP
# Suffix appended to a file URL to get the direct download link.
API_THING_DOWNLOAD = "/download/?" + ACCESS_QP

# Filled in at startup (presumably from the command line) - TODO confirm against main().
API_KEY = None

# Number of parallel Downloader worker processes.
DOWNLOADER_COUNT = 1
# NOTE(review): declared but not used in the visible portion of the file.
RETRY_COUNT = 3

# Keep paths under this length - Windows MAX_PATH headroom.
MAX_PATH_LENGTH = 250

VERSION = "0.10.2"

# Per-thing marker file recording the timestamp of the last completed download.
TIMESTAMP_FILE = "timestamp.txt"

# Shared HTTP session so all requests reuse pooled connections.
SESSION = requests.Session()
73695baf | 57 | |
@dataclass
class ThingLink:
    """ Lightweight reference to a thing, as returned by the listing endpoints. """
    # Thingiverse identifier of the thing (kept as str).
    thing_id: str
    # Human-readable name of the thing.
    name: str
    # API URL for the thing's detail endpoint.
    api_link: str
b497d705 | 63 | |
73695baf | 64 | |
@dataclass
class FileLink:
    """ A single downloadable file belonging to a thing. """
    # File name as reported by the API.
    name: str
    # When the file was last modified on thingiverse.
    last_update: datetime.datetime
    # Direct download URL for the file.
    link: str
70 | ||
73695baf | 71 | |
@dataclass
class ImageLink:
    """ A single image belonging to a thing. """
    # Slugified image name (used as the on-disk filename).
    name: str
    # Download URL for the chosen image size.
    link: str
76 | ||
73695baf | 77 | |
class FileLinks:
    """ Ordered container of FileLink objects.

    Tracks the newest ``last_update`` stamp across all appended links so
    callers can cheaply ask "when did anything in this thing last change?".
    """

    def __init__(self, initial_links=None):
        self.links = []
        self.last_update = None
        for initial_link in (initial_links or []):
            self.append(initial_link)

    def __iter__(self):
        return iter(self.links)

    def __getitem__(self, item):
        return self.links[item]

    def __len__(self):
        return len(self.links)

    def append(self, link):
        """ Add a link, keeping ``last_update`` at the newest stamp seen. """
        stamp = link.last_update
        if self.last_update is None:
            self.last_update = stamp
        else:
            try:
                self.last_update = max(self.last_update, stamp)
            except TypeError:
                # The incoming stamp is not comparable (e.g. None) - adopt it,
                # matching the original EAFP behaviour.
                self.last_update = stamp
        self.links.append(link)
8ed15058 | 102 | |
b497d705 | 103 | |
class State(enum.Enum):
    """ Outcome of attempting to download a thing. """
    OK = enum.auto()
    FAILED = enum.auto()
    # Local copy already up to date; nothing was fetched.  Used by the
    # --quick logic to stop once it catches up with previous runs.
    ALREADY_DOWNLOADED = enum.auto()
108 | ||
73695baf | 109 | |
def sanitise_url(url):
    """ Mask the API key in a URL so it is safe to log. """
    return re.sub(r'access_token=\w*', 'access_token=***', url)
116 | ||
73695baf | 117 | |
def strip_time(date_obj):
    """ Return a copy of *date_obj* with the time-of-day zeroed out (midnight). """
    midnight = datetime.time.min
    return datetime.datetime.combine(date_obj.date(), midnight)
122 | ||
73695baf | 123 | |
def rename_unique(dir_name, target_dir_name):
    """ Move *dir_name* sideways to *target_dir_name*, appending ``_0``,
    ``_1``, ... until a name that does not already exist is found.

    Returns the name actually used.
    """
    candidate = target_dir_name
    suffix = 0
    while os.path.exists(candidate):
        candidate = "{}_{}".format(target_dir_name, suffix)
        suffix += 1
    os.rename(dir_name, candidate)
    return candidate
134 | ||
135 | ||
def fail_dir(dir_name):
    """ A download has failed: park the partial directory under a _failed name. """
    failed_name = "{}_failed".format(dir_name)
    return rename_unique(dir_name, failed_name)
65bd8b43 OM |
140 | |
141 | ||
def truncate_name(file_name):
    """ Ensure the returned absolute path fits within MAX_PATH_LENGTH
    (Windows path limit headroom).

    Short paths are returned unchanged.  Over-long paths have their stem
    truncated and a ``_N`` counter appended until an unused name is found.

    Bug fixes vs previous version:
    - the old code never actually shortened the path: it returned
      ``base_0.ext``, which is *longer* than the input, so the function did
      not do what its name and comment promise;
    - the uniqueness loop formatted with the counter *before* incrementing,
      re-testing the identical candidate on its first iteration.
    """
    path = os.path.abspath(file_name)
    if len(path) <= MAX_PATH_LENGTH:
        return path
    base, extension = os.path.splitext(path)
    # Reserve room for the extension plus a "_NN" uniqueness suffix.
    base = base[:MAX_PATH_LENGTH - len(extension) - 3]
    inc = 0
    new_path = "{}_{}{}".format(base, inc, extension)
    while os.path.exists(new_path):
        inc += 1
        new_path = "{}_{}{}".format(base, inc, extension)
    return new_path
155 | ||
156 | ||
def slugify(value):
    """
    Normalise a string for use as a filename: NFKC-normalise, lowercase,
    strip surrounding whitespace, drop characters that are invalid on
    Windows, and remove any trailing dots.
    """
    logging.debug("Sluggyfying {}".format(value))
    normalised = unicodedata.normalize('NFKC', value).lower().strip()
    without_reserved = re.sub(r'[\\/<>:?*|"]', '', normalised)
    return re.sub(r'\.*$', '', without_reserved)
975060c9 | 167 | |
b497d705 | 168 | |
class Downloader(multiprocessing.Process):
    """
    Worker process that pulls work items off a queue and downloads them.

    A queue item may be a thing-id string, a ThingLink, or None (the
    shutdown sentinel).
    """

    def __init__(self, thing_queue, download_directory, compress, api_key):
        multiprocessing.Process.__init__(self)
        self.thing_queue = thing_queue
        self.download_directory = download_directory
        self.compress = compress
        self.api_key = api_key

    def run(self):
        """ actual download loop.
        """
        while True:
            work_item = self.thing_queue.get()
            if work_item is None:
                # Sentinel: acknowledge and stop this worker.
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                break
            if isinstance(work_item, str):
                thing = Thing.from_thing_id(work_item)
            elif isinstance(work_item, ThingLink):
                thing = Thing(work_item)
            else:
                thing = None
            if not thing:
                logging.error("Don't know how to handle thing_id {}".format(work_item))
            else:
                logging.info("Handling id {}".format(work_item))
                thing.download(self.download_directory, self.compress, self.api_key)
            self.thing_queue.task_done()
        return
203 | ||
7b84ba6d | 204 | |
class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self, quick, compress):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        self.compress = compress
        # These should be set by child classes.
        self.url = None
        self.download_dir = None

    def get(self):
        """ Retrieve (and cache) the list of ThingLinks in the grouping.

        BUG FIX: this was decorated ``@property`` while ``download()`` invoked
        it as ``self.get()`` - the property returned a list, so the call
        raised ``TypeError: 'list' object is not callable``.  It is now a
        plain method, matching how the rest of the class uses it.
        """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(sanitise_url(self.url)))

        # self.url should already have been formatted as we don't need pagination
        logging.info("requesting:{}".format(sanitise_url(self.url)))
        current_req = SESSION.get(self.url)
        if current_req.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url),
                                                                current_req.text))
        else:
            current_json = current_req.json()
            for thing in current_json:
                logging.info(thing)
                self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
        logging.info("Found {} things.".format(len(self.things)))
        # BUG FIX: total was never updated, so the "Downloading {} thing(s)"
        # log always reported 0.
        self.total = len(self.things)
        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            # BUG FIX: Thing.download() requires the api_key argument; it was
            # previously omitted, raising TypeError on every call.
            return_code = Thing(thing).download(self.download_dir, self.compress, API_KEY)
            if self.quick and return_code == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return
975060c9 | 276 | |
73695baf | 277 | |
class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick, compress):
        Grouping.__init__(self, quick, compress)
        self.user = user
        self.name = name
        self.paginated = False
        # need to figure out the the ID for the collection
        collection_url = API_USER_COLLECTIONS.format(user, API_KEY)
        try:
            response = SESSION.get(collection_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for collections for user {}: {}".format(
                self.user, error))
            return
        if response.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(response.status_code, sanitise_url(collection_url),
                                                                response.text))
            return
        all_collections = response.json()
        # case insensitive to retain parity with previous behaviour
        wanted = name.casefold()
        matches = [entry for entry in all_collections if entry['name'].casefold() == wanted]
        if not matches:
            logging.error("Unable to find collection {} for user {}".format(name, user))
            return
        self.collection_id = matches[0]['id']
        self.url = API_COLLECTION_THINGS.format(self.collection_id, API_KEY)

        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
3522a3bf | 311 | |
dbdb1782 | 312 | |
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory, quick, compress):
        Grouping.__init__(self, quick, compress)
        self.user = user
        self.url = API_USER_DESIGNS.format(user)
        self.paginated = True
        dir_name = "{} designs".format(slugify(self.user))
        self.download_dir = os.path.join(directory, dir_name)
975060c9 | 323 | |
dbdb1782 | 324 | |
class Thing:
    """ An individual design on thingiverse.

    Lifecycle: construct from a ThingLink (or via from_thing_id), then call
    download(), which lazily calls _parse() to query the API and decide
    whether anything new needs fetching.
    """

    def __init__(self, thing_link):
        self.thing_id = thing_link.thing_id
        self.name = thing_link.name
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.download_dir = None
        self.time_stamp = None
        self._file_links = FileLinks()
        self._image_links = []
        # BUG FIX: these were only assigned inside _parse()'s try blocks; a
        # KeyError there left them unset and download() later crashed with
        # AttributeError instead of skipping gracefully.
        self._license = None
        self._details = None
        self.slug = None

    @classmethod
    def from_thing_id(cls, thing_id):
        """
        Factory method that looks up a thing by ID and creates a Thing object for it
        :param thing_id: to look up
        :return: Thing or None
        """
        return Thing(ThingLink(thing_id, "", ""))

    def _parse(self, base_dir, api_key):
        """ Work out what, if anything needs to be done. """
        if self._parsed:
            return

        # First get the broad details
        url = API_THING_DETAILS.format(self.thing_id, api_key)
        # BUG FIX: was ``logging.error(url)`` - a debug leftover that logged
        # the raw URL, leaking the API key into the logs at ERROR level.
        logging.debug("Requesting url {}".format(sanitise_url(url)))
        try:
            current_req = SESSION.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        # Check for DMCA
        if current_req.status_code == 403:
            logging.error("Access to thing {} is forbidden".format(self.thing_id))
            return
        if current_req.status_code != 200:
            logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(url),
                                                                          current_req.text))
            return

        thing_json = current_req.json()
        try:
            self._license = thing_json['license']
        except KeyError:
            logging.warning("No license found for thing {}?".format(self.thing_id))

        # TODO: Get non-html version of this?
        try:
            self._details = thing_json['details']
        except KeyError:
            logging.warning("No description found for thing {}?".format(self.thing_id))

        if not self.name:
            # Probably generated with factory method.
            try:
                self.name = thing_json['name']
            except KeyError:
                logging.warning("No name found for thing {}?".format(self.thing_id))
                self.name = self.thing_id

        # Now get the file details
        file_url = API_THING_FILES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(file_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error("Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(file_url),
                                                                        current_req.text))
            return

        link_list = current_req.json()

        if not link_list:
            logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(
                self.thing_id))

        for link in link_list:
            logging.debug("Parsing link: {}".format(sanitise_url(link['url'])))
            try:
                datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
                self._file_links.append(
                    FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(api_key)))
            except ValueError:
                logging.error(link['date'])

        # Finally get the image links
        image_url = API_THING_IMAGES.format(self.thing_id, api_key)

        try:
            current_req = SESSION.get(image_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        if current_req.status_code != 200:
            logging.error(
                "Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(image_url),
                                                              current_req.text))
            return

        image_list = current_req.json()

        if not image_list:
            logging.warning(
                "No images found for thing {} - probably thingiverse being iffy as this seems unlikely".format(
                    self.thing_id))

        for image in image_list:
            logging.debug("parsing image: {}".format(image))
            name = None
            try:
                name = slugify(image['name'])
                # TODO: fallback to other types
                url = [x for x in image['sizes'] if x['type'] == 'display' and x['size'] == 'large'][0]['url']
            # BUG FIX: also catch IndexError (no display/large size), and skip
            # the image instead of falling through - previously ``url`` could
            # be unbound (NameError) or stale from the previous iteration.
            except (KeyError, IndexError):
                logging.warning("Missing image for {}".format(name))
                continue
            self._image_links.append(ImageLink(name, url))

        self.slug = "{} - {}".format(self.thing_id, slugify(self.name))
        self.download_dir = os.path.join(base_dir, self.slug)

        self._handle_old_directory(base_dir)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.name))
        latest, self.last_time = self._find_last_download(base_dir)

        if not latest:
            # Not yet downloaded
            self._parsed = True
            return

        logging.info("last downloaded version: {}".format(self.last_time))

        # OK, so we have a timestamp, lets see if there is anything new to get
        # First off, are we comparing an old download that threw away the timestamp?
        ignore_time = self.last_time == strip_time(self.last_time)
        try:
            # TODO: Allow for comparison at the exact time
            files_last_update = self._file_links.last_update
            if ignore_time:
                logging.info("Dropping time from comparison stamp as old-style download dir")
                files_last_update = strip_time(files_last_update)

            if files_last_update > self.last_time:
                logging.info(
                    "Found new/updated files {}".format(self._file_links.last_update))
                self._needs_download = True
                self._parsed = True
                return
        except TypeError:
            logging.warning("No files found for {}.".format(self.thing_id))

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def _handle_old_directory(self, base_dir):
        """ Deal with any old directories from previous versions of the code.
        """
        old_dir = os.path.join(base_dir, slugify(self.name))
        if os.path.exists(old_dir):
            logging.warning("Found old style download_dir. Moving.")
            rename_unique(old_dir, self.download_dir)

    def _handle_outdated_directory(self):
        """ Move the current download directory sideways if the thing has changed.

        Returns the renamed directory (so unchanged files can be copied back
        from it) or None when there was nothing to move.
        """
        if not os.path.exists(self.download_dir):
            # No old directory to move.
            return None
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
        if not os.path.exists(timestamp_file):
            # Old form of download directory
            target_dir_name = "{} - old".format(self.download_dir)
        else:
            target_dir_name = "{} - {}".format(self.download_dir, self.last_time.strftime(SAFE_DATETIME_FORMAT))
        return rename_unique(self.download_dir, target_dir_name)

    def _find_last_download(self, base_dir):
        """ Look for the most recent previous download (if any) of the thing.

        Returns a (location, timestamp) pair, both None when nothing is found.
        """
        logging.info("Looking for old things")

        # First the DL directory itself.
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)

        latest = None
        latest_time = None

        try:
            logging.debug("Checking for existing download in normal place.")
            with open(timestamp_file) as ts_fh:
                timestamp_text = ts_fh.read().strip()
            latest_time = datetime.datetime.strptime(timestamp_text, DEFAULT_DATETIME_FORMAT)
            latest = self.download_dir
        except FileNotFoundError:
            # No existing download directory. huh.
            pass
        except TypeError:
            logging.warning("Invalid timestamp file found in {}".format(self.download_dir))

        # TODO: Maybe look for old download directories.

        # Now look for 7z files
        candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
        # +3 to allow for ' - '
        leading_length = len(self.slug) + 3
        for path in candidates:
            candidate = os.path.basename(path)
            try:
                logging.debug("Examining '{}' - '{}'".format(candidate, candidate[leading_length:-3]))
                candidate_time = datetime.datetime.strptime(candidate[leading_length:-3], SAFE_DATETIME_FORMAT)
            except ValueError:
                logging.warning("There was an error finding the date in {}. Ignoring.".format(candidate))
                continue
            try:
                if candidate_time > latest_time:
                    latest_time = candidate_time
                    latest = candidate
            except TypeError:
                # latest_time is still None - first valid candidate wins.
                latest_time = candidate_time
                latest = candidate
        logging.info("Found last old thing: {} / {}".format(latest, latest_time))
        return latest, latest_time

    def download(self, base_dir, compress, api_key):
        """ Download all files for a given thing.

        Returns a State value: OK when the thing is now downloaded,
        ALREADY_DOWNLOADED when nothing new was needed, FAILED otherwise.
        (The old docstring claimed a boolean return, which was wrong.)
        """
        if not self._parsed:
            self._parse(base_dir, api_key)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            logging.info("{} - {} already downloaded - skipping.".format(self.thing_id, self.name))
            return State.ALREADY_DOWNLOADED

        if not self._file_links:
            logging.error(
                "{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.name))
            return State.FAILED

        # Have we already downloaded some things?
        renamed_dir = self._handle_outdated_directory()

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        self.time_stamp = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            self.time_stamp = new_file_links[0].last_update

            for file_link in new_file_links:
                self.time_stamp = max(self.time_stamp, file_link.last_update)
            logging.debug("New timestamp will be {}".format(self.time_stamp))
        else:
            self.time_stamp = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    self.time_stamp = max(self.time_stamp, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(self.time_stamp))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                fl_handle.write("{},{},{}\n".format(fl.link, fl.name, fl.last_update))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        if renamed_dir:
            for file_link in old_file_links:
                try:
                    old_file = os.path.join(renamed_dir, file_link.name)
                    new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
                    logging.debug("Copying {} to {}".format(old_file, new_file))
                    copyfile(old_file, new_file)
                except FileNotFoundError:
                    logging.warning(
                        "Unable to find {} in old archive, redownloading".format(file_link.name))
                    new_file_links.append(file_link)
                except TypeError:
                    # Not altogether sure how this could occur, possibly with some combination of the old file types
                    logging.warning(
                        "Typeerror looking for {} in {}".format(file_link.name, renamed_dir))
                    new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = SESSION.get(file_link.link)
                if data_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(data_req.status_code,
                                                                                sanitise_url(file_link.link),
                                                                                data_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED

                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # People like images.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.join(image_dir, imagelink.name)
                image_req = SESSION.get(imagelink.link)
                if image_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(image_req.status_code,
                                                                                sanitise_url(imagelink.link),
                                                                                image_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED
                with open(truncate_name(filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(imagelink.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # Best get some licenses
        logging.info("writing license file")
        try:
            if self._license:
                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w',
                          encoding="utf-8") as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        logging.info("writing readme")
        try:
            if self._details:
                with open(truncate_name(os.path.join(self.download_dir, 'readme.txt')), 'w',
                          encoding="utf-8") as readme_handle:
                    readme_handle.write("{}\n".format(self._details))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        try:
            # Now write the timestamp
            with open(os.path.join(self.download_dir, TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(self.time_stamp.__str__())
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.name))
        if not compress:
            return State.OK

        thing_dir = "{} - {} - {}".format(self.thing_id,
                                          slugify(self.name),
                                          self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
        file_name = os.path.join(base_dir,
                                 "{}.7z".format(thing_dir))
        logging.debug("Compressing {} to {}".format(
            self.name,
            file_name))
        with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
            archive.writeall(self.download_dir, thing_dir)
        logging.debug("Compression of {} finished.".format(self.name))
        shutil.rmtree(self.download_dir)
        logging.debug("Removed temporary download dir of {}.".format(self.name))
        return State.OK
975060c9 | 731 | |
dbdb1782 | 732 | |
def do_batch(batch_file, download_dir, quick, compress):
    """ Read a file in line by line, parsing each as a set of calls to this script.

    Recognised instructions (one per line, whitespace separated):
        thing <thing_id>
        collection <owner> <collection_name>
        user <user_name>
    Blank lines are skipped; unrecognised or truncated instructions are
    logged as warnings and skipped rather than aborting the whole batch.

    :param batch_file: path to the instruction file to read.
    :param download_dir: directory that downloads are placed under.
    :param quick: assume date ordering on posts (passed through to Collection/Designs).
    :param compress: compress completed downloads (passed through).
    """
    # Minimum token count for each instruction, including the keyword itself.
    required_tokens = {"thing": 2, "collection": 3, "user": 2}
    with open(batch_file, encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            command = command_arr[0]
            # Guard against truncated instructions (e.g. a bare "thing") which
            # would otherwise crash the batch with an IndexError below.
            if command in required_tokens and len(command_arr) < required_tokens[command]:
                logging.warning("Instruction is missing arguments. Skipping.")
                continue
            if command == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing.from_thing_id(command_arr[1]).download(download_dir, compress)
                continue
            if command == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick, compress).download()
                continue
            if command == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick, compress).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
760 | ||
dbdb1782 | 761 | |
975060c9 OM |
762 | def main(): |
763 | """ Entry point for script being run as a command. """ | |
764 | parser = argparse.ArgumentParser() | |
dbdb1782 | 765 | parser.add_argument("-l", "--log-level", choices=[ |
73695baf | 766 | 'debug', 'info', 'warning'], default='info', help="level of logging desired") |
dbdb1782 OM |
767 | parser.add_argument("-d", "--directory", |
768 | help="Target directory to download into") | |
4f94efc8 OM |
769 | parser.add_argument("-f", "--log-file", |
770 | help="Place to log debug information to") | |
7b84ba6d OM |
771 | parser.add_argument("-q", "--quick", action="store_true", |
772 | help="Assume date ordering on posts") | |
ae598d73 OM |
773 | parser.add_argument("-c", "--compress", action="store_true", |
774 | help="Compress files") | |
e45ba963 OM |
775 | parser.add_argument("-a", "--api-key", |
776 | help="API key for thingiverse") | |
7b84ba6d | 777 | |
dbdb1782 OM |
778 | subparsers = parser.add_subparsers( |
779 | help="Type of thing to download", dest="subcommand") | |
780 | collection_parser = subparsers.add_parser( | |
b7bfef68 | 781 | 'collection', help="Download one or more entire collection(s)") |
dbdb1782 | 782 | collection_parser.add_argument( |
b7bfef68 | 783 | "owner", help="The owner of the collection(s) to get") |
dbdb1782 | 784 | collection_parser.add_argument( |
73695baf | 785 | "collections", nargs="+", help="Space seperated list of the name(s) of collection to get") |
dbdb1782 OM |
786 | thing_parser = subparsers.add_parser( |
787 | 'thing', help="Download a single thing.") | |
8cdd1b54 OM |
788 | thing_parser.add_argument( |
789 | "things", nargs="*", help="Space seperated list of thing ID(s) to download") | |
dbdb1782 | 790 | user_parser = subparsers.add_parser( |
73695baf | 791 | "user", help="Download all things by one or more users") |
8cdd1b54 OM |
792 | user_parser.add_argument( |
793 | "users", nargs="+", help="A space seperated list of the user(s) to get the designs of") | |
dbdb1782 OM |
794 | batch_parser = subparsers.add_parser( |
795 | "batch", help="Perform multiple actions written in a text file") | |
796 | batch_parser.add_argument( | |
797 | "batch_file", help="The name of the file to read.") | |
680039fe | 798 | subparsers.add_parser("version", help="Show the current version") |
4a98996b | 799 | |
975060c9 | 800 | args = parser.parse_args() |
4a98996b OM |
801 | if not args.subcommand: |
802 | parser.print_help() | |
803 | sys.exit(1) | |
d66f1f78 OM |
804 | if not args.directory: |
805 | args.directory = os.getcwd() | |
4f94efc8 OM |
806 | |
807 | logger = logging.getLogger() | |
808 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |
809 | logger.setLevel(logging.DEBUG) | |
810 | console_handler = logging.StreamHandler() | |
811 | console_handler.setLevel(args.log_level.upper()) | |
812 | ||
e45ba963 OM |
813 | global API_KEY |
814 | if args.api_key: | |
73695baf | 815 | API_KEY = args.api_key |
e45ba963 OM |
816 | else: |
817 | try: | |
818 | with open("api.key") as fh: | |
73695baf | 819 | API_KEY = fh.read().strip() |
e45ba963 OM |
820 | except Exception as e: |
821 | logging.error("Either specify the api-key on the command line or in a file called 'api.key'") | |
822 | logging.error("Exception: {}".format(e)) | |
823 | return | |
824 | ||
4f94efc8 OM |
825 | logger.addHandler(console_handler) |
826 | if args.log_file: | |
827 | file_handler = logging.FileHandler(args.log_file) | |
828 | file_handler.setLevel(logging.DEBUG) | |
829 | file_handler.setFormatter(formatter) | |
830 | logger.addHandler(file_handler) | |
fa2f3251 | 831 | |
6a777954 OM |
832 | # Start downloader |
833 | thing_queue = multiprocessing.JoinableQueue() | |
834 | logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT)) | |
eb4e5a3f | 835 | downloaders = [Downloader(thing_queue, args.directory, args.compress, API_KEY) for _ in range(DOWNLOADER_COUNT)] |
6a777954 OM |
836 | for downloader in downloaders: |
837 | downloader.start() | |
838 | ||
4a98996b | 839 | if args.subcommand.startswith("collection"): |
b7bfef68 | 840 | for collection in args.collections: |
ae598d73 | 841 | Collection(args.owner, collection, args.directory, args.quick, args.compress).download() |
4a98996b | 842 | if args.subcommand == "thing": |
b7bfef68 | 843 | for thing in args.things: |
6a777954 | 844 | thing_queue.put(thing) |
3522a3bf | 845 | if args.subcommand == "user": |
b7bfef68 | 846 | for user in args.users: |
ae598d73 | 847 | Designs(user, args.directory, args.quick, args.compress).download() |
db8066ec OM |
848 | if args.subcommand == "version": |
849 | print("thingy_grabber.py version {}".format(VERSION)) | |
1ab49020 | 850 | if args.subcommand == "batch": |
ae598d73 | 851 | do_batch(args.batch_file, args.directory, args.quick, args.compress) |
1ab49020 | 852 | |
6a777954 | 853 | # Stop the downloader processes |
73695baf | 854 | for _ in downloaders: |
6a777954 | 855 | thing_queue.put(None) |
975060c9 | 856 | |
d194b140 | 857 | |
if __name__ == "__main__":
    # Required so multiprocessing works when the script is packaged as a
    # frozen Windows executable; a no-op otherwise.
    multiprocessing.freeze_support()
    main()