Use safe formatting for timestamps in filenames
[clinton/thingy_grabber.git] / thingy_grabber.py
CommitLineData
975060c9
OM
1#!/usr/bin/env python3
2"""
3Thingiverse bulk downloader
4"""
5
6import re
4a98996b 7import sys
975060c9
OM
8import os
9import argparse
10import unicodedata
11import requests
fa2f3251 12import logging
6a777954 13import multiprocessing
7b84ba6d 14import enum
fb28c59b 15import datetime
3c82f75b 16from shutil import copyfile
975060c9 17from bs4 import BeautifulSoup
b497d705
OM
18from dataclasses import dataclass
19import selenium
20from selenium import webdriver
21from selenium.webdriver.common.by import By
22from selenium.webdriver.support.ui import WebDriverWait
23from selenium.webdriver.support import expected_conditions as EC
24from selenium.webdriver.firefox.options import Options
d194b140 25import atexit
9828dabe 26import py7zr
8ed15058
OM
27import glob
28import shutil
975060c9 29
ae598d73
OM
# Compression settings used when archiving a finished download with py7zr.
SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]

# I don't think this is exported by datetime
DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# Windows cannot handle : in filenames
SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

# Regexes that pick paging metadata out of the collection AJAX responses.
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')

# Collapses runs of whitespace/dashes into a single dash (see strip_ws).
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# How many Downloader worker processes to start.
DOWNLOADER_COUNT = 1
# NOTE(review): RETRY_COUNT is defined but not used anywhere in this file.
RETRY_COUNT = 3

# Windows cannot reliably handle paths longer than ~260 chars; stay under it.
MAX_PATH_LENGTH = 250

VERSION = "0.9.0"

# Name of the per-thing marker file recording the last download time.
TIMESTAMP_FILE = "timestamp.txt"

#BROWSER = webdriver.PhantomJS('./phantomjs')
# Module-level headless Firefox shared by all page parsing; it is closed via
# the atexit hook registered at the bottom of this file.
options = Options()
options.add_argument("--headless")
BROWSER = webdriver.Firefox(options=options)

BROWSER.set_window_size(1980, 1080)
63
64
@dataclass
class FileLink:
    # A single downloadable file attached to a thing.
    name: str                        # file name as shown on thingiverse
    last_update: datetime.datetime   # last-modified time reported by the site
    link: str                        # download URL (may redirect)
70
class FileLinks:
    """ An ordered collection of FileLink objects.

    Tracks ``last_update`` - the newest ``last_update`` of any appended
    link, or None while the collection is empty.
    """

    def __init__(self, initial_links=None):
        # Fix: the original default was a mutable list ([]), which python
        # shares between every call to __init__.  None is the safe sentinel.
        self.links = []
        self.last_update = None
        for link in initial_links or []:
            self.append(link)

    def __iter__(self):
        return iter(self.links)

    def __getitem__(self, item):
        return self.links[item]

    def __len__(self):
        return len(self.links)

    def append(self, link):
        """ Add a link, keeping last_update at the newest timestamp seen. """
        try:
            self.last_update = max(self.last_update, link.last_update)
        except TypeError:
            # First link appended: max(None, ts) raises TypeError.
            self.last_update = link.last_update
        self.links.append(link)
8ed15058 93
b497d705 94
7b84ba6d
OM
class State(enum.Enum):
    """ Outcome of an attempt to download a thing. """
    OK = enum.auto()                  # downloaded (and optionally compressed)
    FAILED = enum.auto()              # something went wrong; dir moved aside
    ALREADY_DOWNLOADED = enum.auto()  # nothing new since the last download
99
8ed15058
OM
def rename_unique(dir_name, target_dir_name):
    """ Move a directory sideways to a new name, ensuring it is unique.

    If *target_dir_name* already exists, suffixes _0, _1, ... are tried
    until a free name is found.  Returns the name actually used.
    """
    candidate = target_dir_name
    suffix = 0
    # Probe for a name that is not already taken.
    while os.path.exists(candidate):
        candidate = "{}_{}".format(target_dir_name, suffix)
        suffix += 1
    os.rename(dir_name, candidate)
    return candidate
110
111
def fail_dir(dir_name):
    """ When a download has failed, move it sideways. """
    failed_name = "{}_failed".format(dir_name)
    return rename_unique(dir_name, failed_name)
65bd8b43
OM
116
117
def truncate_name(file_name):
    """ Ensure the filename is not too long for, well windows basically.

    Returns the absolute path unchanged when it fits in MAX_PATH_LENGTH;
    otherwise cuts the base name down and appends a numeric suffix,
    probing to avoid colliding with an existing file.
    """
    path = os.path.abspath(file_name)
    if len(path) <= MAX_PATH_LENGTH:
        return path
    base, extension = os.path.splitext(path)
    # Fix: the original computed how much to cut but never applied it, so
    # over-long paths were returned untruncated.  Leave 3 chars of slack
    # for the "_<n>" uniqueness suffix.
    to_cut = len(path) - (MAX_PATH_LENGTH - 3)
    base = base[:-to_cut]
    inc = 0
    new_path = "{}_{}{}".format(base, inc, extension)
    while os.path.exists(new_path):
        # Fix: increment before recomputing - the original retested the
        # same name on the first pass through the loop.
        inc += 1
        new_path = "{}_{}{}".format(base, inc, extension)
    return new_path
132
133
dd8c35f4
OM
def strip_ws(value):
    """ Remove whitespace from a string """
    collapsed = NO_WHITESPACE_REGEX.sub('-', value)
    return str(collapsed)
975060c9 137
dbdb1782 138
975060c9
OM
def slugify(value):
    """
    Normalise a string, remove characters that are invalid in file
    names and convert it to lowercase.
    """
    normalised = unicodedata.normalize('NFKC', value).lower().strip()
    # Drop characters windows refuses in file names, then trailing dots.
    without_reserved = re.sub(r'[\\/<>:\?\*\|"]', '', normalised)
    return re.sub(r'\.*$', '', without_reserved)
975060c9 148
b497d705
OM
class PageChecker(object):
    """ Selenium wait-predicate that decides when a thing page has loaded.

    An instance is handed to WebDriverWait.until(); selenium calls it
    repeatedly and it returns True only once the title, the complete list
    of file rows, the images and the license text are all present in the
    DOM.  Progress notes accumulate in ``self.log`` for debugging.
    """
    def __init__(self):
        self.log = []           # debug trail of what each poll saw
        self.title = None       # thing title, once found
        self.file_count = None  # number of files the page claims to offer
        self.files = None       # file row elements, once all are rendered
        self.images = None      # thumbnail elements
        self.license = None     # license text


    def __call__(self, _):
        # Called with the driver (ignored - the global BROWSER is used).
        # Any exception while the page is mid-render means "not ready yet".
        try:
            self.log.append("call")
            if self.title is None:
                # first find the name
                name = EC._find_element(BROWSER, (By.CSS_SELECTOR, "[class^=ThingPage__modelName]"))
                if name is None:
                    return False
                self.title = name.text

            if self.file_count is None:
                # OK. Do we know how many files we have to download?
                metrics = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=MetricButton]"))
                self.log.append("got some metrics: {}".format(len(metrics)))
                cur_count = int([x.text.split("\n")[0] for x in metrics if x.text.endswith("\nThing Files")][0])
                self.log.append(cur_count)
                if cur_count == 0:
                    return False
                self.file_count = cur_count

            self.log.append("looking for {} files".format(self.file_count))
            fileRows = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=ThingFile__fileRow]"))
            self.log.append("found {} files".format(len(fileRows)))
            # Keep waiting until every advertised file row has rendered.
            if len(fileRows) < self.file_count:
                return False

            self.log.append("Looking for images")
            # By this point _should_ have loaded all the images
            self.images = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=thumb]"))
            self.license = EC._find_element(BROWSER, (By.CSS_SELECTOR, "[class^=License__licenseText]")).text
            self.log.append("found {} images".format(len(self.images)))
            self.files = fileRows
            return True
        except Exception:
            # NOTE(review): broad catch is deliberate - selenium raises many
            # exception types while elements are still appearing.
            return False
194
195
196
197
6a777954
OM
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.

    Worker process: pulls thing ids from a JoinableQueue and downloads
    each one; a None on the queue is the shutdown sentinel.
    """

    def __init__(self, thing_queue, download_directory, compress):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue              # JoinableQueue of thing ids (None = stop)
        self.download_directory = download_directory
        self.compress = compress                    # whether to 7z finished downloads

    def run(self):
        """ actual download loop.
        """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                # Sentinel received: acknowledge it and exit the worker.
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                break
            logging.info("Handling id {}".format(thing_id))
            Thing(thing_id).download(self.download_directory, self.compress)
            # Mark the queue item done so joiners are not blocked forever.
            self.thing_queue.task_done()
        return
223
7b84ba6d 224
6a777954
OM
225
226
dbdb1782 227
class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self, quick, compress):
        self.things = []       # thing ids belonging to this grouping
        self.total = 0         # total count reported by thingiverse
        self.req_id = None     # internal grouping id used by the AJAX endpoint
        self.last_page = 0     # number of AJAX pages to fetch
        self.per_page = None   # items per AJAX page (site-controlled)
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        self.compress = compress
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        # Groupings of fewer than ~13 things are embedded directly in the
        # HTML rather than paged through the AJAX endpoint.
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        # Scrape the paging metadata out of the embedded JSON fragments.
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        # Walk every page of the AJAX listing, collecting thing ids.
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        # NOTE(review): base_dir is assigned but never used below.
        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            RC = Thing(thing).download(self.download_dir, self.compress)
            # In quick mode, assume newest-first ordering: the first thing
            # that is already current means everything after it is too.
            if self.quick and RC==State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return
975060c9 316
dbdb1782 317
ae598d73
OM
318
319
320
3522a3bf
OM
class Collection(Grouping):
    """ Holds details of a collection.

    A single user-curated collection of things on thingiverse.
    """

    def __init__(self, user, name, directory, quick, compress):
        super().__init__(quick, compress)
        self.user = user
        self.name = name
        # Collection pages live at /<user>/collections/<name-with-dashes>.
        self.url = "{}/{}/collections/{}".format(URL_BASE, user, strip_ws(name))
        dir_name = "{}-{}".format(slugify(user), slugify(name))
        self.download_dir = os.path.join(directory, dir_name)
        self.collection_url = URL_COLLECTION
3522a3bf 333
dbdb1782 334
3522a3bf
OM
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory, quick, compress):
        super().__init__(quick, compress)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, user)
        # Everything a user has published goes into "<user> designs".
        target = "{} designs".format(slugify(user))
        self.download_dir = os.path.join(directory, target)
        self.collection_url = USER_COLLECTION
975060c9 345
dbdb1782 346
3c82f75b
OM
class Thing:
    """ An individual design on thingiverse.

    Lifecycle: construct with a thing id, then call download(), which
    lazily calls _parse() to scrape the page (via the global headless
    BROWSER) and work out what, if anything, has changed since the last
    download.
    """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None          # timestamp of the previous download, if any
        self._parsed = False           # set True only when _parse() succeeds
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None
        self.time_stamp = None         # timestamp to record for *this* download
        self._file_links = FileLinks()

    def _parse(self, base_dir):
        """ Work out what, if anything needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            BROWSER.get(url)
            wait = WebDriverWait(BROWSER, 60)
            pc = PageChecker()
            wait.until(pc)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        except selenium.common.exceptions.TimeoutException:
            logging.error(pc.log)
            logging.error("Timeout trying to parse thing {}".format(self.thing_id))
            return

        self.title = pc.title
        if not pc.files:
            logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(self.thing_id))
            # Fix: bail out here - the loop below would raise TypeError
            # iterating over None.  Leaving _parsed False makes download()
            # report the failure.
            return
        for link in pc.files:
            logging.debug("Parsing link: {}".format(link.text))
            link_link = link.find_element_by_xpath(".//a").get_attribute("href")
            if link_link.endswith("/zip"):
                # bulk link.
                continue
            try:
                link_title, link_details, _ = link.text.split("\n")
            except ValueError:
                # If it is a filetype that doesn't generate a picture,
                # then we get an extra field at the start.
                _, link_title, link_details, _ = link.text.split("\n")

            # link_details will be something like
            # '461 kb | Updated 06-11-2019 | 373 Downloads'
            # need to convert from M D Y to Y M D
            link_date = [int(x) for x in link_details.split("|")[1].split()[-1].split("-")]
            try:
                self._file_links.append(FileLink(link_title, datetime.datetime(link_date[2], link_date[0], link_date[1]), link_link))
            except ValueError:
                logging.error(link_date)

        self._image_links = [x.find_element_by_xpath(".//img").get_attribute("src") for x in pc.images]
        self._license = pc.license
        self.pc = pc

        self.slug = "{} - {}".format(self.thing_id, slugify(self.title))
        self.download_dir = os.path.join(base_dir, self.slug)

        self._handle_old_directory(base_dir)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))
        latest, self.last_time = self._find_last_download(base_dir)

        if not latest:
            # Not yet downloaded
            self._parsed = True
            return

        logging.info("last downloaded version: {}".format(self.last_time))

        # OK, so we have a timestamp, lets see if there is anything new to get
        try:
            if self._file_links.last_update > self.last_time:
                logging.info(
                    "Found new/updated files {}".format(self._file_links.last_update))
                self._needs_download = True
                self._parsed = True
                return
        except TypeError:
            # last_update is None when no file links were parsed.
            logging.warning("No files found for {}.".format(self.thing_id))

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def _handle_old_directory(self, base_dir):
        """ Deal with any old directories from previous versions of the code.
        """
        # Early versions used just the slugified title (no thing id prefix).
        old_dir = os.path.join(base_dir, slugify(self.title))
        if os.path.exists(old_dir):
            logging.warning("Found old style download_dir. Moving.")
            rename_unique(old_dir, self.download_dir)

    def _handle_outdated_directory(self, base_dir):
        """ Move the current download directory sideways if the thing has changed.

        Returns the name the old directory was moved to, or None if there
        was nothing to move.
        """
        if not os.path.exists(self.download_dir):
            # No old directory to move.
            return None
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
        if not os.path.exists(timestamp_file):
            # Old form of download directory
            target_dir_name = "{} - old".format(self.download_dir)
        else:
            target_dir_name = "{} - {}".format(self.download_dir, self.last_time.strftime(SAFE_DATETIME_FORMAT))
        return rename_unique(self.download_dir, target_dir_name)

    def _find_last_download(self, base_dir):
        """ Look for the most recent previous download (if any) of the thing.

        Returns (location, timestamp) - location is the download directory
        or a 7z archive basename, or None if never downloaded.
        """
        logging.info("Looking for old things")

        # First the DL directory itself.
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)

        latest = None
        latest_time = None

        try:
            logging.debug("Checking for existing download in normal place.")
            with open(timestamp_file) as ts_fh:
                timestamp_text = ts_fh.read().strip()
                latest_time = datetime.datetime.strptime(timestamp_text, DEFAULT_DATETIME_FORMAT)
                latest = self.download_dir
        except FileNotFoundError:
            # No existing download directory. huh.
            pass
        except (TypeError, ValueError):
            # Fix: strptime raises ValueError (not TypeError) on garbage
            # content; catch both so a corrupt file cannot crash the scan.
            logging.warning("Invalid timestamp file found in {}".format(self.download_dir))

        # TODO: Maybe look for old download directories.

        # Now look for 7z files
        candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
        # +3 to allow for ' - '
        leading_length = len(self.slug) + 3
        for path in candidates:
            candidate = os.path.basename(path)
            try:
                # [-3:] strips the ".7z" extension.
                logging.debug("Examining '{}' - '{}'".format(candidate, candidate[leading_length:-3]))
                candidate_time = datetime.datetime.strptime(candidate[leading_length:-3], SAFE_DATETIME_FORMAT)
            except ValueError:
                logging.warning("There was an error finding the date in {}. Ignoring.".format(candidate))
                continue
            try:
                if candidate_time > latest_time:
                    latest_time = candidate_time
                    latest = candidate
            except TypeError:
                # latest_time was still None - first candidate wins.
                latest_time = candidate_time
                latest = candidate
        logging.info("Found last old thing: {} / {}".format(latest, latest_time))
        return (latest, latest_time)

    def download(self, base_dir, compress):
        """ Download all files for a given thing.

        Returns a State: OK on success, ALREADY_DOWNLOADED if nothing was
        new, FAILED on any error (in which case the partial directory is
        moved aside via fail_dir).
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            print("{} - {} already downloaded - skipping.".format(self.thing_id, self.title))
            return State.ALREADY_DOWNLOADED

        if not self._file_links:
            print("{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.title))
            return State.FAILED

        # Have we already downloaded some things?
        renamed_dir = self._handle_outdated_directory(base_dir)

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        self.time_stamp = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            self.time_stamp = new_file_links[0].last_update

            for file_link in new_file_links:
                self.time_stamp = max(self.time_stamp, file_link.last_update)
            logging.debug("New timestamp will be {}".format(self.time_stamp))
        else:
            # Split into files changed since last time (download) and
            # unchanged ones (copy across from the renamed old directory).
            self.time_stamp = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    self.time_stamp = max(self.time_stamp, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(self.time_stamp))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                base_link = fl.link
                try:
                    # Resolve the redirect once so the manifest records the
                    # real URL.
                    fl.link = requests.get(fl.link, allow_redirects=False).headers['location']
                except Exception:
                    # Sometimes Thingiverse just gives us the direct link
                    # the first time. Not sure why.
                    pass

                fl_handle.write("{},{},{}, {}\n".format(fl.link, fl.name, fl.last_update, base_link))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(renamed_dir, file_link.name)
            new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                # Fix: FileLink is a dataclass, so the original
                # file_link["title"] raised TypeError here.
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(file_link.name))
                new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = requests.get(file_link.link)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # People like images. But this doesn't work yet.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.basename(imagelink)
                if filename.endswith('stl'):
                    # stl previews are rendered server-side as pngs.
                    filename = "{}.png".format(filename)
                image_req = requests.get(imagelink)
                with open(truncate_name(os.path.join(image_dir, filename)), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(filename, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        """
        # instructions are good too.
        logging.info("Downloading readme")
        try:
            readme_txt = soup.find('meta', property='og:description')[
                'content']
            with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        """
        # Best get some licenses
        logging.info("Downloading license")
        try:
            if self._license:
                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w', encoding="utf-8") as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        try:
            # Now write the timestamp
            with open(os.path.join(self.download_dir, TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(self.time_stamp.__str__())
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
        if not compress:
            return State.OK

        # Archive the finished directory as "<id> - <slug> - <ts>.7z" and
        # remove the uncompressed copy.
        thing_dir = "{} - {} - {}".format(self.thing_id,
                                          slugify(self.title),
                                          self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
        file_name = os.path.join(base_dir,
                                 "{}.7z".format(thing_dir))
        logging.debug("Compressing {} to {}".format(
            self.title,
            file_name))
        with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
            archive.writeall(self.download_dir, thing_dir)
        logging.debug("Compression of {} finished.".format(self.title))
        shutil.rmtree(self.download_dir)
        logging.debug("Removed temporary download dir of {}.".format(self.title))
        return State.OK
975060c9 675
dbdb1782 676
ae598d73
OM
677
678
def do_batch(batch_file, download_dir, quick, compress):
    """ Read a file in line by line, parsing each as a set of calls to this script."""
    with open(batch_file) as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            command = command_arr[0]
            if command == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir, compress)
            elif command == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick, compress).download()
            elif command == "user":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick, compress).download()
            else:
                logging.warning("Unable to parse current instruction. Skipping.")
706
dbdb1782 707
975060c9
OM
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")
    parser.add_argument("-c", "--compress", action="store_true",
                        help="Compress files")

    # One subcommand per kind of download target.
    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space seperated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space seperated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space seperated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    # Root logger at DEBUG; each handler filters to its own level so the
    # console honours --log-level while the file (if any) gets everything.
    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    # NOTE(review): the console handler is never given `formatter`, so
    # console output uses the bare default format - confirm intended.
    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory, args.compress) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    # Only the "thing" subcommand feeds the worker queue; the others
    # download synchronously in this process.
    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick, args.compress).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick, args.compress).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick, args.compress)

    # Stop the downloader processes
    for downloader in downloaders:
        thing_queue.put(None)
975060c9 791
d194b140
OM
# Make sure the headless browser is shut down however the script exits.
atexit.register(BROWSER.quit)

if __name__ == "__main__":
    # freeze_support() is a no-op except in frozen windows executables,
    # where multiprocessing needs it.
    multiprocessing.freeze_support()
    main()