Add custom FileLinks class, 7z writing initial implementation
[clinton/thingy_grabber.git] / thingy_grabber.py
#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
import multiprocessing
import enum
import datetime
from shutil import copyfile
from bs4 import BeautifulSoup
from dataclasses import dataclass
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
import atexit
import py7zr

SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes it would break the paging logic below.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
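# These regexes pull values out of the JSON blob embedded in grouping pages;
# the fragment looks roughly like (hypothetical values):
#   '..."id":1234,..."total":38,..."last_page":4,..."per_page":12,...'
# so e.g. TOTAL_REGEX.search(text).groups()[0] == '38'.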
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

DOWNLOADER_COUNT = 1
RETRY_COUNT = 3

MAX_PATH_LENGTH = 250

VERSION = "0.9.0"


# BROWSER = webdriver.PhantomJS('./phantomjs')
options = Options()
options.add_argument("--headless")
BROWSER = webdriver.Firefox(options=options)

BROWSER.set_window_size(1980, 1080)

@dataclass
class FileLink:
    name: str
    last_update: datetime.datetime
    link: str


class FileLinks:
    """ A list of FileLinks that tracks the most recent last_update seen. """

    def __init__(self, initial_links=None):
        self.links = []
        self.last_update = None
        for link in initial_links or []:
            self.append(link)

    def __iter__(self):
        return iter(self.links)

    def __getitem__(self, item):
        return self.links[item]

    def __len__(self):
        return len(self.links)

    def append(self, link):
        try:
            self.last_update = max(self.last_update, link.last_update)
        except TypeError:
            # last_update was still None; the first appended link wins.
            self.last_update = link.last_update
        self.links.append(link)
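# A minimal usage sketch (hypothetical dates). FileLinks keeps .last_update
# pointing at the newest file so Thing._parse() below can compare it against
# the saved timestamp:
#   links = FileLinks()
#   links.append(FileLink("part.stl", datetime.datetime(2019, 6, 11), "https://..."))
#   links.append(FileLink("lid.stl", datetime.datetime(2020, 1, 2), "https://..."))
#   assert links.last_update == datetime.datetime(2020, 1, 2)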


class State(enum.Enum):
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()


def fail_dir(dir_name):
    """ When a download has failed, move the directory sideways so a retry
    can start fresh. """
    target_dir = "{}_failed".format(dir_name)
    inc = 0
    while os.path.exists(target_dir):
        target_dir = "{}_failed_{}".format(dir_name, inc)
        inc += 1
    os.rename(dir_name, target_dir)


def truncate_name(file_name):
    """ Ensure the file path is not too long for, well, Windows basically. """
    path = os.path.abspath(file_name)
    if len(path) <= MAX_PATH_LENGTH:
        return path
    base, extension = os.path.splitext(path)
    # Cut the base down so base + "_<n>" + extension fits within MAX_PATH_LENGTH.
    base = base[:MAX_PATH_LENGTH - (len(extension) + 3)]
    inc = 0
    new_path = "{}_{}{}".format(base, inc, extension)
    while os.path.exists(new_path):
        inc += 1
        new_path = "{}_{}{}".format(base, inc, extension)
    return new_path
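# e.g. a 300-character "...foo.stl" path with MAX_PATH_LENGTH = 250 comes back
# as the first 243 characters of the base plus "_0.stl" (illustrative; "_1",
# "_2", ... are tried until the name is free on disk).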


def strip_ws(value):
    """ Collapse runs of whitespace (and hyphens) into a single '-' """
    return str(NO_WHITESPACE_REGEX.sub('-', value))


def slugify(value):
    """
    Normalise the string, remove characters that are invalid in filenames,
    and convert it to lowercase.
    """
    value = unicodedata.normalize('NFKC', value).lower().strip()
    value = re.sub(r'[\\/<>:\?\*\|"]', '', value)
    value = re.sub(r'\.*$', '', value)
    return value
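# e.g. slugify('My "Cool" Thing: v2?') == 'my cool thing v2'
# (the final re.sub also strips trailing dots, which Windows dislikes).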


class PageChecker(object):
    """ Selenium wait callable that returns True once a thing page has
    rendered its title, file rows, images and license. """

    def __init__(self):
        self.log = []
        self.title = None
        self.file_count = None
        self.files = None
        self.images = None
        self.license = None

    def __call__(self, _):
        try:
            self.log.append("call")
            if self.title is None:
                # First find the name
                name = EC._find_element(BROWSER, (By.CSS_SELECTOR, "[class^=ThingPage__modelName]"))
                if name is None:
                    return False
                self.title = name.text

            if self.file_count is None:
                # OK. Do we know how many files we have to download?
                metrics = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=MetricButton]"))
                self.log.append("got some metrics: {}".format(len(metrics)))
                cur_count = int([x.text.split("\n")[0] for x in metrics if x.text.endswith("\nThing Files")][0])
                self.log.append(cur_count)
                if cur_count == 0:
                    return False
                self.file_count = cur_count

            self.log.append("looking for {} files".format(self.file_count))
            file_rows = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=ThingFile__fileRow]"))
            self.log.append("found {} files".format(len(file_rows)))
            if len(file_rows) < self.file_count:
                return False

            self.log.append("Looking for images")
            # By this point we _should_ have loaded all the images
            self.images = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=thumb]"))
            self.license = EC._find_element(BROWSER, (By.CSS_SELECTOR, "[class^=License__licenseText]")).text
            self.log.append("found {} images".format(len(self.images)))
            self.files = file_rows
            return True
        except Exception:
            return False
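# PageChecker instances are handed to WebDriverWait.until() (see Thing._parse
# below): Selenium polls the callable until it returns something truthy or the
# wait times out, so each False above just means "page not finished yet".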


class Downloader(multiprocessing.Process):
    """
    Worker process that pulls thing ids off a queue and downloads them.
    """

    def __init__(self, thing_queue, download_directory, compress):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory
        self.compress = compress

    def run(self):
        """ The actual download loop. """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                logging.info("Shutting down downloader")
                self.thing_queue.task_done()
                break
            logging.info("Handling id {}".format(thing_id))
            Thing(thing_id).download(self.download_directory, self.compress)
            self.thing_queue.task_done()
        return
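# Each worker exits when it reads a None sentinel; main() pushes one None per
# downloader after queueing all the real thing ids.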


class Grouping:
    """ Holds details of a group of things for download.
    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.
    """

    def __init__(self, quick, compress):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        self.compress = compress
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings (fewer than one page of things). """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (fewer than 13 items) grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things
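    # A single iteration of the paging loop above is roughly equivalent to
    # (hypothetical request id):
    #   requests.post(URL_COLLECTION, {'base_url': url, 'page': '2',
    #                                  'per_page': '12', 'id': '1234'})
    # with each response page carrying up to per_page 'card-img-holder' links.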

    def download(self):
        """ Downloads all the things in the grouping. """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - object not initialised properly?")

        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            RC = Thing(thing).download(self.download_dir, self.compress)
            if self.quick and RC == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return


307class Collection(Grouping):
308 """ Holds details of a collection. """
dbdb1782 309
ae598d73
OM
310 def __init__(self, user, name, directory, quick, compress):
311 Grouping.__init__(self, quick, compress)
3522a3bf
OM
312 self.user = user
313 self.name = name
3c82f75b
OM
314 self.url = "{}/{}/collections/{}".format(
315 URL_BASE, self.user, strip_ws(self.name))
d66f1f78 316 self.download_dir = os.path.join(directory,
3c82f75b 317 "{}-{}".format(slugify(self.user), slugify(self.name)))
948bd56f 318 self.collection_url = URL_COLLECTION
3522a3bf 319
dbdb1782 320

class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory, quick, compress):
        Grouping.__init__(self, quick, compress)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION


class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None
        self.time_stamp = None
        self._file_links = FileLinks()

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            BROWSER.get(url)
            wait = WebDriverWait(BROWSER, 60)
            pc = PageChecker()
            wait.until(pc)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        except selenium.common.exceptions.TimeoutException:
            logging.error(pc.log)
            logging.error("Timeout trying to parse thing {}".format(self.thing_id))
            return

        self.title = pc.title
        if not pc.files:
            logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(self.thing_id))
            return
        for link in pc.files:
            logging.debug("Parsing link: {}".format(link.text))
            link_link = link.find_element_by_xpath(".//a").get_attribute("href")
            if link_link.endswith("/zip"):
                # bulk link.
                continue
            try:
                link_title, link_details, _ = link.text.split("\n")
            except ValueError:
                # If it is a filetype that doesn't generate a picture, then we get an extra field at the start.
                _, link_title, link_details, _ = link.text.split("\n")

            # link_details will be something like '461 kb | Updated 06-11-2019 | 373 Downloads'
            # and we need to convert the date from M-D-Y to Y-M-D.
            link_date = [int(x) for x in link_details.split("|")[1].split()[-1].split("-")]
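            # Worked example: ' Updated 06-11-2019 ' is field [1] of the split
            # on '|'; its last whitespace-separated token is '06-11-2019', so
            # link_date == [6, 11, 2019] (month, day, year).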
            try:
                self._file_links.append(FileLink(link_title, datetime.datetime(link_date[2], link_date[0], link_date[1]), link_link))
            except ValueError:
                logging.error(link_date)

        self._image_links = [x.find_element_by_xpath(".//img").get_attribute("src") for x in pc.images]
        self._license = pc.license
        self.pc = pc

        self.old_download_dir = os.path.join(base_dir, slugify(self.title))
        self.download_dir = os.path.join(base_dir, "{} - {}".format(self.thing_id, slugify(self.title)))

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            logging.info("Looking for old dir at {}".format(self.old_download_dir))
            if os.path.exists(self.old_download_dir):
                logging.warning("Found previous style download directory. Moving it from {} to {}".format(self.old_download_dir, self.download_dir))
                os.rename(self.old_download_dir, self.download_dir)
            else:
                # Not yet downloaded
                self._parsed = True
                return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were written.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                # .split(' ')[0] drops the time-of-day that old-style timestamps carried.
                last_bits = [int(x) for x in timestamp_handle.readlines()[0].split(' ')[0].split("-")]
                logging.warning(last_bits)
                # Zeroed fields turn up in some old timestamps; substitute harmless minimums.
                if last_bits[0] == 0:
                    last_bits[0] = 1
                if last_bits[1] == 0:
                    last_bits[1] = 1
                if last_bits[2] == 0:
                    last_bits[2] = 1980
                try:
                    self.last_time = datetime.datetime(last_bits[0], last_bits[1], last_bits[2])
                except ValueError:
                    # This one appears to be M D Y
                    self.last_time = datetime.datetime(last_bits[2], last_bits[0], last_bits[1])

            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "No timestamp file found. Assuming first download.")
            self.last_time = None
            self._needs_download = True
            self._parsed = True
            return

        # OK, so we have a timestamp; let's see if there is anything new to get.
        try:
            if self._file_links.last_update > self.last_time:
                logging.info(
                    "Found new/updated files {}".format(self._file_links.last_update))
                self._needs_download = True
                self._parsed = True
                return
        except TypeError:
            logging.warning("No files found for {}.".format(self.thing_id))

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir, compress):
        """ Download all files for a given thing.
        Returns State.OK if the thing is now present locally (which does not
        necessarily mean anything was fetched this run), State.ALREADY_DOWNLOADED
        if it was up to date, or State.FAILED on error.
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            print("{} - {} already downloaded - skipping.".format(self.thing_id, self.title))
            return State.ALREADY_DOWNLOADED

        if not self._file_links:
            print("{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.title))
            return State.FAILED

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                logging.warning("Old style download dir found at {}".format(self.download_dir))
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                prev_dir = "{}_{}".format(self.download_dir, slugify(str(self.last_time)))
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        self.time_stamp = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            self.time_stamp = new_file_links[0].last_update

            for file_link in new_file_links:
                self.time_stamp = max(self.time_stamp, file_link.last_update)
            logging.debug("New timestamp will be {}".format(self.time_stamp))
        else:
            self.time_stamp = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    self.time_stamp = max(self.time_stamp, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(self.time_stamp))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                base_link = fl.link
                try:
                    # Resolve the redirect up front so filelist.txt records direct links.
                    fl.link = requests.get(fl.link, allow_redirects=False).headers['location']
                except Exception:
                    # Sometimes Thingiverse just gives us the direct link the first time. Not sure why.
                    pass

                fl_handle.write("{},{},{}, {}\n".format(fl.link, fl.name, fl.last_update, base_link))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link.name)
            new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(file_link.name))
                new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = requests.get(file_link.link)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # People like images.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.basename(imagelink)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(imagelink)
                with open(truncate_name(os.path.join(image_dir, filename)), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(filename, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        """
        # Instructions are good too.
        logging.info("Downloading readme")
        try:
            readme_txt = soup.find('meta', property='og:description')[
                'content']
            with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))
        """
        # Best get some licenses
        logging.info("Downloading license")
        try:
            if self._license:
                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w', encoding="utf-8") as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(str(self.time_stamp))
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
        if not compress:
            return State.OK

        thing_dir = "{} - {} - {}".format(self.thing_id,
                                          slugify(self.title),
                                          self.time_stamp)
        file_name = os.path.join(base_dir,
                                 "{}.7z".format(thing_dir))
        logging.debug("Compressing {} to {}".format(
            self.title,
            file_name))
        # with libarchive.file_writer(file_name, 'lzma', '7z') as archive:
        with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
            # with py7zr.SevenZipFile(file_name, 'w') as archive:
            archive.writeall(self.download_dir, thing_dir)
        logging.debug("Compression of {} finished.".format(self.title))
        return State.OK
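    # A quick sanity check of the archive written above, as a sketch (assumes
    # py7zr's read API; adjust to the installed version):
    #   with py7zr.SevenZipFile(file_name, 'r') as archive:
    #       print(archive.getnames())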


def do_batch(batch_file, download_dir, quick, compress):
    """ Read a file in line by line, parsing each as a set of calls to this script. """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir, compress)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick, compress).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick, compress).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
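# An example batch file (hypothetical ids and names), one instruction per line:
#   thing 4733281
#   collection some_user some_collection
#   user some_user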


def main():
    """ Entry point for the script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")
    parser.add_argument("-c", "--compress", action="store_true",
                        help="Compress downloaded things into 7z archives")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space separated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)


    # Start downloaders
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory, args.compress) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick, args.compress).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick, args.compress).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick, args.compress)

    # Stop the downloader processes by sending one sentinel each
    for downloader in downloaders:
        thing_queue.put(None)


atexit.register(BROWSER.quit)

if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()