Couple of minor filename handling fixes for windows - resolves #10, resolves #11
[clinton/thingy_grabber.git] / thingy_grabber.py
CommitLineData
975060c9
OM
1#!/usr/bin/env python3
2"""
3Thingiverse bulk downloader
4"""
5
6import re
4a98996b 7import sys
975060c9
OM
8import os
9import argparse
10import unicodedata
11import requests
fa2f3251 12import logging
6a777954 13import multiprocessing
7b84ba6d 14import enum
fb28c59b 15import datetime
3c82f75b 16from shutil import copyfile
975060c9 17from bs4 import BeautifulSoup
b497d705
OM
18from dataclasses import dataclass
19import selenium
20from selenium import webdriver
21from selenium.webdriver.common.by import By
22from selenium.webdriver.support.ui import WebDriverWait
23from selenium.webdriver.support import expected_conditions as EC
24from selenium.webdriver.firefox.options import Options
d194b140 25import atexit
975060c9
OM
26
# Thingiverse endpoints used for scraping.
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

# Regexes used to pull paging metadata out of the JSON embedded in the
# collection/design listing pages.
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
# Matches runs of whitespace/hyphens; used by strip_ws().
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# Number of background Downloader processes to start.
DOWNLOADER_COUNT = 1
RETRY_COUNT = 3

# Conservative cap so generated paths stay under the Windows MAX_PATH limit.
MAX_PATH_LENGTH = 250

VERSION = "0.8.5"


#BROWSER = webdriver.PhantomJS('./phantomjs')
# NOTE: importing this module launches a headless Firefox instance as a
# side effect; it is shut down via the atexit hook at the bottom of the file.
options = Options()
options.add_argument("--headless")
BROWSER = webdriver.Firefox(options=options)

BROWSER.set_window_size(1980, 1080)
52
53
@dataclass
class FileLink:
    """ A single downloadable file attached to a thing.

    Constructed as FileLink(name, last_update, link) in Thing._parse.
    The original annotations for last_update and link were swapped:
    last_update holds the parsed datetime, link holds the download URL.
    """
    name: str
    last_update: datetime.datetime
    link: str
b497d705
OM
59
60
7b84ba6d
OM
class State(enum.Enum):
    """ Outcome of attempting to download a single thing. """
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()
65
dbdb1782 66
65bd8b43
OM
def fail_dir(dir_name):
    """ Move a directory containing a failed download out of the way.

    Renames dir_name to "<dir_name>_failed", falling back to
    "<dir_name>_failed_N" for the first N whose name is free.
    """
    suffix = 0
    candidate = "{}_failed".format(dir_name)
    while os.path.exists(candidate):
        candidate = "{}_failed_{}".format(dir_name, suffix)
        suffix += 1
    os.rename(dir_name, candidate)
76
77
def truncate_name(file_name, max_length=None):
    """ Ensure the filename is not too long for, well windows basically.

    Returns the absolute path for file_name, shortened to at most
    max_length characters (MAX_PATH_LENGTH by default).  If shortening
    collides with an existing file, a "_N" suffix is appended.

    Bug fix: the original computed how much to cut (`to_cut`) but never
    actually shortened the path, and its collision loop re-tested the
    same candidate once before advancing.
    """
    if max_length is None:
        max_length = MAX_PATH_LENGTH
    path = os.path.abspath(file_name)
    if len(path) <= max_length:
        return path
    base, extension = os.path.splitext(path)
    # Reserve 3 characters for a possible "_NN" disambiguation suffix.
    # NOTE: if the directory portion alone exceeds the budget this can
    # still produce an unusable path, but that matches the constraint we
    # are working within (the OS limit itself).
    base = base[:max_length - len(extension) - 3]
    new_path = "{}{}".format(base, extension)
    inc = 0
    while os.path.exists(new_path):
        new_path = "{}_{}{}".format(base, inc, extension)
        inc += 1
    return new_path
92
93
dd8c35f4
OM
def strip_ws(value):
    """ Collapse every run of whitespace (or existing hyphens) in value
    into a single hyphen. """
    return str(re.sub(r'[-\s]+', '-', value))
975060c9 97
dbdb1782 98
975060c9
OM
def slugify(value):
    """
    Normalise a string for use as a filename: NFKC-normalise, lowercase,
    strip surrounding whitespace, remove characters that are invalid in
    Windows filenames, and drop any trailing dots.
    """
    cleaned = unicodedata.normalize('NFKC', value).lower().strip()
    # Characters Windows forbids in file names.
    cleaned = re.sub(r'[\\/<>:\?\*\|"]', '', cleaned)
    # Windows also dislikes names ending in '.'.
    return re.sub(r'\.*$', '', cleaned)
975060c9 108
b497d705
OM
class PageChecker(object):
    """ Callable condition for selenium's WebDriverWait.

    Polled repeatedly until it returns True; accumulates the page title,
    file rows, image thumbnails and license text from a thing's /files
    page as they render.  `log` records progress for debugging timeouts.

    NOTE(review): uses the private selenium helpers EC._find_element /
    EC._find_elements; these may break on a selenium upgrade.
    """
    def __init__(self):
        self.log = []          # debug trail, dumped on timeout
        self.title = None      # thing name text
        self.file_count = None # expected number of file rows
        self.files = None      # located file row elements
        self.images = None     # located thumbnail elements
        self.license = None    # license text


    def __call__(self, _):
        # Swallows all exceptions and returns False so WebDriverWait
        # simply retries until its timeout.
        try:
            self.log.append("call")
            if self.title is None:
                # first find the name
                name = EC._find_element(BROWSER, (By.CSS_SELECTOR, "[class^=ThingPage__modelName]"))
                if name is None:
                    return False
                self.title = name.text

            if self.file_count is None:
                # OK. Do we know how many files we have to download?
                metrics = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=MetricButton]"))
                self.log.append("got some metrics: {}".format(len(metrics)))
                cur_count = int([x.text.split("\n")[0] for x in metrics if x.text.endswith("\nThing Files")][0])
                self.log.append(cur_count)
                if cur_count == 0:
                    return False
                self.file_count = cur_count

            self.log.append("looking for {} files".format(self.file_count))
            fileRows = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=ThingFile__fileRow]"))
            self.log.append("found {} files".format(len(fileRows)))
            # Keep waiting until every advertised file row has rendered.
            if len(fileRows) < self.file_count:
                return False

            self.log.append("Looking for images")
            # By this point _should_ have loaded all the images
            self.images = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=thumb]"))
            self.license = EC._find_element(BROWSER, (By.CSS_SELECTOR, "[class^=License__licenseText]")).text
            self.log.append("found {} images".format(len(self.images)))
            self.files = fileRows
            return True
        except Exception:
            return False
154
155
156
157
6a777954
OM
class Downloader(multiprocessing.Process):
    """ Worker process that pulls thing ids off a shared queue and
    downloads each one into download_directory.  A None on the queue is
    the shutdown sentinel. """

    def __init__(self, thing_queue, download_directory):
        super().__init__()
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory

    def run(self):
        """ Consume the queue until the None sentinel arrives. """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                return
            logging.info("Handling id {}".format(thing_id))
            Thing(thing_id).download(self.download_directory)
            self.thing_queue.task_done()
182
7b84ba6d 183
6a777954
OM
184
185
dbdb1782 186
3522a3bf 187class Grouping:
d66f1f78 188 """ Holds details of a group of things for download
3c82f75b
OM
189 This is effectively (although not actually) an abstract class
190 - use Collection or Designs instead.
191 """
dbdb1782 192
7b84ba6d 193 def __init__(self, quick):
975060c9
OM
194 self.things = []
195 self.total = 0
196 self.req_id = None
197 self.last_page = 0
198 self.per_page = None
7b84ba6d
OM
199 # Should we stop downloading when we hit a known datestamp?
200 self.quick = quick
948bd56f 201 # These should be set by child classes.
3522a3bf
OM
202 self.url = None
203 self.download_dir = None
948bd56f 204 self.collection_url = None
975060c9 205
3522a3bf
OM
206 def _get_small_grouping(self, req):
207 """ Handle small groupings """
975060c9 208 soup = BeautifulSoup(req.text, features='lxml')
dbdb1782 209 links = soup.find_all('a', {'class': 'card-img-holder'})
975060c9 210 self.things = [x['href'].split(':')[1] for x in links]
fa2f3251 211 self.total = len(self.things)
975060c9
OM
212
213 return self.things
214
3522a3bf
OM
215 def get(self):
216 """ retrieve the things of the grouping. """
975060c9
OM
217 if self.things:
218 # We've already done it.
219 return self.things
220
3522a3bf
OM
221 # Check for initialisation:
222 if not self.url:
fa2f3251 223 logging.error("No URL set - object not initialised properly?")
3522a3bf
OM
224 raise ValueError("No URL set - object not initialised properly?")
225
226 # Get the internal details of the grouping.
fa2f3251 227 logging.debug("Querying {}".format(self.url))
3522a3bf 228 c_req = requests.get(self.url)
975060c9
OM
229 total = TOTAL_REGEX.search(c_req.text)
230 if total is None:
3522a3bf
OM
231 # This is a small (<13) items grouping. Pull the list from this req.
232 return self._get_small_grouping(c_req)
975060c9
OM
233 self.total = total.groups()[0]
234 self.req_id = ID_REGEX.search(c_req.text).groups()[0]
235 self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
236 self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
237 parameters = {
dbdb1782
OM
238 'base_url': self.url,
239 'page': '1',
240 'per_page': '12',
241 'id': self.req_id
975060c9
OM
242 }
243 for current_page in range(1, self.last_page + 1):
244 parameters['page'] = current_page
948bd56f 245 req = requests.post(self.collection_url, parameters)
975060c9 246 soup = BeautifulSoup(req.text, features='lxml')
dbdb1782 247 links = soup.find_all('a', {'class': 'card-img-holder'})
975060c9
OM
248 self.things += [x['href'].split(':')[1] for x in links]
249
250 return self.things
251
252 def download(self):
253 """ Downloads all the files in a collection """
254 if not self.things:
3522a3bf
OM
255 self.get()
256
257 if not self.download_dir:
dbdb1782
OM
258 raise ValueError(
259 "No download_dir set - invalidly initialised object?")
3522a3bf 260
975060c9 261 base_dir = os.getcwd()
975060c9 262 try:
3522a3bf 263 os.mkdir(self.download_dir)
975060c9 264 except FileExistsError:
fa2f3251 265 logging.info("Target directory {} already exists. Assuming a resume."
dbdb1782 266 .format(self.download_dir))
fa2f3251 267 logging.info("Downloading {} thing(s).".format(self.total))
dbdb1782 268 for idx, thing in enumerate(self.things):
fb28c59b 269 logging.info("Downloading thing {} - {}".format(idx, thing))
7b84ba6d
OM
270 RC = Thing(thing).download(self.download_dir)
271 if self.quick and RC==State.ALREADY_DOWNLOADED:
272 logging.info("Caught up, stopping.")
273 return
975060c9 274
dbdb1782 275
3522a3bf
OM
class Collection(Grouping):
    """ Holds details of a single user's collection. """

    def __init__(self, user, name, directory, quick):
        super().__init__(quick)
        self.user = user
        self.name = name
        self.collection_url = URL_COLLECTION
        # Collection listing page; whitespace in the name collapses to '-'.
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        # Target directory: <directory>/<user>-<collection name>
        self.download_dir = os.path.join(
            directory, "{}-{}".format(slugify(self.user), slugify(self.name)))
3522a3bf 288
dbdb1782 289
3522a3bf
OM
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory, quick):
        super().__init__(quick)
        self.user = user
        self.collection_url = USER_COLLECTION
        # The user's designs listing page.
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        # Target directory: "<directory>/<user> designs"
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
975060c9 300
dbdb1782 301
3c82f75b
OM
class Thing:
    """ An individual design on thingiverse.

    Workflow: _parse() scrapes the /files page (via the module-level
    headless BROWSER) and works out whether anything new exists compared
    to the local timestamp file; download() then copies unchanged files
    from the previous snapshot and fetches new ones.
    """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None        # datetime of last completed download, if any
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything needs to be done.

        Sets _parsed on success; on network/timeout failure returns with
        _parsed still False so download() can abort cleanly.
        """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            BROWSER.get(url)
            wait = WebDriverWait(BROWSER, 60)
            pc = PageChecker()
            wait.until(pc)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        except selenium.common.exceptions.TimeoutException:
            logging.error(pc.log)
            logging.error("Timeout trying to parse thing {}".format(self.thing_id))
            return

        self.title = pc.title
        self._file_links = []
        for link in pc.files:
            logging.debug("Parsing link: {}".format(link.text))
            link_link = link.find_element_by_xpath(".//a").get_attribute("href")
            if link_link.endswith("/zip"):
                # bulk link.
                continue
            try:
                link_title, link_details, _ = link.text.split("\n")
            except ValueError:
                # If it is a filetype that doesn't generate a picture, then we get an extra field at the start.
                _, link_title, link_details, _ = link.text.split("\n")

            #link_details will be something like '461 kb | Updated 06-11-2019 | 373 Downloads'
            #need to convert from M D Y to Y M D
            link_date = [int(x) for x in link_details.split("|")[1].split()[-1].split("-")]
            try:
                self._file_links.append(FileLink(link_title, datetime.datetime(link_date[2], link_date[0], link_date[1]), link_link))
            except ValueError:
                # Unparseable date - log it and skip this file.
                logging.error(link_date)

        self._image_links = [x.find_element_by_xpath(".//img").get_attribute("src") for x in pc.images]
        self._license = pc.license
        self.pc = pc

        # Older versions used "<title>"; current layout is "<id> - <title>".
        self.old_download_dir = os.path.join(base_dir, slugify(self.title))
        self.download_dir = os.path.join(base_dir, "{} - {}".format(self.thing_id, slugify(self.title)))

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            logging.info("Looking for old dir at {}".format(self.old_download_dir))
            if os.path.exists(self.old_download_dir):
                logging.warning("Found previous style download directory. Moving it from {} to {}".format(self.old_download_dir, self.download_dir))
                os.rename(self.old_download_dir, self.download_dir)
            else:
                # Not yet downloaded
                self._parsed = True
                return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                # add the .split(' ')[0] to remove the timestamp from the old style timestamps
                last_bits = [int(x) for x in timestamp_handle.readlines()[0].split(' ')[0].split("-")]
                logging.warning(last_bits)
                try:
                    self.last_time = datetime.datetime(last_bits[0], last_bits[1], last_bits[2])
                except ValueError:
                    # This one appears to be M D Y
                    self.last_time = datetime.datetime(last_bits[2], last_bits[0], last_bits[1])

            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._needs_download = True
            self._parsed = True
            return

        # OK, so we have a timestamp, lets see if there is anything new to get
        for file_link in self._file_links:
            if file_link.last_update > self.last_time:
                logging.info(
                    "Found new/updated file {} - {}".format(file_link.name, file_link.last_update))
                self._needs_download = True
                self._parsed = True
                return

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing.

        Returns a State: OK on success, ALREADY_DOWNLOADED if nothing new,
        FAILED on any error (in which case the partial directory is moved
        aside via fail_dir).  (The old docstring claimed a boolean.)
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            print("{} - {} already downloaded - skipping.".format(self.thing_id, self.title))
            return State.ALREADY_DOWNLOADED

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                logging.warning("Old style download dir found at {}".format(self.title))
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                # Keep the previous snapshot so unchanged files can be copied.
                prev_dir = "{}_{}".format(self.download_dir, slugify(self.last_time.__str__()))
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            new_last_time = new_file_links[0].last_update

            for file_link in new_file_links:
                new_last_time = max(new_last_time, file_link.last_update)
            logging.debug("New timestamp will be {}".format(new_last_time))
        else:
            new_last_time = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    new_last_time = max(new_last_time, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                base_link = fl.link
                try:
                    # Resolve the redirect to the real CDN URL.
                    fl.link = requests.get(fl.link, allow_redirects=False).headers['location']
                except Exception:
                    # Sometimes Thingiverse just gives us the direct link the first time. Not sure why.
                    pass

                fl_handle.write("{},{},{}, {}\n".format(fl.link, fl.name, fl.last_update, base_link))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link.name)
            new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                # Bug fix: FileLink is a dataclass, so file_link["title"]
                # raised TypeError here; use the .name attribute instead.
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(file_link.name))
                new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = requests.get(file_link.link)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        # People like images. But this doesn't work yet.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.basename(imagelink)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(imagelink)
                with open(truncate_name(os.path.join(image_dir, filename)), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(filename, exception))
            fail_dir(self.download_dir)
            return State.FAILED

        """
        # instructions are good too.
        logging.info("Downloading readme")
        try:
            readme_txt = soup.find('meta', property='og:description')[
                'content']
            with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        """
        # Best get some licenses
        logging.info("Downloading license")
        try:
            if self._license:
                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w', encoding="utf-8") as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(new_last_time.__str__())
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
        return State.OK
975060c9 574
dbdb1782 575
7b84ba6d 576def do_batch(batch_file, download_dir, quick):
1ab49020
OM
577 """ Read a file in line by line, parsing each as a set of calls to this script."""
578 with open(batch_file) as handle:
579 for line in handle:
580 line = line.strip()
cf280385
M
581 if not line:
582 # Skip empty lines
583 continue
1ab49020
OM
584 logging.info("Handling instruction {}".format(line))
585 command_arr = line.split()
586 if command_arr[0] == "thing":
dbdb1782
OM
587 logging.debug(
588 "Handling batch thing instruction: {}".format(line))
1ab49020
OM
589 Thing(command_arr[1]).download(download_dir)
590 continue
591 if command_arr[0] == "collection":
dbdb1782
OM
592 logging.debug(
593 "Handling batch collection instruction: {}".format(line))
594 Collection(command_arr[1], command_arr[2],
7b84ba6d 595 download_dir, quick).download()
1ab49020
OM
596 continue
597 if command_arr[0] == "user":
dbdb1782
OM
598 logging.debug(
599 "Handling batch collection instruction: {}".format(line))
7b84ba6d 600 Designs(command_arr[1], download_dir, quick).download()
1ab49020
OM
601 continue
602 logging.warning("Unable to parse current instruction. Skipping.")
603
dbdb1782 604
975060c9
OM
def main():
    """ Entry point for script being run as a command.

    Parses arguments, configures logging, starts the background
    Downloader process(es), dispatches the chosen subcommand, then
    enqueues a None sentinel per downloader to shut them down.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space seperated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space seperated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space seperated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    # Root logger gets everything; handlers filter by level.
    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    # NOTE(review): formatter is only attached to the file handler, so
    # console output is unformatted - confirm this is intended.
    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)


    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()


    # Only the 'thing' subcommand uses the worker queue; the others
    # download synchronously in this process.
    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick)

    # Stop the downloader processes
    for downloader in downloaders:
        thing_queue.put(None)
975060c9 685
d194b140
OM
# Make sure the headless browser is shut down however the process exits.
atexit.register(BROWSER.quit)

if __name__ == "__main__":
    # Required for frozen (e.g. PyInstaller) Windows builds using multiprocessing.
    multiprocessing.freeze_support()
    main()