Commit | Line | Data |
---|---|---|
975060c9 OM |
1 | #!/usr/bin/env python3 |
2 | """ | |
3 | Thingiverse bulk downloader | |
4 | """ | |
5 | ||
6 | import re | |
4a98996b | 7 | import sys |
975060c9 OM |
8 | import os |
9 | import argparse | |
10 | import unicodedata | |
11 | import requests | |
fa2f3251 | 12 | import logging |
6a777954 | 13 | import multiprocessing |
7b84ba6d | 14 | import enum |
3c82f75b | 15 | from shutil import copyfile |
975060c9 | 16 | from bs4 import BeautifulSoup |
b497d705 OM |
17 | from dataclasses import dataclass |
18 | import selenium | |
19 | from selenium import webdriver | |
20 | from selenium.webdriver.common.by import By | |
21 | from selenium.webdriver.support.ui import WebDriverWait | |
22 | from selenium.webdriver.support import expected_conditions as EC | |
23 | from selenium.webdriver.firefox.options import Options | |
975060c9 OM |
24 | |
# Site endpoints scraped/queried by this tool.
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

# Patterns that pull pagination metadata out of the raw page text.
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

DOWNLOADER_COUNT = 1   # number of background downloader processes
RETRY_COUNT = 3        # declared retry budget (not yet referenced here)

VERSION = "0.7.0"


#BROWSER = webdriver.PhantomJS('./phantomjs')
# One shared Firefox instance drives all page scraping.
options = Options()
BROWSER = webdriver.Firefox(options=options)

BROWSER.set_window_size(1980, 1080)
47 | ||
48 | ||
@dataclass
class FileLink:
    """One downloadable file attached to a thing."""
    name: str         # display name; also used as the on-disk filename
    last_update: str  # date string scraped from the page, e.g. '06-11-2019'
    link: str         # download URL (may be a redirect)
54 | ||
55 | ||
7b84ba6d OM |
class State(enum.Enum):
    """Outcome of a single Thing download attempt."""
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()
60 | ||
dbdb1782 | 61 | |
dd8c35f4 OM |
def strip_ws(value):
    """Collapse every run of whitespace/hyphens in *value* into one hyphen."""
    return re.sub(r'[-\s]+', '-', value)
975060c9 | 65 | |
dbdb1782 | 66 | |
975060c9 OM |
def slugify(value):
    """Make *value* safe for use in a directory name.

    Folds the string to ASCII, strips characters that are not word
    characters, whitespace or hyphens, then hyphenates whitespace runs.
    (Case is preserved, despite what older docs claimed.)
    """
    ascii_text = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    cleaned = re.sub(r'[^\w\s-]', '', ascii_text).strip()
    return re.sub(r'[-\s]+', '-', cleaned)
77 | ||
b497d705 OM |
class PageChecker(object):
    """Callable predicate for WebDriverWait.

    Becomes truthy once the thing page has rendered its title, its
    "Thing Files" count, and at least that many file rows. Collects the
    rows in self.files and keeps a debug trail in self.log.

    NOTE(review): uses selenium's private EC._find_element(s) helpers —
    may break across selenium upgrades; confirm on version bumps.
    """

    def __init__(self):
        self.log = []           # debug trail of what each poll observed
        self.title = None       # model name, once located
        self.file_count = None  # expected number of file rows
        self.files = None       # located file-row elements

    def __call__(self, _):
        try:
            self.log.append("call")
            if self.title is None:
                # first find the name
                name_el = EC._find_element(
                    BROWSER, (By.CSS_SELECTOR, "[class^=ThingPage__modelName]"))
                if name_el is None:
                    return False
                self.title = name_el.text

            if self.file_count is None:
                # OK. Do we know how many files we have to download?
                metrics = EC._find_elements(
                    BROWSER, (By.CSS_SELECTOR, "[class^=MetricButton]"))
                self.log.append("got some metrics: {}".format(len(metrics)))
                counts = [m.text.split("\n")[0]
                          for m in metrics if m.text.endswith("\nThing Files")]
                cur_count = int(counts[0])
                self.log.append(cur_count)
                if cur_count == 0:
                    return False
                self.file_count = cur_count

            self.log.append("looking for {} files".format(self.file_count))
            rows = EC._find_elements(
                BROWSER, (By.CSS_SELECTOR, "[class^=ThingFile__fileRow]"))
            self.log.append("found {} files".format(len(rows)))
            if len(rows) >= self.file_count:
                self.files = rows
                return True
            return False
        except Exception:
            # WebDriverWait polls repeatedly; any lookup hiccup just means
            # "not ready yet".
            return False
115 | ||
116 | ||
117 | ||
118 | ||
6a777954 OM |
class Downloader(multiprocessing.Process):
    """Worker process: pulls thing ids off a queue and downloads each one.

    A None sentinel on the queue tells the worker to stop.
    """

    def __init__(self, thing_queue, download_directory):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory

    def run(self):
        """Consume thing ids until the None sentinel arrives."""
        while True:
            thing_id = self.thing_queue.get()
            try:
                if thing_id is None:
                    logging.info("Shutting download queue")
                    break
                logging.info("Handling id {}".format(thing_id))
                Thing(thing_id).download(self.download_directory)
            finally:
                # Mark every dequeued item done, sentinel included, so a
                # queue.join() elsewhere can complete.
                self.thing_queue.task_done()
143 | ||
7b84ba6d | 144 | |
6a777954 OM |
145 | |
146 | ||
dbdb1782 | 147 | |
class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self, quick):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        page = BeautifulSoup(req.text, features='lxml')
        cards = page.find_all('a', {'class': 'card-img-holder'})
        self.things = [card['href'].split(':')[1] for card in cards]
        self.total = len(self.things)
        return self.things

    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # Already fetched once; reuse the cached list.
            return self.things

        # Guard against use before a subclass filled in the URL.
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Pull the grouping's pagination metadata from the first page.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total_match = TOTAL_REGEX.search(c_req.text)
        if total_match is None:
            # This is a small (<13) items grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total_match.group(1)
        self.req_id = ID_REGEX.search(c_req.text).group(1)
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).group(1))
        self.per_page = PER_PAGE_REGEX.search(c_req.text).group(1)
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        # Walk every page of the ajax listing, accumulating thing ids.
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            page = BeautifulSoup(req.text, features='lxml')
            cards = page.find_all('a', {'class': 'card-img-holder'})
            self.things += [card['href'].split(':')[1] for card in cards]

        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {}".format(idx))
            result = Thing(thing).download(self.download_dir)
            if self.quick and result == State.ALREADY_DOWNLOADED:
                # In --quick mode, date ordering is assumed, so the first
                # already-downloaded thing means everything after is too.
                logging.info("Caught up, stopping.")
                return
975060c9 | 235 | |
dbdb1782 | 236 | |
3522a3bf OM |
class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick):
        super().__init__(quick)
        self.user = user
        self.name = name
        # Collection pages live under /<user>/collections/<name>.
        self.url = f"{URL_BASE}/{user}/collections/{strip_ws(name)}"
        self.download_dir = os.path.join(
            directory, f"{slugify(user)}-{slugify(name)}")
        self.collection_url = URL_COLLECTION
3522a3bf | 249 | |
dbdb1782 | 250 | |
3522a3bf OM |
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory, quick):
        super().__init__(quick)
        self.user = user
        # A user's designs live under /<user>/designs.
        self.url = f"{URL_BASE}/{user}/designs"
        self.download_dir = os.path.join(
            directory, f"{slugify(user)} designs")
        self.collection_url = USER_COLLECTION
975060c9 | 261 | |
dbdb1782 | 262 | |
3c82f75b OM |
class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        # Timestamp string of the last completed download, if any.
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything needs to be done.

        Drives the shared BROWSER to the thing's files page, scrapes the
        file list, and compares it against any previously recorded
        timestamp under base_dir.  Sets self._parsed on success; leaves
        it False if the page could not be fetched.
        """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            BROWSER.get(url)
            wait = WebDriverWait(BROWSER, 20)
            pc = PageChecker()
            wait.until(pc)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        self.title = pc.title
        self._file_links = []
        for link in pc.files:
            link_title, link_details, _ = link.text.split("\n")
            # link_details will be something like
            # '461 kb | Updated 06-11-2019 | 373 Downloads'
            link_date = link_details.split("|")[1][10:-1]
            link_link = link.find_element_by_xpath(".//a").get_attribute("href")
            self._file_links.append(FileLink(link_title, link_date, link_link))

        self.old_download_dir = os.path.join(base_dir, self.title)
        self.download_dir = os.path.join(
            base_dir, "{} - {}".format(self.thing_id, self.title))

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            if os.path.exists(self.old_download_dir):
                logging.info("Found previous style download directory. Moving it")
                # FIX: copyfile() raises IsADirectoryError on a directory;
                # os.rename actually moves the old tree, as the log says.
                os.rename(self.old_download_dir, self.download_dir)
            else:
                # Not yet downloaded
                self._parsed = True
                return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                # add the .split(' ')[0] to remove the timestamp from the old
                # style timestamps
                self.last_time = timestamp_handle.readlines()[0].split(' ')[0]
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._needs_download = True
            self._parsed = True
            return

        # OK, so we have a timestamp, lets see if there is anything new to get
        # NOTE(review): last_update looks like MM-DD-YYYY, so this
        # lexicographic comparison is not a true date ordering - confirm.
        for file_link in self._file_links:
            if file_link.last_update > self.last_time:
                # FIX: was file_link["title"] - FileLink is a dataclass,
                # not a mapping, and has no "title" field.
                logging.info(
                    "Found new/updated file {}".format(file_link.name))
                self._needs_download = True
                self._parsed = True
                return

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing.

        Returns a State value - OK, FAILED or ALREADY_DOWNLOADED - not a
        bool as the old docstring claimed; callers compare against State.
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            print("{} - {} already downloaded - skipping.".format(
                self.thing_id, self.title))
            return State.ALREADY_DOWNLOADED

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                logging.warning(
                    "Old style download dir found for {}".format(self.title))
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                prev_dir = "{}_{}".format(self.download_dir, slugify(self.last_time))
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            # FIX: guard against a thing with no files (was an unconditional
            # [0] index that raised IndexError on an empty list).
            new_last_time = new_file_links[0].last_update if new_file_links else ""
            for file_link in new_file_links:
                new_last_time = max(new_last_time, file_link.last_update)
            logging.debug("New timestamp will be {}".format(new_last_time))
        else:
            new_last_time = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    new_last_time = max(new_last_time, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w') as fl_handle:
            for fl in self._file_links:
                base_link = fl.link
                try:
                    # Resolve the one-hop redirect to the real file URL.
                    fl.link = requests.get(
                        fl.link, allow_redirects=False).headers['location']
                except Exception:
                    # FIX: was "except Exception e:" (a SyntaxError) and used
                    # the deprecated logging.warn().
                    logging.warning(
                        "Unable to get actual target for {}".format(base_link))

                # FIX: format string had three placeholders for four values,
                # silently dropping the original (pre-redirect) link.
                fl_handle.write("{},{},{},{}\n".format(
                    fl.link, fl.name, fl.last_update, base_link))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link.name)
            new_file = os.path.join(self.download_dir, file_link.name)
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                # FIX: was file_link["title"] - dataclass attribute access.
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(
                        file_link.name))
                new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = os.path.join(self.download_dir, file_link.name)
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = requests.get(file_link.link)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(
                file_link.name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED

        # (A large commented-out block downloading images, the readme and the
        # license lived here; it referenced a "soup" object that no longer
        # exists since the move to selenium. See VCS history to revive it.)

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
        return State.OK
975060c9 | 519 | |
dbdb1782 | 520 | |
7b84ba6d | 521 | def do_batch(batch_file, download_dir, quick): |
1ab49020 OM |
522 | """ Read a file in line by line, parsing each as a set of calls to this script.""" |
523 | with open(batch_file) as handle: | |
524 | for line in handle: | |
525 | line = line.strip() | |
cf280385 M |
526 | if not line: |
527 | # Skip empty lines | |
528 | continue | |
1ab49020 OM |
529 | logging.info("Handling instruction {}".format(line)) |
530 | command_arr = line.split() | |
531 | if command_arr[0] == "thing": | |
dbdb1782 OM |
532 | logging.debug( |
533 | "Handling batch thing instruction: {}".format(line)) | |
1ab49020 OM |
534 | Thing(command_arr[1]).download(download_dir) |
535 | continue | |
536 | if command_arr[0] == "collection": | |
dbdb1782 OM |
537 | logging.debug( |
538 | "Handling batch collection instruction: {}".format(line)) | |
539 | Collection(command_arr[1], command_arr[2], | |
7b84ba6d | 540 | download_dir, quick).download() |
1ab49020 OM |
541 | continue |
542 | if command_arr[0] == "user": | |
dbdb1782 OM |
543 | logging.debug( |
544 | "Handling batch collection instruction: {}".format(line)) | |
7b84ba6d | 545 | Designs(command_arr[1], download_dir, quick).download() |
1ab49020 OM |
546 | continue |
547 | logging.warning("Unable to parse current instruction. Skipping.") | |
548 | ||
dbdb1782 | 549 | |
975060c9 OM |
def main():
    """ Entry point for script being run as a command.

    Parses the CLI, configures logging, spins up the downloader worker
    processes, dispatches on the chosen subcommand, then shuts the
    workers down cleanly.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space seperated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space seperated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space seperated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    # Logging: console at the requested level, optional file at DEBUG.
    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())
    # NOTE(review): the console handler has no formatter attached -
    # presumably deliberate (bare messages on screen); confirm before
    # changing, as attaching one alters user-visible output.
    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory)
                   for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick)

    # Stop the downloader processes
    for downloader in downloaders:
        thing_queue.put(None)
    # FIX: wait for the workers to drain the queue and exit, so the script
    # does not return while downloads are still in flight.
    for downloader in downloaders:
        downloader.join()
975060c9 | 630 | |
0930777e OM |
if __name__ == "__main__":
    # freeze_support() is required for frozen (e.g. PyInstaller) Windows
    # builds that use multiprocessing.
    multiprocessing.freeze_support()
    main()