#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""
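# Example invocations (the subcommands are defined in main() below; the
# <...> placeholders are illustrative, not literal values):
#   thingy_grabber.py thing <thing_id> [<thing_id> ...]
#   thingy_grabber.py collection <owner> <collection_name> [<collection_name> ...]
#   thingy_grabber.py user <username> [<username> ...]
#   thingy_grabber.py batch <batch_file>
#   thingy_grabber.py version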

import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
import multiprocessing
import enum
from shutil import copyfile
from bs4 import BeautifulSoup
from dataclasses import dataclass
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

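# Regexes for pulling pagination metadata out of the JSON embedded in
# collection/design listing pages (used by Grouping.get below).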
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes it would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

DOWNLOADER_COUNT = 1
RETRY_COUNT = 3

VERSION = "0.7.0"


#BROWSER = webdriver.PhantomJS('./phantomjs')
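# The redesigned Thingiverse thing pages are rendered client-side, so pages
# are loaded in a real Firefox instance via Selenium and scraped only once
# the content has appeared (see PageChecker below).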
options = Options()
BROWSER = webdriver.Firefox(options=options)

BROWSER.set_window_size(1980, 1080)


@dataclass
class FileLink:
    name: str
    last_update: str
    link: str


class State(enum.Enum):
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()


def strip_ws(value):
    """ Remove whitespace from a string """
    return str(NO_WHITESPACE_REGEX.sub('-', value))


def slugify(value):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value

class PageChecker(object):
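    """ Expected-condition callable for WebDriverWait: returns True once the
        thing title, the file count, and all of the file rows are present in
        the rendered page, so the file list can be scraped.
    """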
    def __init__(self):
        self.log = []
        self.title = None
        self.file_count = None
        self.files = None


    def __call__(self, _):
        try:
            self.log.append("call")
            if self.title is None:
                # first find the name
                name = EC._find_element(BROWSER, (By.CSS_SELECTOR, "[class^=ThingPage__modelName]"))
                if name is None:
                    return False
                self.title = name.text

            if self.file_count is None:
                # OK. Do we know how many files we have to download?
                metrics = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=MetricButton]"))
                self.log.append("got some metrics: {}".format(len(metrics)))
                cur_count = int([x.text.split("\n")[0] for x in metrics if x.text.endswith("\nThing Files")][0])
                self.log.append(cur_count)
                if cur_count == 0:
                    return False
                self.file_count = cur_count

            self.log.append("looking for {} files".format(self.file_count))
            fileRows = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=ThingFile__fileRow]"))
            self.log.append("found {} files".format(len(fileRows)))
            if len(fileRows) >= self.file_count:
                self.files = fileRows
                return True
            return False
        except Exception:
            return False


class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """

    def __init__(self, thing_queue, download_directory):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory

    def run(self):
        """ actual download loop.
        """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                logging.info("Shutting down download queue")
                self.thing_queue.task_done()
                break
            logging.info("Handling id {}".format(thing_id))
            Thing(thing_id).download(self.download_directory)
            self.thing_queue.task_done()
        return


class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self, quick):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {}".format(idx))
            RC = Thing(thing).download(self.download_dir)
            if self.quick and RC == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return


class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION


class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION


class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            BROWSER.get(url)
            wait = WebDriverWait(BROWSER, 20)
            pc = PageChecker()
            wait.until(pc)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        self.title = pc.title
        self._file_links = []
        for link in pc.files:
            link_title, link_details, _ = link.text.split("\n")
            # link_details will be something like '461 kb | Updated 06-11-2019 | 373 Downloads'
            link_date = link_details.split("|")[1][10:-1]
            link_link = link.find_element_by_xpath(".//a").get_attribute("href")
            self._file_links.append(FileLink(link_title, link_date, link_link))


        self.old_download_dir = os.path.join(base_dir, self.title)
        self.download_dir = os.path.join(base_dir, "{} - {}".format(self.thing_id, self.title))

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            if os.path.exists(self.old_download_dir):
                logging.info("Found previous style download directory. Moving it")
                os.rename(self.old_download_dir, self.download_dir)
            else:
                # Not yet downloaded
                self._parsed = True
                return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                # add the .split(' ')[0] to remove the timestamp from the old style timestamps
                self.last_time = timestamp_handle.readlines()[0].split(' ')[0]
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._needs_download = True
            self._parsed = True
            return

        # OK, so we have a timestamp, let's see if there is anything new to get
        for file_link in self._file_links:
            if file_link.last_update > self.last_time:
                logging.info(
                    "Found new/updated file {}".format(file_link.name))
                self._needs_download = True
                self._parsed = True
                return

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing.
            Returns a State value indicating whether the thing is now downloaded
            (not whether this particular call downloaded anything).
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            print("{} - {} already downloaded - skipping.".format(self.thing_id, self.title))
            return State.ALREADY_DOWNLOADED

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                logging.warning(
                    "Old style download dir found for {}".format(self.title))
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                prev_dir = "{}_{}".format(self.download_dir, slugify(self.last_time))
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            new_last_time = new_file_links[0].last_update

            for file_link in new_file_links:
                new_last_time = max(new_last_time, file_link.last_update)
            logging.debug("New timestamp will be {}".format(new_last_time))
        else:
            new_last_time = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    new_last_time = max(new_last_time, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w') as fl_handle:
            for fl in self._file_links:
                base_link = fl.link
                try:
                    # Follow the redirect by hand so we record the direct download URL.
                    fl.link = requests.get(fl.link, allow_redirects=False).headers['location']
                except Exception:
                    logging.warning("Unable to get actual target for {}".format(base_link))

                fl_handle.write("{},{},{},{}\n".format(fl.link, fl.name, fl.last_update, base_link))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link.name)
            new_file = os.path.join(self.download_dir, file_link.name)
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(file_link.name))
                new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = os.path.join(self.download_dir, file_link.name)
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = requests.get(file_link.link)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED

        """
        # People like images. But this doesn't work yet.
        image_dir = os.path.join(self.download_dir, 'images')
        imagelinks = soup.find_all('span', {'class': 'gallery-slider'})[0] \
                         .find_all('div', {'class': 'gallery-photo'})
        logging.info("Downloading {} images.".format(len(imagelinks)))
        try:
            os.mkdir(image_dir)
            for imagelink in imagelinks:
                url = next(filter(None, [imagelink[x] for x in ['data-full',
                                                                'data-large',
                                                                'data-medium',
                                                                'data-thumb']]), None)
                if not url:
                    logging.warning(
                        "Unable to find any urls for {}".format(imagelink))
                    continue

                filename = os.path.basename(url)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(url)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED

        # instructions are good too.
        logging.info("Downloading readme")
        try:
            readme_txt = soup.find('meta', property='og:description')[
                'content']
            with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        # Best get some licenses
        logging.info("Downloading license")
        try:
            license_txt = soup.find('div', {'class': 'license-text'}).text
            if license_txt:
                with open(os.path.join(self.download_dir, 'license.txt'), 'w') as license_handle:
                    license_handle.write("{}\n".format(license_txt))
        except AttributeError as exception:
            logging.warning("No license? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))
        """
        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
        return State.OK


def do_batch(batch_file, download_dir, quick):
    """ Read a file line by line, parsing each line as an instruction for this script."""
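    # Expected batch file format, one instruction per line (the <...>
    # placeholders are illustrative):
    #   thing <thing_id>
    #   collection <user> <collection_name>
    #   user <user>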
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")


def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space separated list of the name(s) of the collection(s) to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)


    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()


    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick)

    # Stop the downloader processes
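    # One None per downloader acts as a shutdown sentinel; Downloader.run
    # exits its loop when it pulls a None off the queue.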
    for downloader in downloaders:
        thing_queue.put(None)

if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()