#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""
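# Example invocations (the subcommands are defined in main() below; the
# <...> placeholders are illustrative, not literal values):
#   thingy_grabber.py thing <thing_id> [<thing_id> ...]
#   thingy_grabber.py collection <owner> <collection_name> [<collection_name> ...]
#   thingy_grabber.py user <username> [<username> ...]
#   thingy_grabber.py batch <batch_file>
#   thingy_grabber.py version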

import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
import multiprocessing
import enum
from shutil import copyfile
from bs4 import BeautifulSoup
from dataclasses import dataclass
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

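# Regexes for pulling pagination metadata out of the JSON embedded in
# collection/design listing pages (used by Grouping.get below).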
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes it would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

DOWNLOADER_COUNT = 1
RETRY_COUNT = 3

VERSION = "0.7.0"


#BROWSER = webdriver.PhantomJS('./phantomjs')
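# The redesigned Thingiverse thing pages are rendered client-side, so pages
# are loaded in a real Firefox instance via Selenium and scraped only once
# the content has appeared (see PageChecker below).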
options = Options()
BROWSER = webdriver.Firefox(options=options)

BROWSER.set_window_size(1980, 1080)


@dataclass
class FileLink:
    name: str
    last_update: str
    link: str


class State(enum.Enum):
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()


def strip_ws(value):
    """ Remove whitespace from a string """
    return str(NO_WHITESPACE_REGEX.sub('-', value))


def slugify(value):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value

class PageChecker(object):
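    """ Expected-condition callable for WebDriverWait: returns True once the
        thing title, the file count, and all of the file rows are present in
        the rendered page, so the file list can be scraped.
    """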
    def __init__(self):
        self.log = []
        self.title = None
        self.file_count = None
        self.files = None


    def __call__(self, _):
        try:
            self.log.append("call")
            if self.title is None:
                # first find the name
                name = EC._find_element(BROWSER, (By.CSS_SELECTOR, "[class^=ThingPage__modelName]"))
                if name is None:
                    return False
                self.title = name.text

            if self.file_count is None:
                # OK. Do we know how many files we have to download?
                metrics = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=MetricButton]"))
                self.log.append("got some metrics: {}".format(len(metrics)))
                cur_count = int([x.text.split("\n")[0] for x in metrics if x.text.endswith("\nThing Files")][0])
                self.log.append(cur_count)
                if cur_count == 0:
                    return False
                self.file_count = cur_count

            self.log.append("looking for {} files".format(self.file_count))
            fileRows = EC._find_elements(BROWSER, (By.CSS_SELECTOR, "[class^=ThingFile__fileRow]"))
            self.log.append("found {} files".format(len(fileRows)))
            if len(fileRows) >= self.file_count:
                self.files = fileRows
                return True
            return False
        except Exception:
            return False


class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """

    def __init__(self, thing_queue, download_directory):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory

    def run(self):
        """ actual download loop.
        """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                logging.info("Shutting down download queue")
                self.thing_queue.task_done()
                break
            logging.info("Handling id {}".format(thing_id))
            Thing(thing_id).download(self.download_directory)
            self.thing_queue.task_done()
        return


class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self, quick):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {}".format(idx))
            RC = Thing(thing).download(self.download_dir)
            if self.quick and RC == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return


class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION


class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION


class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            BROWSER.get(url)
            wait = WebDriverWait(BROWSER, 20)
            pc = PageChecker()
            wait.until(pc)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        self.title = pc.title
        self._file_links = []
        for link in pc.files:
            link_title, link_details, _ = link.text.split("\n")
            # link_details will be something like '461 kb | Updated 06-11-2019 | 373 Downloads'
            link_date = link_details.split("|")[1][10:-1]
            link_link = link.find_element_by_xpath(".//a").get_attribute("href")
            self._file_links.append(FileLink(link_title, link_date, link_link))


        self.old_download_dir = os.path.join(base_dir, self.title)
        self.download_dir = os.path.join(base_dir, "{} - {}".format(self.thing_id, self.title))

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            if os.path.exists(self.old_download_dir):
                logging.info("Found previous style download directory. Moving it")
                os.rename(self.old_download_dir, self.download_dir)
            else:
                # Not yet downloaded
                self._parsed = True
                return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                # add the .split(' ')[0] to remove the timestamp from the old style timestamps
                self.last_time = timestamp_handle.readlines()[0].split(' ')[0]
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._needs_download = True
            self._parsed = True
            return

        # OK, so we have a timestamp, let's see if there is anything new to get
        for file_link in self._file_links:
            if file_link.last_update > self.last_time:
                logging.info(
                    "Found new/updated file {}".format(file_link.name))
                self._needs_download = True
                self._parsed = True
                return

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing.
            Returns a State value indicating whether the thing is now downloaded
            (not whether this particular call downloaded anything).
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            print("{} - {} already downloaded - skipping.".format(self.thing_id, self.title))
            return State.ALREADY_DOWNLOADED

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                logging.warning(
                    "Old style download dir found for {}".format(self.title))
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                prev_dir = "{}_{}".format(self.download_dir, slugify(self.last_time))
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            new_last_time = new_file_links[0].last_update

            for file_link in new_file_links:
                new_last_time = max(new_last_time, file_link.last_update)
            logging.debug("New timestamp will be {}".format(new_last_time))
        else:
            new_last_time = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    new_last_time = max(new_last_time, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w') as fl_handle:
            for fl in self._file_links:
                base_link = fl.link
                try:
                    # Follow the redirect by hand so we record the direct download URL.
                    fl.link = requests.get(fl.link, allow_redirects=False).headers['location']
                except Exception:
                    logging.warning("Unable to get actual target for {}".format(base_link))

                fl_handle.write("{},{},{},{}\n".format(fl.link, fl.name, fl.last_update, base_link))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link.name)
            new_file = os.path.join(self.download_dir, file_link.name)
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(file_link.name))
                new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = os.path.join(self.download_dir, file_link.name)
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = requests.get(file_link.link)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED

        """
        # People like images. But this doesn't work yet.
        image_dir = os.path.join(self.download_dir, 'images')
        imagelinks = soup.find_all('span', {'class': 'gallery-slider'})[0] \
                         .find_all('div', {'class': 'gallery-photo'})
        logging.info("Downloading {} images.".format(len(imagelinks)))
        try:
            os.mkdir(image_dir)
            for imagelink in imagelinks:
                url = next(filter(None, [imagelink[x] for x in ['data-full',
                                                                'data-large',
                                                                'data-medium',
                                                                'data-thumb']]), None)
                if not url:
                    logging.warning(
                        "Unable to find any urls for {}".format(imagelink))
                    continue

                filename = os.path.basename(url)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(url)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED

        # instructions are good too.
        logging.info("Downloading readme")
        try:
            readme_txt = soup.find('meta', property='og:description')[
                'content']
            with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        # Best get some licenses
        logging.info("Downloading license")
        try:
            license_txt = soup.find('div', {'class': 'license-text'}).text
            if license_txt:
                with open(os.path.join(self.download_dir, 'license.txt'), 'w') as license_handle:
                    license_handle.write("{}\n".format(license_txt))
        except AttributeError as exception:
            logging.warning("No license? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))
        """
        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
        return State.OK


def do_batch(batch_file, download_dir, quick):
    """ Read a file line by line, parsing each line as an instruction for this script."""
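    # Expected batch file format, one instruction per line (the <...>
    # placeholders are illustrative):
    #   thing <thing_id>
    #   collection <user> <collection_name>
    #   user <user>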
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")


def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space separated list of the name(s) of the collection(s) to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)


    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()


    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick)

    # Stop the downloader processes
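    # One None per downloader acts as a shutdown sentinel; Downloader.run
    # exits its loop when it pulls a None off the queue.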
    for downloader in downloaders:
        thing_queue.put(None)

if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()