#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
import multiprocessing
import enum
from shutil import copyfile
from bs4 import BeautifulSoup

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes it would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

DOWNLOADER_COUNT = 1
RETRY_COUNT = 3

VERSION = "0.7.0"

class State(enum.Enum):
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()


def strip_ws(value):
    """ Remove whitespace from a string """
    return str(NO_WHITESPACE_REGEX.sub('-', value))


def slugify(value):
    """
    Normalizes a string: converts to ASCII, drops characters that are not
    alphanumerics, underscores, hyphens or whitespace, and converts runs of
    whitespace and hyphens to single hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value

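# For example (a quick illustration, not part of the original source):
#   slugify("Cool Thing!  v2") -> "Cool-Thing-v2"
# Case is preserved; only punctuation is dropped and whitespace collapsed.
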
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """

    def __init__(self, thing_queue, download_directory):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory

    def run(self):
        """ Actual download loop. """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                logging.info("Shutting down download queue")
                self.thing_queue.task_done()
                break
            logging.info("Handling id {}".format(thing_id))
            Thing(thing_id).download(self.download_directory)
            self.thing_queue.task_done()
        return


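# A minimal sketch of how a Downloader is driven (this mirrors what main()
# does; the directory and thing id below are placeholders):
#   queue = multiprocessing.JoinableQueue()
#   Downloader(queue, "/tmp/things").start()
#   queue.put("4711")   # thing id to download
#   queue.put(None)     # sentinel telling the worker to shut down
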
class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self, quick):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small grouping (<13 items). Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {}".format(idx))
            RC = Thing(thing).download(self.download_dir)
            if self.quick and RC == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return


class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION


class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION


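# A minimal programmatic sketch (user, collection and directory names are
# placeholders, not taken from the project itself):
#   Collection("some_user", "favourites", "/tmp/things", quick=False).download()
#   Designs("some_user", "/tmp/things", quick=True).download()
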
class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            req = requests.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')
        try:
            self.title = slugify(soup.find_all('h1')[0].text.strip())
        except IndexError:
            logging.warning(
                "No title found for thing {}".format(self.thing_id))
            self.title = self.thing_id

        if req.status_code == 404:
            logging.warning(
                "404 for thing {} - DMCA or invalid number?".format(self.thing_id))
            return

        if req.status_code > 299:
            logging.warning(
                "bad status code {} for thing {} - try again later?".format(req.status_code, self.thing_id))
            return

        self.download_dir = os.path.join(base_dir, self.title)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            # Not yet downloaded
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were recorded.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp; let's see if there is anything new to get.
        file_links = soup.find_all('a', {'class': 'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            logging.debug("Checking {} (updated {})".format(
                file_link["title"], timestamp))
            if timestamp > self.last_time:
                logging.info(
                    "Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing.
            Returns a State: OK if the thing is now present on disk (even if
            nothing new needed fetching), ALREADY_DOWNLOADED if it was skipped
            as already up to date, or FAILED on error.
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            print("{} - {} already downloaded - skipping.".format(
                self.thing_id, self.title))
            return State.ALREADY_DOWNLOADED

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                logging.warning(
                    "Old style download dir found for {}".format(self.title))
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            try:
                new_last_time = file_links[0].find_all('time')[0]['datetime']
            except (IndexError, KeyError) as exception:
                # No files (or no datetime) found on the page - nothing to time-stamp.
                logging.warning("Unable to find a file timestamp for thing {}: {}"
                                .format(self.thing_id, exception))

            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Found file {} from {}".format(
                    file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Checking {} (updated {})".format(
                    file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"])
                 for x in new_file_links]
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(file_links)))
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                logging.debug("Downloading {} from {} to {}".format(
                    name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED

        # People like images
        image_dir = os.path.join(self.download_dir, 'images')
        imagelinks = soup.find_all('span', {'class': 'gallery-slider'})[0] \
                         .find_all('div', {'class': 'gallery-photo'})
        logging.info("Downloading {} images.".format(len(imagelinks)))
        try:
            os.mkdir(image_dir)
            for imagelink in imagelinks:
                url = next(filter(None, [imagelink[x] for x in ['data-full',
                                                                'data-large',
                                                                'data-medium',
                                                                'data-thumb']]), None)
                if not url:
                    logging.warning(
                        "Unable to find any urls for {}".format(imagelink))
                    continue

                filename = os.path.basename(url)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(url)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED

        # instructions are good too.
        logging.info("Downloading readme")
        try:
            readme_txt = soup.find('meta', property='og:description')[
                'content']
            with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        # Best get some licenses
        logging.info("Downloading license")
        try:
            license_txt = soup.find('div', {'class': 'license-text'}).text
            if license_txt:
                with open(os.path.join(self.download_dir, 'license.txt'), 'w') as license_handle:
                    license_handle.write("{}\n".format(license_txt))
        except AttributeError as exception:
            logging.warning("No license? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
        return State.OK


def do_batch(batch_file, download_dir, quick):
    """ Read a file in line by line, parsing each as a set of calls to this script."""
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")


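# An example batch file for the "batch" subcommand might look like this
# (ids and names are placeholders; note that line.split() means collection
# names cannot contain spaces here):
#
#   thing 4711
#   collection some_user favourites
#   user some_user
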
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space separated list of the name(s) of the collection(s) to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick)

    # Stop the downloader processes
    for downloader in downloaders:
        thing_queue.put(None)


if __name__ == "__main__":
    main()