thingy_grabber.py

   1 #!/usr/bin/env python3
   2 """
   3 Thingiverse bulk downloader
   4 """
   5
   6 import re
   7 import os
   8 import argparse
   9 import unicodedata
  10 import requests
  11 from bs4 import BeautifulSoup
  12
  13 URL_BASE = "https://www.thingiverse.com"
  14 URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
  15
  16 ID_REGEX = re.compile(r'"id":(\d*),')
  17 TOTAL_REGEX = re.compile(r'"total":(\d*),')
  18 LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
  19 # This appears to be fixed at 12, but if it changes would screw the rest up.
  20 PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
  21 NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
  22
  23 VERBOSE = False
  24
  25 def strip_ws(value):
  26     """ Remove whitespace from a string """
  27     return str(NO_WHITESPACE_REGEX.sub('-', value))
  28
  29 def slugify(value):
  30     """
  31     Normalizes string, converts to lowercase, removes non-alpha characters,
  32     and converts spaces to hyphens.
  33     """
  34     value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
  35     value = str(re.sub(r'[^\w\s-]', '', value).strip())
  36     value = str(NO_WHITESPACE_REGEX.sub('-', value))
  37     #value = str(re.sub(r'[-\s]+', '-', value))
  38     return value
  39
  40 class Collection:
  41     """ Holds details of a collection. """
  42     def __init__(self, user, name):
  43         self.user = user
  44         self.name = name
  45         self.things = []
  46         self.total = 0
  47         self.req_id = None
  48         self.last_page = 0
  49         self.per_page = None
  50
  51     def _get_small_collection(self, req):
  52         """ Handle small collections """
  53         soup = BeautifulSoup(req.text, features='lxml')
  54         links = soup.find_all('a', {'class':'card-img-holder'})
  55         self.things = [x['href'].split(':')[1] for x in links]
  56
  57         return self.things
  58
  59     def get_collection(self):
  60         """ retrieve the things of the collection. """
  61         if self.things:
  62             # We've already done it.
  63             return self.things
  64
  65         # Get the internal details of the collection.
  66         c_url = "{}/{}/collections/{}".format(URL_BASE, self.user, strip_ws(self.name))
  67         if VERBOSE:
  68             print("Querying {}".format(c_url))
  69         c_req = requests.get(c_url)
  70         total = TOTAL_REGEX.search(c_req.text)
  71         if total is None:
  72             # This is a small (<13) items collection. Pull the list from this req.
  73             return self._get_small_collection(c_req)
  74         self.total = total.groups()[0]
  75         self.req_id = ID_REGEX.search(c_req.text).groups()[0]
  76         self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
  77         self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
  78         parameters = {
  79             'base_url':"{}/collections/{}".format(self.user, self.name),
  80             'page':'1',
  81             'per_page':'12',
  82             'id':self.req_id
  83         }
  84         for current_page in range(1, self.last_page + 1):
  85             parameters['page'] = current_page
  86             req = requests.post(URL_COLLECTION, parameters)
  87             soup = BeautifulSoup(req.text, features='lxml')
  88             links = soup.find_all('a', {'class':'card-img-holder'})
  89             self.things += [x['href'].split(':')[1] for x in links]
  90
  91         return self.things
  92
  93     def download(self):
  94         """ Downloads all the files in a collection """
  95         if not self.things:
  96             self.get_collection()
  97         base_dir = os.getcwd()
  98         new_dir = "{}-{}".format(slugify(self.user), slugify(self.name))
  99         target_dir = os.path.join(base_dir, new_dir)
 100         try:
 101             os.mkdir(target_dir)
 102         except FileExistsError:
 103             print("Target directory {} already exists. Assuming a resume.".format(new_dir))
 104         os.chdir(target_dir)
 105         for thing in self.things:
 106             download_thing(thing)
 107
 108
 109 def download_thing(thing):
 110     """ Downloads all the files for a given thing. """
 111     file_url = "{}/thing:{}/files".format(URL_BASE, thing)
 112     file_req = requests.get(file_url)
 113     file_soup = BeautifulSoup(file_req.text, features='lxml')
 114
 115     title = slugify(file_soup.find_all('h1')[0].text.strip())
 116     base_dir = os.getcwd()
 117     try:
 118         os.mkdir(title)
 119     except FileExistsError:
 120         print("Directory for {} ({}) already exists, skipping".format(thing, title))
 121         return
 122     print("Downloading {} ({})".format(thing, title))
 123     os.chdir(title)
 124
 125     file_links = file_soup.find_all('a', {'class':'file-download'})
 126     files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in file_links]
 127
 128     try:
 129         for url, name in files:
 130             data_req = requests.get(url)
 131             with open(name, 'wb') as handle:
 132                 handle.write(data_req.content)
 133     except Exception as exception:
 134         print("Failed to download {} - {}".format(name, exception))
 135         os.chdir(base_dir)
 136         os.rename(title, "{}_failed".format(title))
 137         return
 138
 139     os.chdir(base_dir)
 140
 141 def main():
 142     """ Entry point for script being run as a command. """
 143     parser = argparse.ArgumentParser()
 144     parser.add_argument("owner", help="The owner of the collection to get")
 145     parser.add_argument("collection", help="The name of the collection to get")
 146     parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
 147     args = parser.parse_args()
 148     global VERBOSE
 149     VERBOSE = args.verbose
 150
 151     collection = Collection(args.owner, args.collection)
 152     print(collection.get_collection())
 153     collection.download()
 154
 155 if __name__ == "__main__":
 156     main()