587d47ef9aae203b56d56de9d8f4bea9f717eeaf
[clinton/thingy_grabber.git] / thingy_grabber.py
1 #!/usr/bin/env python3
2 """
3 Thingiverse bulk downloader
4 """
5
6 import re
7 import os
8 import argparse
9 import unicodedata
10 import requests
11 from bs4 import BeautifulSoup
12
URL_BASE = "https://www.thingiverse.com"
# AJAX endpoint used to page through the things of a large collection.
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"

# These regexes scrape values out of the JSON-ish blob embedded in the
# collection page's HTML (the page is treated as text, not parsed as JSON).
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
# Collapses any run of whitespace and/or hyphens into a single hyphen.
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# Module-level verbosity flag, set from the -v option in main().
VERBOSE = False
24
def strip_ws(value):
    """ Remove whitespace from a string """
    # Any run of whitespace (or hyphens) collapses to a single hyphen.
    return re.sub(r'[-\s]+', '-', value)
28
def slugify(value):
    """
    Normalize a string for use as a directory name.

    Applies NFKD normalisation and drops non-ASCII characters, removes
    everything that is not a word character, whitespace or hyphen, then
    collapses whitespace/hyphen runs into single hyphens.

    NOTE(review): the old docstring claimed the result was lowercased, but
    the code never lowercases - case is preserved. Left as-is deliberately:
    lowercasing now would change directory names and break resuming into
    directories created by earlier runs.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    # re.sub already returns str, so the old str() wrappers were redundant.
    value = re.sub(r'[^\w\s-]', '', value).strip()
    return re.sub(r'[-\s]+', '-', value)
39
class Collection:
    """ Holds details of a Thingiverse collection and downloads its things. """

    def __init__(self, user, name):
        self.user = user
        self.name = name
        self.things = []       # thing ids, filled in by get_collection()
        self.total = 0         # total thing count scraped from the page
        self.req_id = None     # internal collection id used by the AJAX endpoint
        self.last_page = 0     # number of result pages to fetch
        self.per_page = None   # things per page as reported by the site (normally 12)

    def _get_small_collection(self, req):
        """ Handle small collections (fewer than one page of things), which
            are fully rendered in the initial page and need no AJAX paging. """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class':'card-img-holder'})
        # hrefs look like "/thing:12345" - keep just the numeric id.
        self.things = [x['href'].split(':')[1] for x in links]

        return self.things

    def get_collection(self):
        """ Retrieve (and cache) the list of thing ids in the collection. """
        if self.things:
            # We've already done it.
            return self.things

        # Get the internal details of the collection.
        c_url = "{}/{}/collections/{}".format(URL_BASE, self.user, strip_ws(self.name))
        if VERBOSE:
            print("Querying {}".format(c_url))
        c_req = requests.get(c_url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items collection. Pull the list from this req.
            return self._get_small_collection(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url':"{}/collections/{}".format(self.user, self.name),
            'page':'1',
            # BUGFIX: use the per_page value the site reported instead of a
            # hard-coded '12' (see the warning comment on PER_PAGE_REGEX).
            'per_page':self.per_page,
            'id':self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(URL_COLLECTION, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class':'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Download every thing in the collection into
            ./<user>-<collection>/, one subdirectory per thing.

            NOTE(review): this chdirs into the target directory and leaves
            the process cwd there on return - confirm no caller relies on
            the cwd being restored. """
        if not self.things:
            self.get_collection()
        base_dir = os.getcwd()
        new_dir = "{}-{}".format(slugify(self.user), slugify(self.name))
        target_dir = os.path.join(base_dir, new_dir)
        try:
            os.mkdir(target_dir)
        except FileExistsError:
            # Existing directory means a previous (possibly partial) run;
            # per-thing timestamps let download_thing() resume correctly.
            print("Target directory {} already exists. Assuming a resume.".format(new_dir))
        os.chdir(target_dir)
        for thing in self.things:
            download_thing(thing)
107
108
def download_thing(thing):
    """ Download all the files for a given thing id into ./<thing-title>/.

    A timestamp.txt in the thing's directory records the newest file
    timestamp seen, so re-runs only fetch files updated since the last
    download. On a failed download the directory is renamed to
    <title>_failed so a later run retries it from scratch.
    """
    file_url = "{}/thing:{}/files".format(URL_BASE, thing)
    file_req = requests.get(file_url)
    file_soup = BeautifulSoup(file_req.text, features='lxml')

    title = slugify(file_soup.find_all('h1')[0].text.strip())
    base_dir = os.getcwd()
    try:
        os.mkdir(title)
    except FileExistsError:
        # Directory left by a previous run - the stored timestamp below
        # decides whether anything actually needs re-downloading.
        pass

    print("Downloading {} ({})".format(thing, title))
    os.chdir(title)
    last_time = None

    try:
        with open('timestamp.txt', 'r') as fh:
            last_time = fh.readlines()[0]
        if VERBOSE:
            print("last downloaded version: {}".format(last_time))
    except FileNotFoundError:
        # Not run on this thing before.
        # BUGFIX: the old verbose message here wrongly said the directory
        # already existed; a missing timestamp means a fresh download.
        if VERBOSE:
            print('No timestamp found, downloading all files.')
        last_time = None

    file_links = file_soup.find_all('a', {'class':'file-download'})
    new_last_time = last_time
    new_file_links = []

    # Keep only files newer than the last recorded download, tracking the
    # newest timestamp seen across all files on the page.
    for file_link in file_links:
        timestamp = file_link.find_all('time')[0]['datetime']
        if VERBOSE:
            print("Checking {} (updated {})".format(file_link["title"], timestamp))
        if not last_time or timestamp > last_time:
            new_file_links.append(file_link)
        if not new_last_time or timestamp > new_last_time:
            new_last_time = timestamp

    if last_time and new_last_time <= last_time:
        # BUGFIX: previously this printed "Skipping" but fell through and
        # rewrote the timestamp anyway; now it genuinely skips.
        print("Thing already downloaded. Skipping.")
        os.chdir(base_dir)
        return

    files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]

    name = None  # keeps the failure message safe if no file was reached
    try:
        for url, name in files:
            if VERBOSE:
                print("Downloading {} from {}".format(name, url))
            data_req = requests.get(url)
            with open(name, 'wb') as handle:
                handle.write(data_req.content)
        # now write timestamp
        # BUGFIX: a thing with no files at all leaves new_last_time as None;
        # guard so we don't try to write None to the file.
        if new_last_time is not None:
            with open('timestamp.txt', 'w') as fh:
                fh.write(new_last_time)
    except Exception as exception:
        print("Failed to download {} - {}".format(name, exception))
        os.chdir(base_dir)
        os.rename(title, "{}_failed".format(title))
        return


    os.chdir(base_dir)
172
def main():
    """ Entry point for script being run as a command. """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("owner", help="The owner of the collection to get")
    arg_parser.add_argument("collection", help="The name of the collection to get")
    arg_parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    options = arg_parser.parse_args()

    # Propagate the verbosity choice to the module-level flag.
    global VERBOSE
    VERBOSE = options.verbose

    # Resolve the collection, show what was found, then download it.
    to_fetch = Collection(options.owner, options.collection)
    print(to_fetch.get_collection())
    to_fetch.download()
# Standard script guard: run only when executed directly, not on import.
if __name__ == "__main__":
    main()