add thing downloading
[clinton/thingy_grabber.git] / thingy_grabber.py
1 #!/usr/bin/env python3
2 """
3 Thingiverse bulk downloader
4 """
5
6 import re
7 import sys
8 import os
9 import argparse
10 import unicodedata
11 import requests
12 from bs4 import BeautifulSoup
13
14 URL_BASE = "https://www.thingiverse.com"
15 URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
16
17 ID_REGEX = re.compile(r'"id":(\d*),')
18 TOTAL_REGEX = re.compile(r'"total":(\d*),')
19 LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
20 # This appears to be fixed at 12, but if it changes would screw the rest up.
21 PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
22 NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
23
24 VERBOSE = False
25
def strip_ws(value):
    """Collapse each run of whitespace and/or hyphens in *value* into a
    single hyphen.

    Used to turn a collection's display name into its URL form.
    Note: re.sub already returns str, so no extra str() cast is needed.
    """
    return re.sub(r'[-\s]+', '-', value)
29
def slugify(value):
    """Normalize *value* into a filesystem-safe slug.

    Accented characters are ASCII-folded (NFKD + ignore), characters other
    than word chars / whitespace / hyphens are dropped, and whitespace runs
    become single hyphens.  Case is preserved (the previous docstring
    claimed lowercasing, but the code never did it, and existing download
    directories depend on the current casing).
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = re.sub(r'[^\w\s-]', '', value).strip()
    return re.sub(r'[-\s]+', '-', value)
40
class Collection:
    """ Holds details of a Thingiverse collection and downloads its things. """

    def __init__(self, user, name):
        """
        :param user: Thingiverse user name owning the collection.
        :param name: display name of the collection.
        """
        self.user = user
        self.name = name
        self.things = []      # thing IDs (strings); filled lazily by get_collection()
        self.total = 0        # item count reported by Thingiverse
        self.req_id = None    # internal collection id needed by the ajax endpoint
        self.last_page = 0    # number of ajax pages to fetch
        self.per_page = None  # page size reported by the site (expected to be 12)

    def _get_small_collection(self, req):
        """ Handle small (<13 item) collections: everything is on one page. """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        # hrefs look like ".../thing:12345" - keep only the numeric id.
        self.things = [x['href'].split(':')[1] for x in links]

        return self.things

    def get_collection(self):
        """ Retrieve and cache the list of thing IDs in the collection. """
        if self.things:
            # We've already done it.
            return self.things

        # Get the internal details of the collection.
        c_url = "{}/{}/collections/{}".format(URL_BASE, self.user, strip_ws(self.name))
        if VERBOSE:
            print("Querying {}".format(c_url))
        # Bug fix: time out instead of hanging forever on a dead server.
        c_req = requests.get(c_url, timeout=60)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items collection. Pull the list from this req.
            return self._get_small_collection(c_req)
        # Keep the numeric fields as ints (the regex captures are text).
        self.total = int(total.groups()[0])
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = int(PER_PAGE_REGEX.search(c_req.text).groups()[0])
        parameters = {
            # NOTE(review): c_url above strips whitespace from the name but
            # base_url here does not - presumably the endpoint only keys on
            # 'id'; confirm before changing.
            'base_url': "{}/collections/{}".format(self.user, self.name),
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(URL_COLLECTION, parameters, timeout=60)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Download every thing in the collection into ./<user>-<name>/. """
        if not self.things:
            self.get_collection()
        base_dir = os.getcwd()
        new_dir = "{}-{}".format(slugify(self.user), slugify(self.name))
        target_dir = os.path.join(base_dir, new_dir)
        try:
            os.mkdir(target_dir)
        except FileExistsError:
            print("Target directory {} already exists. Assuming a resume.".format(new_dir))
        os.chdir(target_dir)
        try:
            for thing in self.things:
                download_thing(thing)
        finally:
            # Bug fix: restore the original working directory; previously the
            # process was left inside the collection directory.
            os.chdir(base_dir)
108
109
def download_thing(thing):
    """Download all the files for a given thing ID.

    Files go into a subdirectory named after the thing's slugified title.
    A timestamp.txt in that directory records the newest file time seen, so
    a rerun only fetches files updated since.  On any download failure the
    directory is renamed <title>_failed and the function returns.
    """
    file_url = "{}/thing:{}/files".format(URL_BASE, thing)
    # Bug fix: time out instead of hanging forever on a dead server.
    file_req = requests.get(file_url, timeout=60)
    file_soup = BeautifulSoup(file_req.text, features='lxml')

    title = slugify(file_soup.find_all('h1')[0].text.strip())
    base_dir = os.getcwd()
    try:
        os.mkdir(title)
    except FileExistsError:
        pass

    print("Downloading {} ({})".format(thing, title))
    os.chdir(title)

    last_time = None
    try:
        with open('timestamp.txt', 'r') as timestamp_handle:
            last_time = timestamp_handle.readlines()[0]
        if VERBOSE:
            print("last downloaded version: {}".format(last_time))
    except FileNotFoundError:
        # Not run on this thing before.
        # Bug fix: the old message claimed we were checking for an update;
        # a missing timestamp actually means a first download.
        if VERBOSE:
            print('No timestamp found - first download of this thing.')

    file_links = file_soup.find_all('a', {'class': 'file-download'})
    new_last_time = last_time
    new_file_links = []

    for file_link in file_links:
        timestamp = file_link.find_all('time')[0]['datetime']
        if VERBOSE:
            print("Checking {} (updated {})".format(file_link["title"], timestamp))
        # ISO-8601 timestamps compare correctly as plain strings.
        if not last_time or timestamp > last_time:
            new_file_links.append(file_link)
        if not new_last_time or timestamp > new_last_time:
            new_last_time = timestamp

    if last_time and new_last_time <= last_time:
        # Bug fix: return here; previously we fell through and rewrote
        # timestamp.txt even though nothing was downloaded.
        print("Thing already downloaded. Skipping.")
        os.chdir(base_dir)
        return

    files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]

    name = None  # keeps the failure message sane if we fail before the first file
    try:
        for url, name in files:
            if VERBOSE:
                print("Downloading {} from {}".format(name, url))
            data_req = requests.get(url, timeout=60)
            with open(name, 'wb') as handle:
                handle.write(data_req.content)
        # now write timestamp
        # Bug fix: a thing with no files has new_last_time == None, and
        # writing None used to raise TypeError and mark the thing _failed.
        if new_last_time:
            with open('timestamp.txt', 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
    except Exception as exception:
        print("Failed to download {} - {}".format(name, exception))
        os.chdir(base_dir)
        os.rename(title, "{}_failed".format(title))
        return

    os.chdir(base_dir)
173
def main():
    """ Entry point for script being run as a command. """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    commands = arg_parser.add_subparsers(help="Type of thing to download", dest="subcommand")

    coll_cmd = commands.add_parser('collection', help="Download an entire collection")
    coll_cmd.add_argument("owner", help="The owner of the collection to get")
    coll_cmd.add_argument("collection", help="The name of the collection to get")

    single_cmd = commands.add_parser('thing', help="Download a single thing.")
    single_cmd.add_argument("thing", help="Thing ID to download")

    args = arg_parser.parse_args()
    if not args.subcommand:
        # No subcommand given: show usage and bail out.
        arg_parser.print_help()
        sys.exit(1)

    global VERBOSE
    VERBOSE = args.verbose

    if args.subcommand.startswith("collection"):
        collection = Collection(args.owner, args.collection)
        print(collection.get_collection())
        collection.download()
    elif args.subcommand == "thing":
        download_thing(args.thing)
197
198
# Run as a script: parse the command line and dispatch.
if __name__ == "__main__":
    main()