"""
Thingiverse bulk downloader
"""
import argparse
import os
import re
import sys
import unicodedata

import requests
from bs4 import BeautifulSoup
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"

# Regexes for pulling pagination metadata out of the JSON blob embedded in
# the collection page; each captures the integer after the key.
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
# Collapses runs of whitespace and/or hyphens into a single hyphen.
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# Default verbosity; main() overwrites this from the -v flag.
# NOTE(review): the original initialiser for VERBOSE is not visible in this
# extract (it is read by get()/download_thing()); False is the conventional
# default -- confirm against the full file.
VERBOSE = False
def strip_ws(value):
    """ Remove whitespace from a string.

    Runs of whitespace and hyphens are collapsed to a single '-', matching
    the slug form Thingiverse uses in collection URLs.
    NOTE(review): the original `def` line is missing from this extract; the
    signature is reconstructed from the body and the call site
    `strip_ws(self.name)` -- confirm against the full file.
    """
    return str(NO_WHITESPACE_REGEX.sub('-', value))
def slugify(value):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.

    NOTE(review): despite the docstring, the visible code never lowercases;
    behaviour is preserved as-is so directory names stay compatible with
    existing downloads. The `def` line and final `return` are missing from
    this extract and are reconstructed (callers use the return value).
    """
    # Transliterate to ASCII, dropping anything that will not map.
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    # Strip everything but word chars, whitespace and hyphens.
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    # Collapse whitespace/hyphen runs into single hyphens.
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value
class Grouping:
    """ Holds details of a group of things.

    NOTE(review): the `class` line and most of __init__ are missing from this
    extract; the class name is grounded by `class Collection(Grouping)` below,
    and the attributes by their uses in get()/download().
    """

    def __init__(self):
        self.things = []        # thing ids collected by get()
        self.total = None       # total count reported by the ajax endpoint
        self.req_id = None      # grouping id parsed from the page
        self.last_page = None   # last page number for pagination
        self.per_page = None    # page size (observed fixed at 12)
        # These two should be set by child classes.
        self.url = None
        self.download_dir = None
53 def _get_small_grouping(self
, req
):
54 """ Handle small groupings """
55 soup
= BeautifulSoup(req
.text
, features
='lxml')
56 links
= soup
.find_all('a', {'class':'card-img-holder'})
57 self
.things
= [x
['href'].split(':')[1] for x
in links
]
62 """ retrieve the things of the grouping. """
64 # We've already done it.
67 # Check for initialisation:
69 print("No URL set - object not initialised properly?")
70 raise ValueError("No URL set - object not initialised properly?")
72 # Get the internal details of the grouping.
74 print("Querying {}".format(self
.url
))
75 c_req
= requests
.get(self
.url
)
76 total
= TOTAL_REGEX
.search(c_req
.text
)
78 # This is a small (<13) items grouping. Pull the list from this req.
79 return self
._get
_small
_grouping
(c_req
)
80 self
.total
= total
.groups()[0]
81 self
.req_id
= ID_REGEX
.search(c_req
.text
).groups()[0]
82 self
.last_page
= int(LAST_PAGE_REGEX
.search(c_req
.text
).groups()[0])
83 self
.per_page
= PER_PAGE_REGEX
.search(c_req
.text
).groups()[0]
90 for current_page
in range(1, self
.last_page
+ 1):
91 parameters
['page'] = current_page
92 req
= requests
.post(URL_COLLECTION
, parameters
)
93 soup
= BeautifulSoup(req
.text
, features
='lxml')
94 links
= soup
.find_all('a', {'class':'card-img-holder'})
95 self
.things
+= [x
['href'].split(':')[1] for x
in links
]
100 """ Downloads all the files in a collection """
104 if not self
.download_dir
:
105 raise ValueError("No download_dir set - invalidly initialised object?")
107 base_dir
= os
.getcwd()
109 os
.mkdir(self
.download_dir
)
110 except FileExistsError
:
111 print("Target directory {} already exists. Assuming a resume.".format(self
.download_dir
))
112 os
.chdir(self
.download_dir
)
113 for thing
in self
.things
:
114 download_thing(thing
)
class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name):
        Grouping.__init__(self)
        # Reconstructed: these assignments are missing from the extract but
        # are required by the self.user/self.name reads just below.
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(
            os.getcwd(), "{}-{}".format(slugify(self.user), slugify(self.name)))
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user):
        Grouping.__init__(self)
        # Reconstructed: this assignment is missing from the extract but is
        # required by the self.user reads just below.
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            os.getcwd(), "{} designs".format(slugify(self.user)))
def download_thing(thing):
    """ Downloads all the files for a given thing.

    Creates a directory named after the thing's (slugified) title, fetches
    any files newer than the timestamp recorded from the previous run, then
    records the newest file timestamp in timestamp.txt. On failure the
    directory is renamed with a "_failed" suffix so a later run retries it.
    NOTE(review): numerous lines are missing from this extract (the mkdir
    try-body, chdir calls, VERBOSE guards, new_file_links initialiser);
    they are reconstructed from the visible control flow -- confirm against
    the full file.
    """
    file_url = "{}/thing:{}/files".format(URL_BASE, thing)
    file_req = requests.get(file_url)
    file_soup = BeautifulSoup(file_req.text, features='lxml')

    title = slugify(file_soup.find_all('h1')[0].text.strip())
    base_dir = os.getcwd()
    try:
        os.mkdir(title)
    except FileExistsError:
        # Already present - probably a resume; timestamps decide what to fetch.
        pass

    print("Downloading {} ({})".format(thing, title))
    os.chdir(title)

    # Read the timestamp of the previous download, if any.
    last_time = None
    try:
        with open('timestamp.txt', 'r') as timestamp_handle:
            last_time = timestamp_handle.readlines()[0]
        if VERBOSE:
            print("last downloaded version: {}".format(last_time))
    except FileNotFoundError:
        # Not run on this thing before.
        if VERBOSE:
            print('Directory for thing already exists, checking for update.')

    file_links = file_soup.find_all('a', {'class': 'file-download'})
    new_last_time = last_time
    new_file_links = []

    # Keep only the files newer than the last recorded download, and track
    # the newest timestamp seen so we can record it afterwards.
    for file_link in file_links:
        timestamp = file_link.find_all('time')[0]['datetime']
        if VERBOSE:
            print("Checking {} (updated {})".format(file_link["title"], timestamp))
        if not last_time or timestamp > last_time:
            new_file_links.append(file_link)
        if not new_last_time or timestamp > new_last_time:
            new_last_time = timestamp

    if last_time and new_last_time <= last_time:
        print("Thing already downloaded. Skipping.")
        # Fix: bail out here instead of falling through to rewrite an
        # unchanged timestamp; restore the caller's working directory.
        os.chdir(base_dir)
        return

    files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]

    name = None  # so the except clause can report which file failed
    try:
        for url, name in files:
            if VERBOSE:
                print("Downloading {} from {}".format(name, url))
            data_req = requests.get(url)
            with open(name, 'wb') as handle:
                handle.write(data_req.content)
        # now write timestamp
        with open('timestamp.txt', 'w') as timestamp_handle:
            timestamp_handle.write(new_last_time)
    except Exception as exception:
        print("Failed to download {} - {}".format(name, exception))
        # Must leave the directory before renaming it.
        os.chdir(base_dir)
        os.rename(title, "{}_failed".format(title))
        return

    os.chdir(base_dir)
def main():
    """ Entry point for script being run as a command.

    Parses the command line (collection / thing / user subcommands) and
    dispatches to the appropriate downloader.
    NOTE(review): the `def` line, the no-subcommand handling, and the
    Designs get/download calls are missing from this extract and are
    reconstructed -- confirm against the full file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
    collection_parser.add_argument("owner", help="The owner of the collection to get")
    collection_parser.add_argument("collection", help="The name of the collection to get")
    thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")
    user_parser = subparsers.add_parser("user", help="Download all things by a user")
    user_parser.add_argument("user", help="The user to get the designs of")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)

    # Fix: without `global`, this assignment would create a local and the
    # module-level VERBOSE read by get()/download_thing() would never change.
    global VERBOSE
    VERBOSE = args.verbose

    if args.subcommand.startswith("collection"):
        collection = Collection(args.owner, args.collection)
        print(collection.get())
        collection.download()
    if args.subcommand == "thing":
        download_thing(args.thing)
    if args.subcommand == "user":
        designs = Designs(args.user)
        print(designs.get())
        designs.download()
if __name__ == "__main__":
    # Reconstructed: the body of the guard is missing from this extract;
    # calling main() is the conventional (and only sensible) entry point.
    main()