| 1 | #!/usr/bin/env python3 |
| 2 | """ |
| 3 | Thingiverse bulk downloader |
| 4 | """ |
| 5 | |
| 6 | import re |
| 7 | import sys |
| 8 | import os |
| 9 | import argparse |
| 10 | import unicodedata |
| 11 | import requests |
| 12 | from shutil import copyfile |
| 13 | from bs4 import BeautifulSoup |
| 14 | |
| 15 | URL_BASE = "https://www.thingiverse.com" |
| 16 | URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things" |
| 17 | USER_COLLECTION = URL_BASE + "/ajax/user/designs" |
| 18 | |
| 19 | ID_REGEX = re.compile(r'"id":(\d*),') |
| 20 | TOTAL_REGEX = re.compile(r'"total":(\d*),') |
| 21 | LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),') |
# This appears to be fixed at 12; if Thingiverse ever changes it, the pagination logic below will break.
| 23 | PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),') |
| 24 | NO_WHITESPACE_REGEX = re.compile(r'[-\s]+') |
| 25 | |
| 26 | VERSION = "0.4.0" |
| 27 | |
| 28 | VERBOSE = False |
| 29 | |
def strip_ws(value):
    """ Replace each run of whitespace and/or hyphens with a single hyphen.

    Note: despite the name, whitespace is replaced (with '-'), not removed.
    Used to build collection URLs from collection names.
    """
    # Same pattern as the module-level NO_WHITESPACE_REGEX; inlined so the
    # function is self-contained.  (re.sub already returns a str, so the
    # old str() wrapper was redundant.)
    return re.sub(r'[-\s]+', '-', value)
| 33 | |
def slugify(value):
    """ Normalise a string for use as a file/directory name.

    Applies NFKD unicode normalisation and drops non-ASCII characters,
    strips anything that is not alphanumeric, underscore, whitespace or
    hyphen, then collapses whitespace/hyphen runs into single hyphens.

    Note: unlike the classic Django slugify this does NOT lowercase the
    input - existing download directories depend on case being preserved.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = re.sub(r'[^\w\s-]', '', value).strip()
    # Same pattern as the module-level NO_WHITESPACE_REGEX; inlined so the
    # function is self-contained.
    return re.sub(r'[-\s]+', '-', value)
| 44 | |
class Grouping:
    """ Holds details of a group of things for download.

    Effectively (although not formally) an abstract base class - use
    Collection or Designs rather than instantiating this directly.
    """
    def __init__(self):
        # Thing ids in this grouping; populated lazily by get().
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Child classes are responsible for filling these in.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle groupings small enough to fit on a single page. """
        page = BeautifulSoup(req.text, features='lxml')
        cards = page.find_all('a', {'class':'card-img-holder'})
        self.things = [card['href'].split(':')[1] for card in cards]

        return self.things

    def get(self):
        """ Retrieve (and cache) the list of thing ids in this grouping. """
        if self.things:
            # Already fetched - use the cached list.
            return self.things

        # Guard against use of the (effectively abstract) base class.
        if not self.url:
            print("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Fetch the grouping's landing page to discover pagination details.
        if VERBOSE:
            print("Querying {}".format(self.url))
        response = requests.get(self.url)
        total_match = TOTAL_REGEX.search(response.text)
        if total_match is None:
            # Small (<13 item) groupings list everything on the first page.
            return self._get_small_grouping(response)
        self.total = total_match.groups()[0]
        self.req_id = ID_REGEX.search(response.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(response.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(response.text).groups()[0]
        parameters = {
            'base_url':self.url,
            'page':'1',
            'per_page':'12',
            'id':self.req_id
        }
        page_number = 1
        while page_number <= self.last_page:
            parameters['page'] = page_number
            page_req = requests.post(self.collection_url, parameters)
            page_soup = BeautifulSoup(page_req.text, features='lxml')
            cards = page_soup.find_all('a', {'class':'card-img-holder'})
            self.things.extend(card['href'].split(':')[1] for card in cards)
            page_number += 1

        return self.things

    def download(self):
        """ Downloads all the things in this grouping. """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError("No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            print("Target directory {} already exists. Assuming a resume."
                  .format(self.download_dir))
        if VERBOSE:
            print("Downloading {} things.".format(self.total))
        for thing in self.things:
            Thing(thing).download(self.download_dir)
| 125 | |
class Collection(Grouping):
    """ A single named collection belonging to a Thingiverse user. """
    def __init__(self, user, name, directory):
        super().__init__()
        self.user = user
        self.name = name
        # Collection pages live at /<user>/collections/<name-with-hyphens>.
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        target = "{}-{}".format(slugify(self.user), slugify(self.name))
        self.download_dir = os.path.join(directory, target)
        self.collection_url = URL_COLLECTION
| 137 | |
class Designs(Grouping):
    """ All of a single Thingiverse user's own designs. """
    def __init__(self, user, directory):
        super().__init__()
        self.user = user
        # Design listings live at /<user>/designs.
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        target = "{} designs".format(slugify(self.user))
        self.download_dir = os.path.join(directory, target)
        self.collection_url = USER_COLLECTION
| 146 | |
class Thing:
    """ An individual design on thingiverse.

    Handles deciding whether the thing has already been downloaded,
    resuming/updating a previous download, and recording a timestamp so
    later runs can detect new files.
    """
    def __init__(self, thing_id):
        self.thing_id = thing_id
        # Timestamp string recorded by the last successful download
        # (contents of timestamp.txt), or None if never downloaded.
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        # Raw HTML of the thing's /files page (set by _parse).
        self.text = None
        # Slugified title, used as the directory name (set by _parse).
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done.

        Fetches the thing's file-listing page, derives self.title and
        self.download_dir under base_dir, and sets self._needs_download
        by comparing the page's file timestamps against the locally
        recorded one.
        """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        req = requests.get(url)
        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')

        self.title = slugify(soup.find_all('h1')[0].text.strip())
        self.download_dir = os.path.join(base_dir, self.title)

        if not os.path.exists(self.download_dir):
            # Not yet downloaded - everything is new.
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were recorded.
            if VERBOSE:
                print("Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                # read().strip() rather than readlines()[0]: an empty
                # timestamp file then yields '' (which forces a
                # redownload below) instead of raising IndexError.
                self.last_time = timestamp_handle.read().strip()
            if VERBOSE:
                print("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            if VERBOSE:
                print("Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp, lets see if there is anything new to get.
        # Timestamps are ISO-8601-style strings, so lexical comparison works.
        file_links = soup.find_all('a', {'class':'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            if VERBOSE:
                print("Checking {} (updated {})".format(file_link["title"], timestamp))
            if timestamp > self.last_time:
                print("Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        print("Found no new files for {}".format(self.title))
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all (new) files for this thing into base_dir. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._needs_download:
            if VERBOSE:
                print("{} already downloaded - skipping.".format(self.title))
            return

        # Get the list of files to download *before* touching the
        # filesystem: if the page yields no parseable file links (e.g. a
        # site layout change), bail out here rather than crashing after
        # having renamed the existing download directory.  (Previously an
        # empty list raised IndexError on a fresh download, or wrote a
        # None timestamp - marking the directory "_failed" - on an update.)
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class':'file-download'})
        if not file_links:
            print("No file links found for {} - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                print("Old style download dir found for {}".format(self.title))
                os.rename(self.download_dir, "{}_old".format(self.download_dir))
            else:
                # Keep the previous download aside so unchanged files can
                # be copied from it instead of re-fetched.
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            new_last_time = file_links[0].find_all('time')[0]['datetime']
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Found file {} from {}".format(file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            # Sort files into already-downloaded vs new/updated, tracking
            # the newest timestamp seen for the new timestamp record.
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Checking {} (updated {})".format(file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        if VERBOSE:
            print("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                if VERBOSE:
                    print("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                print("Unable to find {} in old archive, redownloading".format(file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                if VERBOSE:
                    print("Downloading {} from {} to {}".format(name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            # Broad catch is deliberate: any failure marks the whole
            # directory as failed so a later run can retry it cleanly.
            print("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        if VERBOSE:
            print("Download of {} finished".format(self.title))
| 308 | |
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    parser.add_argument("-d", "--directory", help="Target directory to download into")
    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")

    collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
    collection_parser.add_argument("owner", help="The owner of the collection to get")
    collection_parser.add_argument("collection", help="The name of the collection to get")

    thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")

    user_parser = subparsers.add_parser("user", help="Download all things by a user")
    user_parser.add_argument("user", help="The user to get the designs of")

    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        # No subcommand given - show usage and bail.
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    global VERBOSE
    VERBOSE = args.verbose

    if args.subcommand == "collection":
        collection = Collection(args.owner, args.collection, args.directory)
        print(collection.get())
        collection.download()
    elif args.subcommand == "thing":
        Thing(args.thing).download(args.directory)
    elif args.subcommand == "user":
        designs = Designs(args.user, args.directory)
        print(designs.get())
        designs.download()
    elif args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))

if __name__ == "__main__":
    main()