move failed downloads sideways
[clinton/thingy_grabber.git] / thingy_grabber.py
CommitLineData
975060c9
OM
1#!/usr/bin/env python3
2"""
3Thingiverse bulk downloader
4"""
5
6import re
7import os
8import argparse
9import unicodedata
10import requests
11from bs4 import BeautifulSoup
12
13URL_BASE = "https://www.thingiverse.com"
14URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
15
16ID_REGEX = re.compile(r'"id":(\d*),')
17TOTAL_REGEX = re.compile(r'"total":(\d*),')
18LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
19# This appears to be fixed at 12, but if it changes would screw the rest up.
20PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
dd8c35f4
OM
21NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
22
23VERBOSE = False
24
def strip_ws(value):
    """Collapse every run of whitespace or hyphens in *value* into one hyphen."""
    # Same pattern as NO_WHITESPACE_REGEX: any run of [-\s] becomes '-'.
    collapsed = re.sub(r'[-\s]+', '-', value)
    return str(collapsed)
975060c9
OM
28
def slugify(value):
    """
    Normalize *value* into a filesystem-safe slug: ASCII-fold accented
    characters, drop non-alphanumerics, and collapse whitespace runs
    into single hyphens.

    NOTE: unlike Django's slugify this does NOT lowercase, so existing
    download directories keep their exact names across runs.
    """
    # ASCII-fold: NFKD splits accents into combining marks, which the
    # ascii/'ignore' encode round-trip then discards.
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    # Keep only word characters, whitespace and hyphens, trimming the ends.
    value = re.sub(r'[^\w\s-]', '', value).strip()
    # Collapse whitespace/hyphen runs to single hyphens (re.sub returns str,
    # so no extra str() wrapping is needed).
    return re.sub(r'[-\s]+', '-', value)
39
class Collection:
    """ Holds details of a collection.

    Lazily scrapes a Thingiverse user's collection page (and its AJAX
    pagination endpoint) to build the list of thing ids, then downloads
    each thing into a per-collection directory.
    """
    def __init__(self, user, name):
        # Thingiverse username that owns the collection.
        self.user = user
        # Human-readable collection name (may contain spaces).
        self.name = name
        # Thing ids found so far; filled lazily by get_collection().
        self.things = []
        # Total thing count as scraped from the page (regex capture, str).
        self.total = 0
        # Internal collection id required by the AJAX list endpoint.
        self.req_id = None
        # Index of the last AJAX page (int); 0 until queried.
        self.last_page = 0
        # Page size reported by the site (str); appears fixed at 12.
        self.per_page = None

    def _get_small_collection(self, req):
        """ Handle small collections """
        # Collections under 13 things render every card in the initial
        # page response, so scrape the links straight from this request.
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class':'card-img-holder'})
        # hrefs end in ".../thing:<id>"; keep the id after the colon.
        self.things = [x['href'].split(':')[1] for x in links]

        return self.things

    def get_collection(self):
        """ retrieve the things of the collection. """
        if self.things:
            # We've already done it.
            return self.things

        # Get the internal details of the collection.
        c_url = "{}/{}/collections/{}".format(URL_BASE, self.user, strip_ws(self.name))
        if VERBOSE:
            print("Querying {}".format(c_url))
        c_req = requests.get(c_url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items collection. Pull the list from this req.
            return self._get_small_collection(c_req)
        # Pagination metadata is embedded as JSON-ish text in the page body.
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url':"{}/collections/{}".format(self.user, self.name),
            'page':'1',
            'per_page':'12',
            'id':self.req_id
        }
        # Walk every AJAX page, accumulating the thing ids from each.
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(URL_COLLECTION, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class':'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get_collection()
        base_dir = os.getcwd()
        # Target directory is "<user>-<collection>", slugified for safety.
        new_dir = "{}-{}".format(slugify(self.user), slugify(self.name))
        target_dir = os.path.join(base_dir, new_dir)
        try:
            os.mkdir(target_dir)
        except FileExistsError:
            # Existing directory is treated as a resumed earlier run.
            print("Target directory {} already exists. Assuming a resume.".format(new_dir))
        # NOTE(review): the process cwd stays inside target_dir afterwards;
        # download_thing() relies on being run from here.
        os.chdir(target_dir)
        for thing in self.things:
            download_thing(thing)
107
108
def download_thing(thing):
    """ Downloads all the files for a given thing.

    Creates a directory named after the thing's slugified title in the
    current working directory and saves every file listed on the thing's
    files page into it.  If any file fails to download, the directory is
    renamed "<title>_failed" so a later resume will retry it.
    """
    file_url = "{}/thing:{}/files".format(URL_BASE, thing)
    file_req = requests.get(file_url)
    file_soup = BeautifulSoup(file_req.text, features='lxml')

    title = slugify(file_soup.find_all('h1')[0].text.strip())
    try:
        os.mkdir(title)
    except FileExistsError:
        # An existing directory means a previous run already fetched this.
        print("Directory for {} ({}) already exists, skipping".format(thing, title))
        return
    print("Downloading {} ({})".format(thing, title))

    file_links = file_soup.find_all('a', {'class':'file-download'})
    files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in file_links]

    name = None  # so the failure message is defined even on an early error
    try:
        for url, name in files:
            data_req = requests.get(url)
            # Treat HTTP errors (404/500/...) as failures instead of
            # silently saving the error page's body as the model file.
            data_req.raise_for_status()
            # Write by joined path rather than chdir-ing into the
            # directory, so the process cwd can never be left dangling.
            with open(os.path.join(title, name), 'wb') as handle:
                handle.write(data_req.content)
    except Exception as exception:
        print("Failed to download {} - {}".format(name, exception))
        # Move the partial download sideways so a resume retries it.
        os.rename(title, "{}_failed".format(title))
        return
140
def main():
    """ Entry point for script being run as a command. """
    global VERBOSE

    argparser = argparse.ArgumentParser()
    argparser.add_argument("owner", help="The owner of the collection to get")
    argparser.add_argument("collection", help="The name of the collection to get")
    argparser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    parsed = argparser.parse_args()
    VERBOSE = parsed.verbose

    # Fetch the thing list, echo it, then pull everything down.
    target = Collection(parsed.owner, parsed.collection)
    print(target.get_collection())
    target.download()


if __name__ == "__main__":
    main()