#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
from shutil import copyfile
from bs4 import BeautifulSoup

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"

ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12; parse it anyway and reuse it when paging.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

VERBOSE = False

def strip_ws(value):
    """ Collapse runs of whitespace (and hyphens) into single hyphens. """
    return str(NO_WHITESPACE_REGEX.sub('-', value))

def slugify(value):
    """
    Normalizes the string to ASCII, removes characters that are not
    alphanumerics, underscores, hyphens or whitespace, and converts
    runs of whitespace to single hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value

class Grouping:
    """ Holds details of a group of things.
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """
    def __init__(self):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These two should be set by child classes.
        self.url = None
        self.download_dir = None

    def _get_small_grouping(self, req):
        """ Handle small groupings whose things are all listed on the page itself. """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class':'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]

        return self.things

    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            print("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        if VERBOSE:
            print("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13 items) grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
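        # Larger groupings are paginated: request each page from the AJAX
        # endpoint and collect the thing IDs from the card links on each page.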
        parameters = {
            'base_url':self.url,
            'page':'1',
            'per_page':self.per_page,
            'id':self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(URL_COLLECTION, data=parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class':'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError("No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            print("Target directory {} already exists. Assuming a resume."
                  .format(self.download_dir))
        for thing in self.things:
            Thing(thing).download(self.download_dir)

class Collection(Grouping):
    """ Holds details of a collection. """
    def __init__(self, user, name):
        Grouping.__init__(self)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(os.getcwd(),
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))

class Designs(Grouping):
    """ Holds details of all of a user's designs. """
    def __init__(self, user):
        Grouping.__init__(self)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(os.getcwd(), "{} designs".format(slugify(self.user)))

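# Both groupings can also be driven directly from Python rather than via the
# CLI, for example (hypothetical names):
#   Collection("some_user", "things to print").download()
#   Designs("some_user").download()
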
class Thing:
    """ An individual design on thingiverse. """
    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        req = requests.get(url)
        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')

        self.title = slugify(soup.find_all('h1')[0].text.strip())
        self.download_dir = os.path.join(base_dir, self.title)

        if not os.path.exists(self.download_dir):
            # Not yet downloaded
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were recorded.
            if VERBOSE:
                print("Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            if VERBOSE:
                print("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Timestamp file disappeared between the check and the read.
            if VERBOSE:
                print("Could not read timestamp file. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp, let's see if there is anything new to get.
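        # Note: the datetime strings are compared lexically, which is only
        # reliable while Thingiverse keeps them in a sortable (ISO-8601-style)
        # format.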
        file_links = soup.find_all('a', {'class':'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            if VERBOSE:
                print("Checking {} (updated {})".format(file_link["title"], timestamp))
            if timestamp > self.last_time:
                print("Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        print("Found no new files for {}".format(self.title))
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._needs_download:
            if VERBOSE:
                print("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                print("Old style download dir found for {}".format(self.title))
                os.rename(self.download_dir, "{}_old".format(self.download_dir))
            else:
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class':'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            new_last_time = file_links[0].find_all('time')[0]['datetime']
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Found file {} from {}".format(file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Checking {} (updated {})".format(file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        if VERBOSE:
            print("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                if VERBOSE:
                    print("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                print("Unable to find {} in old archive, redownloading".format(file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                if VERBOSE:
                    print("Downloading {} from {} to {}".format(name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        if VERBOSE:
            print("Download of {} finished".format(self.title))

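# Example CLI usage (assuming this script is saved as, say, thingy_grabber.py):
#   ./thingy_grabber.py collection <owner> <collection name>
#   ./thingy_grabber.py user <username>
#   ./thingy_grabber.py -v thing <thing id>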
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
    collection_parser.add_argument("owner", help="The owner of the collection to get")
    collection_parser.add_argument("collection", help="The name of the collection to get")
    thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")
    user_parser = subparsers.add_parser("user", help="Download all things by a user")
    user_parser.add_argument("user", help="The user to get the designs of")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    global VERBOSE
    VERBOSE = args.verbose
    if args.subcommand.startswith("collection"):
        collection = Collection(args.owner, args.collection)
        print(collection.get())
        collection.download()
    if args.subcommand == "thing":
        Thing(args.thing).download(os.getcwd())
    if args.subcommand == "user":
        designs = Designs(args.user)
        print(designs.get())
        designs.download()


if __name__ == "__main__":
    main()