Commit | Line | Data |
---|---|---|
975060c9 OM |
1 | #!/usr/bin/env python3 |
2 | """ | |
3 | Thingiverse bulk downloader | |
4 | """ | |
5 | ||
6 | import re | |
4a98996b | 7 | import sys |
975060c9 OM |
8 | import os |
9 | import argparse | |
10 | import unicodedata | |
11 | import requests | |
fa2f3251 | 12 | import logging |
3c82f75b | 13 | from shutil import copyfile |
975060c9 OM |
14 | from bs4 import BeautifulSoup |
15 | ||
URL_BASE = "https://www.thingiverse.com"
# AJAX endpoint used to page through the things in a named collection.
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
# AJAX endpoint used to page through all designs published by a user.
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

# These regexes scrape pagination metadata that thingiverse embeds as JSON
# fragments inside the HTML of a collection/designs page.
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
dd8c35f4 OM |
# Matches any run of whitespace characters and/or hyphens.
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# Version string reported by the "version" subcommand.
VERSION = "0.4.0"

def strip_ws(value):
    """Collapse every run of whitespace (or hyphens) in value into one '-'."""
    hyphenated = NO_WHITESPACE_REGEX.sub('-', value)
    return str(hyphenated)
975060c9 OM |
32 | |
def slugify(value):
    """
    Normalise a string into a filesystem-safe slug.

    Transliterates to ASCII (silently dropping characters that have no
    ASCII form), removes everything except word characters, whitespace
    and hyphens, strips leading/trailing whitespace, then collapses each
    run of whitespace/hyphens into a single hyphen.

    Note: case is PRESERVED. The previous docstring claimed the result
    was lowercased, but no lowercasing has ever been performed - fixing
    the docs rather than the behaviour, since download directory names
    (and hence resume detection) depend on the current behaviour.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = re.sub(r'[^\w\s-]', '', value).strip()
    # Same pattern as NO_WHITESPACE_REGEX; re's internal cache keeps this cheap.
    return re.sub(r'[-\s]+', '-', value)
43 | ||
class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """
    def __init__(self):
        # Thing ids discovered for this grouping (strings parsed from hrefs).
        self.things = []
        # NOTE(review): populated from a regex group in get(), so this becomes
        # a *string* there even though it starts life as an int - confirm
        # nothing relies on it being numeric.
        self.total = 0
        # Internal thingiverse id for the grouping, needed by the AJAX endpoint.
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        # Small groupings (fewer than one page of items) embed all the thing
        # links directly in the first response, so no paging is required.
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class':'card-img-holder'})
        # hrefs look like ".../thing:12345" - keep only the id after the colon.
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        # Scrape the pagination metadata embedded in the page HTML.
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url':self.url,
            'page':'1',
            'per_page':'12',
            'id':self.req_id
        }
        # Walk every page of the AJAX listing, accumulating thing ids.
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class':'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError("No download_dir set - invalidly initialised object?")

        # NOTE(review): base_dir is never used after this point.
        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            # Directory already present: treat as resuming a prior run.
            logging.info("Target directory {} already exists. Assuming a resume."
                .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx,thing in enumerate(self.things):
            logging.info("Downloading thing {}".format(idx))
            Thing(thing).download(self.download_dir)
975060c9 | 124 | |
3522a3bf OM |
class Collection(Grouping):
    """ Holds details of a collection. """
    def __init__(self, user, name, directory):
        super().__init__()
        self.user = user
        self.name = name
        # Human-facing collection page; whitespace in the name becomes hyphens.
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        target_name = "{}-{}".format(slugify(self.user), slugify(self.name))
        self.download_dir = os.path.join(directory, target_name)
        self.collection_url = URL_COLLECTION
3522a3bf OM |
136 | |
class Designs(Grouping):
    """ Holds details of all of a users' designs. """
    def __init__(self, user, directory):
        super().__init__()
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        target_name = "{} designs".format(slugify(self.user))
        self.download_dir = os.path.join(directory, target_name)
        self.collection_url = USER_COLLECTION
975060c9 | 145 | |
3c82f75b OM |
class Thing:
    """ An individual design on thingiverse. """
    def __init__(self, thing_id):
        self.thing_id = thing_id
        # Timestamp string recorded by a previous download, if any.
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        # Raw HTML of the thing's /files page, cached by _parse().
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything needs to be done.

        Fetches the thing's files page, derives the target directory name
        from the page title, and compares file timestamps against any
        previously recorded download timestamp to decide whether a
        download is required. Sets self._parsed / self._needs_download.
        """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        req = requests.get(url)
        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')
        self.title = slugify(soup.find_all('h1')[0].text.strip())
        self.download_dir = os.path.join(base_dir, self.title)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            # Not yet downloaded
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before
            logging.warning("Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info("Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp, lets see if there is anything new to get
        file_links = soup.find_all('a', {'class':'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            logging.debug("Checking {} (updated {})".format(file_link["title"], timestamp))
            # Timestamps are ISO-style strings, so lexical comparison works.
            if timestamp > self.last_time:
                logging.info("Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing.

        Creates (or refreshes) the thing's directory under base_dir,
        copying unchanged files forward from any previous download,
        fetching new/updated files and gallery images, then recording
        the newest file timestamp in timestamp.txt so future runs can
        skip unchanged things. On any failure the partial directory is
        renamed with a "_failed" suffix.
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._needs_download:
            # Was a bare print(); use logging for consistency with the
            # rest of the script.
            logging.info("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                logging.warning("Old style download dir found for {}".format(self.title))
                os.rename(self.download_dir, "{}_old".format(self.download_dir))
            else:
                # Keep the previous download around so unchanged files can
                # be copied instead of re-fetched.
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class':'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            new_last_time = file_links[0].find_all('time')[0]['datetime']
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Found file {} from {}".format(file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            # Partition files into changed (re-download) and unchanged (copy).
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Checking {} (updated {})".format(file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning("Unable to find {} in old archive, redownloading".format(file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]
        logging.info("Downloading {} new files of {}".format(len(new_file_links), len(file_links)))
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                logging.debug("Downloading {} from {} to {}".format(name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # People like images
        image_dir = os.path.join(self.download_dir, 'images')
        # NOTE(review): assumes the page always has a gallery-slider span;
        # an IndexError here would abort the whole download - confirm.
        imagelinks = soup.find_all('span', {'class':'gallery-slider'})[0] \
            .find_all('div', {'class':'gallery-photo'})
        logging.info("Downloading {} images.".format(len(imagelinks)))
        # Pre-bind filename so the except clause below can never hit an
        # unbound local if os.mkdir() raises before the loop starts.
        filename = None
        try:
            os.mkdir(image_dir)
            for imagelink in imagelinks:
                url = imagelink['data-full']
                filename = os.path.basename(url)
                if filename.endswith('stl'):
                    # Gallery renders of STLs are actually PNGs.
                    filename = "{}.png".format(filename)
                image_req = requests.get(url)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            # Was a bare print(); use logging like every other error path.
            logging.error("Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            # Was a bare print(); use logging like every other error path.
            logging.error("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
975060c9 | 324 | |
1ab49020 OM |
def do_batch(batch_file, download_dir):
    """ Read a file in line by line, parsing each as a set of calls to this script.

    Each line is one of:
        thing <thing_id>
        collection <owner> <collection_name>
        user <user_name>
    Blank lines are skipped; unrecognised lines are logged and skipped.
    """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Previously a blank line crashed with IndexError on
                # command_arr[0]; just skip it.
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug("Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug("Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2], download_dir).download()
                continue
            if command_arr[0] == "user":
                # Fixed copy-pasted log text that said "collection".
                logging.debug("Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
345 | ||
975060c9 OM |
def main():
    """ Entry point for script being run as a command. """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-l", "--log-level", choices=['debug','info','warning'], default='info', help="level of logging desired")
    arg_parser.add_argument("-d", "--directory", help="Target directory to download into")
    subcommands = arg_parser.add_subparsers(help="Type of thing to download", dest="subcommand")
    collection_parser = subcommands.add_parser('collection', help="Download an entire collection")
    collection_parser.add_argument("owner", help="The owner of the collection to get")
    collection_parser.add_argument("collection", help="The name of the collection to get")
    thing_parser = subcommands.add_parser('thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")
    user_parser = subcommands.add_parser("user", help="Download all things by a user")
    user_parser.add_argument("user", help="The user to get the designs of")
    batch_parser = subcommands.add_parser("batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument("batch_file", help="The name of the file to read.")
    subcommands.add_parser("version", help="Show the current version")

    args = arg_parser.parse_args()
    if not args.subcommand:
        # No subcommand given: show usage and bail out.
        arg_parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()
    logging.basicConfig(level=getattr(logging, args.log_level.upper()))

    # Subcommands are mutually exclusive, so a simple chain suffices.
    command = args.subcommand
    if command.startswith("collection"):
        Collection(args.owner, args.collection, args.directory).download()
    elif command == "thing":
        Thing(args.thing).download(args.directory)
    elif command == "user":
        Designs(args.user, args.directory).download()
    elif command == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    elif command == "batch":
        do_batch(args.batch_file, args.directory)
382 | ||
975060c9 OM |
383 | |
# Run as a script: delegate to main().
if __name__ == "__main__":
    main()