#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
from shutil import copyfile

import requests
from bs4 import BeautifulSoup
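
# Third-party dependencies: requests and beautifulsoup4, with lxml as the HTML
# parser backend (e.g. "pip install requests beautifulsoup4 lxml").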

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes it would screw up the rest.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
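
# The four pagination regexes above pull values out of JSON-ish fragments in
# the AJAX responses, e.g. something along the lines of (illustrative values
# only):
#   "id":1234567,"total":73,"last_page":7,"per_page":12,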

VERSION = "0.4.0"

VERBOSE = False

def strip_ws(value):
    """ Replace runs of whitespace (and hyphens) with a single hyphen. """
    return NO_WHITESPACE_REGEX.sub('-', value)

def slugify(value):
    """
    Normalise the string to ASCII, strip characters that aren't alphanumerics,
    underscores, whitespace or hyphens, then collapse whitespace and hyphen
    runs into single hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = re.sub(r'[^\w\s-]', '', value).strip()
    return NO_WHITESPACE_REGEX.sub('-', value)
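
# For example:
#   slugify('Héllo World!') -> 'Hello-World'
#   strip_ws('a b  c')      -> 'a-b-c'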

class Grouping:
    """ Holds details of a group of things for download.

    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.
    """
    def __init__(self):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle groupings small enough to fit on a single page. """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]

        return self.things

    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            print("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        if VERBOSE:
            print("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13 items) grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
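        # Each page of results comes back as HTML "cards"; the card links look
        # like "/thing:1234567", so splitting on ':' leaves the numeric id.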
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Download all the things in this grouping. """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError("No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            print("Target directory {} already exists. Assuming a resume."
                  .format(self.download_dir))
        if VERBOSE:
            print("Downloading {} things.".format(self.total))
        for thing in self.things:
            Thing(thing).download(self.download_dir)

class Collection(Grouping):
    """ Holds details of a collection. """
    def __init__(self, user, name, directory):
        Grouping.__init__(self)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(
            directory, "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION

class Designs(Grouping):
    """ Holds details of all of a user's designs. """
    def __init__(self, user, directory):
        Grouping.__init__(self)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION

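# A minimal usage sketch (hypothetical user and collection names):
#   Collection('some_user', 'Things to print', '/tmp/things').download()
#   Designs('some_user', '/tmp/things').download()
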
class Thing:
    """ An individual design on thingiverse. """
    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        req = requests.get(url)
        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')

        self.title = slugify(soup.find_all('h1')[0].text.strip())
        self.download_dir = os.path.join(base_dir, self.title)

        if not os.path.exists(self.download_dir):
            # Not yet downloaded.
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were recorded.
            if VERBOSE:
                print("Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            if VERBOSE:
                print("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Timestamp file vanished between the check and the read.
            if VERBOSE:
                print("Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

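        # Note: the "datetime" attributes appear to be ISO-8601-style strings,
        # so plain string comparison (as used below) should order them
        # chronologically.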
        # OK, so we have a timestamp; let's see if there is anything new to get.
        file_links = soup.find_all('a', {'class': 'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            if VERBOSE:
                print("Checking {} (updated {})".format(file_link["title"], timestamp))
            if timestamp > self.last_time:
                print("Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        print("Found no new files for {}".format(self.title))
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._needs_download:
            if VERBOSE:
                print("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # Edge case: old-style dir without timestamp.
                print("Old style download dir found for {}".format(self.title))
                os.rename(self.download_dir, "{}_old".format(self.download_dir))
            else:
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

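        # At this point any previous download has been moved aside: to
        # "<dir>_old" for legacy layouts, or to "<dir>_<timestamp>" so that
        # unchanged files can be copied across instead of re-fetched.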
        # Get the list of files to download.
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            # (This assumes the thing has at least one file listed.)
            new_file_links = file_links
            new_last_time = file_links[0].find_all('time')[0]['datetime']
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Found file {} from {}".format(file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Checking {} (updated {})".format(file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        if VERBOSE:
            print("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        os.mkdir(self.download_dir)
        # First grab the cached files (if any).
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                if VERBOSE:
                    print("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                print("Unable to find {} in old archive, redownloading".format(file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones.
        files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                if VERBOSE:
                    print("Downloading {} from {} to {}".format(name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        try:
            # Now write the timestamp.
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        if VERBOSE:
            print("Download of {} finished".format(self.title))

def main():
    """ Entry point for the script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    parser.add_argument("-d", "--directory", help="Target directory to download into")
    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
    collection_parser.add_argument("owner", help="The owner of the collection to get")
    collection_parser.add_argument("collection", help="The name of the collection to get")
    thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")
    user_parser = subparsers.add_parser("user", help="Download all things by a user")
    user_parser.add_argument("user", help="The user to get the designs of")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    global VERBOSE
    VERBOSE = args.verbose
    if args.subcommand == "collection":
        collection = Collection(args.owner, args.collection, args.directory)
        print(collection.get())
        collection.download()
    if args.subcommand == "thing":
        Thing(args.thing).download(args.directory)
    if args.subcommand == "user":
        designs = Designs(args.user, args.directory)
        print(designs.get())
        designs.download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))

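# Example invocations (hypothetical names/ids):
#   ./thingy_grabber.py -v -d /tmp/things collection some_user "Things to print"
#   ./thingy_grabber.py thing 1234567
#   ./thingy_grabber.py user some_user
#   ./thingy_grabber.py version
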
if __name__ == "__main__":
    main()