#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
from bs4 import BeautifulSoup

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"

ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it ever changes it will break the
# paging logic below.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

VERBOSE = False

def strip_ws(value):
    """ Replace runs of whitespace (and hyphens) in a string with single hyphens. """
    return str(NO_WHITESPACE_REGEX.sub('-', value))
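
# For illustration (hypothetical inputs): strip_ws("My Things") -> "My-Things";
# runs containing existing hyphens collapse too: strip_ws("a - b") -> "a-b".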

def slugify(value):
    """
    Normalises the string, strips non-alphanumeric characters, and
    converts runs of whitespace and hyphens to single hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value
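
# For illustration (hypothetical input): slugify("Mr Späcial's Design!")
# -> "Mr-Spacials-Design"; accents are flattened and case is preserved.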

class Grouping:
    """ Holds details of a group of things. """
    def __init__(self):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These two should be set by child classes.
        self.url = None
        self.download_dir = None

    def _get_small_grouping(self, req):
        """ Handle groupings small enough to fit on a single page (no AJAX paging). """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        # hrefs look like "/thing:1234567"; keep just the numeric ID.
        self.things = [x['href'].split(':')[1] for x in links]

        return self.things

    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        if VERBOSE:
            print("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small grouping (fewer than 13 things). Pull the list
            # straight from this request.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(URL_COLLECTION, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the things in this grouping. """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError("No download_dir set - object not initialised properly?")

        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            print("Target directory {} already exists. Assuming a resume."
                  .format(self.download_dir))
        os.chdir(self.download_dir)
        for thing in self.things:
            download_thing(thing)
        os.chdir(base_dir)

class Collection(Grouping):
    """ Holds details of a collection. """
    def __init__(self, user, name):
        Grouping.__init__(self)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(
            os.getcwd(), "{}-{}".format(slugify(self.user), slugify(self.name)))

class Designs(Grouping):
    """ Holds details of all of a user's designs. """
    def __init__(self, user):
        Grouping.__init__(self)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            os.getcwd(), "{} designs".format(slugify(self.user)))
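
# For illustration (hypothetical user and collection names):
#   Collection("some_user", "Things to Print").download()
#   Designs("some_user").download()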

def download_thing(thing):
    """ Downloads all the files for a given thing. """
    file_url = "{}/thing:{}/files".format(URL_BASE, thing)
    file_req = requests.get(file_url)
    file_soup = BeautifulSoup(file_req.text, features='lxml')

    title = slugify(file_soup.find_all('h1')[0].text.strip())
    base_dir = os.getcwd()
    try:
        os.mkdir(title)
    except FileExistsError:
        pass

    print("Downloading {} ({})".format(thing, title))
    os.chdir(title)
    last_time = None

    try:
        with open('timestamp.txt', 'r') as timestamp_handle:
            last_time = timestamp_handle.readlines()[0]
        if VERBOSE:
            print("last downloaded version: {}".format(last_time))
    except FileNotFoundError:
        # Not run on this thing before - no timestamp to compare against.
        if VERBOSE:
            print("No timestamp found, downloading all files.")
        last_time = None

    file_links = file_soup.find_all('a', {'class': 'file-download'})
    new_last_time = last_time
    new_file_links = []

    for file_link in file_links:
        timestamp = file_link.find_all('time')[0]['datetime']
        if VERBOSE:
            print("Checking {} (updated {})".format(file_link["title"], timestamp))
        # Relies on the datetime strings sorting lexicographically,
        # which holds for ISO-8601 timestamps.
        if not last_time or timestamp > last_time:
            new_file_links.append(file_link)
        if not new_last_time or timestamp > new_last_time:
            new_last_time = timestamp

    if last_time and new_last_time <= last_time:
        print("Thing already downloaded. Skipping.")
        os.chdir(base_dir)
        return
    files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]

    try:
        for url, name in files:
            if VERBOSE:
                print("Downloading {} from {}".format(name, url))
            data_req = requests.get(url)
            with open(name, 'wb') as handle:
                handle.write(data_req.content)
        # Record the newest timestamp seen so the next run can skip
        # unchanged files.
        if new_last_time:
            with open('timestamp.txt', 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
    except Exception as exception:
        print("Failed to download {} - {}".format(name, exception))
        os.chdir(base_dir)
        os.rename(title, "{}_failed".format(title))
        return

    os.chdir(base_dir)

def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
    collection_parser.add_argument("owner", help="The owner of the collection to get")
    collection_parser.add_argument("collection", help="The name of the collection to get")
    thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")
    user_parser = subparsers.add_parser("user", help="Download all things by a user")
    user_parser.add_argument("user", help="The user to get the designs of")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    global VERBOSE
    VERBOSE = args.verbose
    if args.subcommand == "collection":
        collection = Collection(args.owner, args.collection)
        print(collection.get())
        collection.download()
    elif args.subcommand == "thing":
        download_thing(args.thing)
    elif args.subcommand == "user":
        designs = Designs(args.user)
        print(designs.get())
        designs.download()


if __name__ == "__main__":
    main()