#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
from shutil import copyfile
from bs4 import BeautifulSoup

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

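# These regexes scrape paging metadata from the JSON embedded in Thingiverse's
# responses; e.g. a fragment like '"total":42,' (illustrative value) yields "42".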
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12; if it ever changes it would break the paging logic below.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

VERBOSE = False

def strip_ws(value):
    """ Remove whitespace from a string """
    return str(NO_WHITESPACE_REGEX.sub('-', value))

def slugify(value):
    """
    Normalizes a string: converts to ASCII, strips characters that aren't
    alphanumerics, underscores, hyphens or whitespace, and converts
    whitespace runs to single hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value

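# A quick illustration of slugify() (hypothetical input, behaviour derived from the code above):
#   slugify("Éric's Cool Design!")  ->  "Erics-Cool-Design"
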
class Grouping:
    """ Holds details of a group of things.
    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.
    """
    def __init__(self):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle groupings small enough to fit on a single page. """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        # Hrefs look like '/thing:1234567' - the id is everything after the colon.
        self.things = [x['href'].split(':')[1] for x in links]

        return self.things

    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            print("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        if VERBOSE:
            print("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13 items) grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': self.per_page,  # use the scraped value rather than hardcoding 12
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError("No download_dir set - object not initialised properly?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            print("Target directory {} already exists. Assuming a resume."
                  .format(self.download_dir))
        if VERBOSE:
            print("Downloading {} things.".format(self.total))
        for thing in self.things:
            Thing(thing).download(self.download_dir)

class Collection(Grouping):
    """ Holds details of a collection. """
    def __init__(self, user, name):
        Grouping.__init__(self)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(os.getcwd(),
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION
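        # For example (hypothetical names): Collection("bob", "Things to Print")
        # reads https://www.thingiverse.com/bob/collections/Things-to-Print and
        # downloads into ./bob-Things-to-Print (slugify preserves case).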

class Designs(Grouping):
    """ Holds details of all of a user's designs. """
    def __init__(self, user):
        Grouping.__init__(self)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(os.getcwd(), "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION

class Thing:
    """ An individual design on Thingiverse. """
    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None
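        # A Thing is parsed lazily: download() calls _parse(), which scrapes the
        # thing's files page and compares file timestamps against the saved
        # timestamp.txt to decide whether anything new needs fetching.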

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        req = requests.get(url)
        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')

        self.title = slugify(soup.find_all('h1')[0].text.strip())
        self.download_dir = os.path.join(base_dir, self.title)

        if not os.path.exists(self.download_dir):
            # Not yet downloaded
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Download predates timestamp tracking.
            if VERBOSE:
                print("Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0].strip()
            if VERBOSE:
                print("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            if VERBOSE:
                print("Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp; let's see if there is anything new to get.
        file_links = soup.find_all('a', {'class': 'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            if VERBOSE:
                print("Checking {} (updated {})".format(file_link["title"], timestamp))
            # Note: relies on the datetime strings comparing lexicographically.
            if timestamp > self.last_time:
                print("Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        print("Found no new files for {}".format(self.title))
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._needs_download:
            if VERBOSE:
                print("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # Edge case: old-style dir without a timestamp.
                print("Old style download dir found for {}".format(self.title))
                os.rename(self.download_dir, "{}_old".format(self.download_dir))
            else:
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)
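                # The superseded download is kept as '<dir>_<timestamp>' so that
                # unchanged files can be copied across below instead of re-downloaded.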

        # Get the list of files to download
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            new_last_time = file_links[0].find_all('time')[0]['datetime']
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Found file {} from {}".format(file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Checking {} (updated {})".format(file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        if VERBOSE:
            print("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                if VERBOSE:
                    print("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                print("Unable to find {} in old archive, redownloading".format(file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                if VERBOSE:
                    print("Downloading {} from {} to {}".format(name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        if VERBOSE:
            print("Download of {} finished".format(self.title))

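# After a successful download the directory looks like (illustrative layout):
#   <base_dir>/<thing-title>/
#       ...model files...
#       timestamp.txt   <- newest 'datetime' seen among the thing's files
# Failed attempts are left as '<dir>_failed'; superseded copies are kept as
# '<dir>_<timestamp>' (or '<dir>_old' for pre-timestamp downloads).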
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
    collection_parser.add_argument("owner", help="The owner of the collection to get")
    collection_parser.add_argument("collection", help="The name of the collection to get")
    thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")
    user_parser = subparsers.add_parser("user", help="Download all things by a user")
    user_parser.add_argument("user", help="The user to get the designs of")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    global VERBOSE
    VERBOSE = args.verbose
    if args.subcommand == "collection":
        collection = Collection(args.owner, args.collection)
        print(collection.get())
        collection.download()
    elif args.subcommand == "thing":
        Thing(args.thing).download(os.getcwd())
    elif args.subcommand == "user":
        designs = Designs(args.user)
        print(designs.get())
        designs.download()


if __name__ == "__main__":
    main()