#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

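# Typical command-line usage (these examples mirror the argparse setup in main();
# the user names, collection name and thing id below are placeholders, not real accounts):
#   thingy_grabber.py thing 1234567
#   thingy_grabber.py collection someuser "some collection"
#   thingy_grabber.py user someuser
#   thingy_grabber.py batch batch.txt
#   thingy_grabber.py -d /target/directory -l debug user someuser
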
import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
from shutil import copyfile
from bs4 import BeautifulSoup

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

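# The regexes below pull pagination details ("id", "total", "last_page", "per_page")
# out of the JSON embedded in the collection/design listing pages, which
# Grouping.get() then uses to page through the AJAX endpoints above.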
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12; if it ever changes, the paging logic below will break.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

VERSION = "0.5.1"


def strip_ws(value):
    """ Remove whitespace from a string """
    return str(NO_WHITESPACE_REGEX.sub('-', value))


def slugify(value):
    """
    Normalizes the string, removes characters that are not alphanumeric,
    underscores or hyphens, and converts runs of whitespace to single hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    #value = str(re.sub(r'[-\s]+', '-', value))
    return value

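# Illustrative example (hypothetical input, not from the original source):
# slugify("Crème brûlée stand!") returns "Creme-brulee-stand" - accents and
# punctuation are dropped and runs of whitespace collapse to single hyphens.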

class Grouping:
    """ Holds details of a group of things for download.
    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.
    """
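    # Typical use goes through one of the subclasses, e.g. (hypothetical user and
    # target directory): Designs('someuser', '/tmp/things').download()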

    def __init__(self):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
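        # Each href is expected to end in "thing:<numeric id>"; the text after the
        # colon is the thing id that Thing() uses below.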
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small grouping (fewer than 13 items). Pull the list from this request.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Download all the files for every thing in the grouping. """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - object not initialised properly?")

        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {}".format(idx))
            Thing(thing).download(self.download_dir)


class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory):
        Grouping.__init__(self)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION


class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory):
        Grouping.__init__(self)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION


class Thing:
    """ An individual design on Thingiverse. """
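    # Typical use (the thing id below is a placeholder, not a real design):
    #   Thing('1234567').download('/some/target/directory')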

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        req = requests.get(url)
        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')
        #import code
        #code.interact(local=dict(globals(), **locals()))
        self.title = slugify(soup.find_all('h1')[0].text.strip())
        self.download_dir = os.path.join(base_dir, self.title)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            # Not yet downloaded
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old-style download directory with no timestamp file.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            logging.info("Last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp; let's see if there is anything new to get.
        file_links = soup.find_all('a', {'class': 'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            logging.debug("Checking {} (updated {})".format(
                file_link["title"], timestamp))
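            # Note: these timestamps are plain strings; the ">" comparison below relies
            # on the 'datetime' attribute being in a lexically sortable (ISO-style) format.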
            if timestamp > self.last_time:
                logging.info(
                    "Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # If we got here, there are no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._needs_download:
            print("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # Edge case: old-style directory without a timestamp.
                logging.warning(
                    "Old style download dir found for {}".format(self.title))
                os.rename(self.download_dir,
                          "{}_old".format(self.download_dir))
            else:
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            new_last_time = file_links[0].find_all('time')[0]['datetime']
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Found file {} from {}".format(
                    file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Checking {} (updated {})".format(
                    file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"])
                 for x in new_file_links]
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(file_links)))
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                logging.debug("Downloading {} from {} to {}".format(
                    name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # People like images
        image_dir = os.path.join(self.download_dir, 'images')
        imagelinks = soup.find_all('span', {'class': 'gallery-slider'})[0] \
                         .find_all('div', {'class': 'gallery-photo'})
        logging.info("Downloading {} images.".format(len(imagelinks)))
        try:
            os.mkdir(image_dir)
            for imagelink in imagelinks:
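                # Prefer the highest-resolution image variant the element provides
                # (data-full first, falling back through data-large/medium/thumb).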
                url = next(filter(None, [imagelink.get(x) for x in ['data-full',
                                                                    'data-large',
                                                                    'data-medium',
                                                                    'data-thumb']]), None)
                if not url:
                    logging.warning("Unable to find any urls for {}".format(imagelink))
                    continue

                filename = os.path.basename(url)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(url)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # Instructions are good too.
        logging.info("Downloading readme")
        try:
            readme_txt = soup.find('meta', property='og:description')['content']
            with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        # Best get some licenses
        logging.info("Downloading license")
        try:
            license_txt = soup.find('div', {'class': 'license-text'}).text
            if license_txt:
                with open(os.path.join(self.download_dir, 'license.txt'), 'w') as license_handle:
                    license_handle.write("{}\n".format(license_txt))
        except AttributeError as exception:
            logging.warning("No license? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))


def do_batch(batch_file, download_dir):
    """ Read a batch file line by line, parsing each line as a command for this script. """
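    # Each line is "<command> <args...>", mirroring the CLI subcommands. A hypothetical
    # batch file (the user, collection name and thing id are placeholders) might contain:
    #   thing 1234567
    #   collection someuser bike
    #   user someuser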
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")


def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space-separated list of the name(s) of the collection(s) to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download one or more individual things")
    thing_parser.add_argument("things", nargs="*", help="Space-separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument("users", nargs="+", help="Space-separated list of the user(s) whose designs to get")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()
    logging.basicConfig(level=getattr(logging, args.log_level.upper()))

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory).download()
    if args.subcommand == "thing":
        for thing in args.things:
            Thing(thing).download(args.directory)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory)


if __name__ == "__main__":
    main()