#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
from shutil import copyfile
from bs4 import BeautifulSoup

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

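# The grouping pages embed a JSON blob with the paging details; these regexes
# appear to pull values straight out of the raw page text rather than parsing
# that JSON properly.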
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes it would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

VERSION = "0.5.1"

def strip_ws(value):
    """ Remove whitespace from a string """
    return str(NO_WHITESPACE_REGEX.sub('-', value))


def slugify(value):
    """
    Normalizes string, removes non-alpha characters,
    and converts spaces to hyphens. Note: case is preserved.
    """
    value = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value

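
# A quick illustration of slugify's behaviour (hypothetical inputs):
#   slugify("Fancy Thing! (v2)")  ->  "Fancy-Thing-v2"
#   slugify("Überhänge")          ->  "Uberhange"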

class Grouping:
    """ Holds details of a group of things for download.

    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.
    """

    def __init__(self):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        # hrefs look like "/thing:1234567"; the ID is the part after the colon.
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13 items) grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection. """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {}".format(idx))
            Thing(thing).download(self.download_dir)


class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory):
        Grouping.__init__(self)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION


class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory):
        Grouping.__init__(self)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION


class Thing:
    """ An individual design on Thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            req = requests.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')
        try:
            self.title = slugify(soup.find_all('h1')[0].text.strip())
        except IndexError:
            logging.warning("No title found for thing {}".format(self.thing_id))
            self.title = self.thing_id

        if req.status_code == 404:
            logging.warning(
                "404 for thing {} - DMCA or invalid number?".format(self.thing_id))
            return

        if req.status_code > 299:
            logging.warning(
                "Bad status code {} for thing {} - try again later?".format(
                    req.status_code, self.thing_id))
            return

        self.old_download_dir = os.path.join(base_dir, self.title)
        self.download_dir = os.path.join(
            base_dir, "{} - {}".format(self.thing_id, self.title))

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            if os.path.exists(self.old_download_dir):
                logging.info("Found previous style download directory. Moving it")
                # copyfile() cannot copy a directory; rename actually moves it.
                os.rename(self.old_download_dir, self.download_dir)
            else:
                # Not yet downloaded
                self._parsed = True
                return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were recorded.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Timestamp file vanished between the existence check and the open.
            logging.info(
                "Missing timestamp file. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp; let's see if there is anything new to get.
        # ISO-8601 timestamps sort correctly under plain string comparison.
        file_links = soup.find_all('a', {'class': 'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            logging.debug("Checking {} (updated {})".format(
                file_link["title"], timestamp))
            if timestamp > self.last_time:
                logging.info(
                    "Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return

        if not self._needs_download:
            logging.info("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # Edge case: old style dir without timestamp.
                logging.warning(
                    "Old style download dir found for {}".format(self.title))
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            try:
                new_last_time = file_links[0].find_all('time')[0]['datetime']
            except (IndexError, KeyError) as exception:
                # This used to drop into an interactive console; fail cleanly instead.
                logging.error("Unable to read timestamps for thing {}: {}".format(
                    self.thing_id, exception))
                return

            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Found file {} from {}".format(
                    file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Checking {} (updated {})".format(
                    file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(
                        file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"])
                 for x in new_file_links]
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(file_links)))
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                logging.debug("Downloading {} from {} to {}".format(
                    name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # People like images
        image_dir = os.path.join(self.download_dir, 'images')
        imagelinks = soup.find_all('span', {'class': 'gallery-slider'})[0] \
                         .find_all('div', {'class': 'gallery-photo'})
        logging.info("Downloading {} images.".format(len(imagelinks)))
        try:
            os.mkdir(image_dir)
            for imagelink in imagelinks:
                # Prefer the largest variant present; .get() returns None for
                # missing attributes where indexing would raise KeyError.
                url = next(filter(None, [imagelink.get(x) for x in ['data-full',
                                                                    'data-large',
                                                                    'data-medium',
                                                                    'data-thumb']]), None)
                if not url:
                    logging.warning(
                        "Unable to find any urls for {}".format(imagelink))
                    continue

                filename = os.path.basename(url)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(url)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # Instructions are good too.
        logging.info("Downloading readme")
        try:
            readme_txt = soup.find('meta', property='og:description')['content']
            with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        # Best get some licenses
        logging.info("Downloading license")
        try:
            license_txt = soup.find('div', {'class': 'license-text'}).text
            if license_txt:
                with open(os.path.join(self.download_dir, 'license.txt'), 'w') as license_handle:
                    license_handle.write("{}\n".format(license_txt))
        except AttributeError as exception:
            logging.warning("No license? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))


def do_batch(batch_file, download_dir):
    """ Read a file in line by line, parsing each as a set of calls to this script. """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")


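# A batch file (format inferred from the parsing above) holds one instruction
# per line. Since lines are split on whitespace, multi-word collection names
# cannot be expressed. Hypothetical example:
#
#   thing 1234567
#   collection some_user favourites
#   user some_user
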
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space separated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

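    # Example invocations (illustrative names and IDs):
    #   thingy_grabber.py thing 1234567
    #   thingy_grabber.py -d ~/downloads user some_designer
    #   thingy_grabber.py -l debug collection some_user favourites
    #   thingy_grabber.py batch batch.txt
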
    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory).download()
    if args.subcommand == "thing":
        for thing in args.things:
            Thing(thing).download(args.directory)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory)


if __name__ == "__main__":
    main()