add error handling
[clinton/thingy_grabber.git] / thingy_grabber.py
CommitLineData
975060c9
OM
1#!/usr/bin/env python3
2"""
3Thingiverse bulk downloader
4"""
5
6import re
4a98996b 7import sys
975060c9
OM
8import os
9import argparse
10import unicodedata
11import requests
fa2f3251 12import logging
3c82f75b 13from shutil import copyfile
975060c9
OM
14from bs4 import BeautifulSoup
15
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

# Regexes that pull pagination metadata out of the JSON-ish payload embedded
# in thingiverse listing pages (used by Grouping.get()).
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')

# Collapses runs of whitespace and hyphens into a single hyphen.
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

VERSION = "0.5.1"
28
db8066ec 29
dd8c35f4
OM
def strip_ws(value):
    """ Remove whitespace from a string """
    collapsed = NO_WHITESPACE_REGEX.sub('-', value)
    return str(collapsed)
975060c9 33
dbdb1782 34
975060c9
OM
def slugify(value):
    """
    Normalizes string for use as a file/directory name: converts to ASCII,
    removes non-alphanumeric characters, and converts whitespace runs to
    hyphens.

    Note: this does NOT lowercase the input (case is preserved).
    """
    # Decompose accented characters and drop anything outside ASCII.
    value = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    # Keep only word characters, whitespace and hyphens.
    value = re.sub(r'[^\w\s-]', '', value).strip()
    # Collapse whitespace/hyphen runs to a single hyphen.
    value = NO_WHITESPACE_REGEX.sub('-', value)
    return value
46
dbdb1782 47
class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self):
        # Thing ids belonging to this grouping (filled in by get()).
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        # Small (<13 item) groupings list everything on the first page,
        # so no pagination is needed.
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        try:
            c_req = requests.get(self.url)
        except requests.exceptions.ConnectionError as error:
            # Consistent with Thing._parse: log and carry on rather than
            # letting a network blip kill the whole run.
            logging.error("Unable to connect for {}: {}".format(
                self.url, error))
            return self.things
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        # Walk every page of the AJAX listing, collecting thing ids.
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {}".format(idx))
            Thing(thing).download(self.download_dir)
975060c9 130
dbdb1782 131
3522a3bf
OM
class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory):
        super().__init__()
        self.user = user
        self.name = name
        # The public web page for the collection.
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        # Download target: "<user>-<collection>" under the given directory.
        subdir = "{}-{}".format(slugify(self.user), slugify(self.name))
        self.download_dir = os.path.join(directory, subdir)
        self.collection_url = URL_COLLECTION
3522a3bf 144
dbdb1782 145
3522a3bf
OM
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory):
        super().__init__()
        self.user = user
        # The public designs page for the user.
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        # Download target: "<user> designs" under the given directory.
        folder = "{} designs".format(slugify(self.user))
        self.download_dir = os.path.join(directory, folder)
        self.collection_url = USER_COLLECTION
975060c9 156
dbdb1782 157
3c82f75b
OM
class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        # Timestamp of the newest file in the previous download, if any.
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        # Raw HTML of the thing's /files page (filled in by _parse).
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            req = requests.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')

        try:
            self.title = slugify(soup.find_all('h1')[0].text.strip())
        except IndexError:
            logging.warning(
                "No title found for thing {}".format(self.thing_id))
            self.title = self.thing_id

        if req.status_code == 404:
            logging.warning(
                "404 for thing {} - DMCA or invalid number?".format(self.thing_id))
            return

        if req.status_code > 299:
            logging.warning(
                "bad status code {} for thing {} - try again later?".format(
                    req.status_code, self.thing_id))
            return

        self.download_dir = os.path.join(base_dir, self.title)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            # Not yet downloaded - everything is new.
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Directory from a pre-timestamp version of this script.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp, lets see if there is anything new to get
        # Timestamps are ISO-8601 strings, so lexical comparison works.
        file_links = soup.find_all('a', {'class': 'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            logging.debug("Checking {} (updated {})".format(
                file_link["title"], timestamp))
            if timestamp > self.last_time:
                logging.info(
                    "Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return

        if not self._needs_download:
            # Was print(); use logging so -l controls all output consistently.
            logging.info("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                logging.warning(
                    "Old style download dir found for {}".format(self.title))
                os.rename(self.download_dir,
                          "{}_old".format(self.download_dir))
            else:
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            try:
                new_last_time = file_links[0].find_all('time')[0]['datetime']
            except (IndexError, KeyError) as exception:
                # No files, or changed markup. (Replaces a leftover debug
                # console - bare except with code.interact.) The scan below
                # still collects whatever timestamps it can find.
                logging.warning(
                    "Unable to read timestamp of first file for {}: {}".format(
                        self.thing_id, exception))

            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Found file {} from {}".format(
                    file_link["title"], timestamp))
                # None-safe: new_last_time may not have been set above.
                if new_last_time is None or timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Checking {} (updated {})".format(
                    file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(
                        file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"])
                 for x in new_file_links]
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(file_links)))
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                logging.debug("Downloading {} from {} to {}".format(
                    name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # People like images
        image_dir = os.path.join(self.download_dir, 'images')
        # Guard: some pages have no gallery at all - don't crash on [0].
        galleries = soup.find_all('span', {'class': 'gallery-slider'})
        imagelinks = galleries[0].find_all(
            'div', {'class': 'gallery-photo'}) if galleries else []
        logging.info("Downloading {} images.".format(len(imagelinks)))
        # Initialised so the except handler below can't hit an unbound name
        # if os.mkdir fails before the loop runs.
        filename = None
        try:
            os.mkdir(image_dir)
            for imagelink in imagelinks:
                # Prefer the highest resolution available.
                url = next(filter(None, [imagelink[x] for x in ['data-full',
                                                                'data-large',
                                                                'data-medium',
                                                                'data-thumb']]), None)
                if not url:
                    logging.warning(
                        "Unable to find any urls for {}".format(imagelink))
                    continue

                filename = os.path.basename(url)
                if filename.endswith('stl'):
                    # STL previews are served as rendered PNGs.
                    filename = "{}.png".format(filename)
                image_req = requests.get(url)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            # Was print(); keep the _failed rename so a partial download is
            # never mistaken for a complete one.
            logging.error(
                "Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # instructions are good too.
        logging.info("Downloading readme")
        try:
            readme_txt = soup.find('meta', property='og:description')['content']
            with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        # Best get some licenses
        logging.info("Downloading license")
        try:
            license_txt = soup.find('div', {'class': 'license-text'}).text
            if license_txt:
                with open(os.path.join(self.download_dir, 'license.txt'), 'w') as license_handle:
                    license_handle.write("{}\n".format(license_txt))
        except AttributeError as exception:
            logging.warning("No license? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        try:
            # Now write the timestamp so a future run can resume from here.
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            # Was print(); the rename marks the directory as incomplete.
            logging.error(
                "Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
975060c9 404
dbdb1782 405
1ab49020
OM
def do_batch(batch_file, download_dir):
    """ Read a file in line by line, parsing each as a set of calls to this script.

    Blank lines are skipped; unrecognised instructions are logged and skipped.
    """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip blank lines instead of crashing on command_arr[0].
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir).download()
                continue
            if command_arr[0] == "user":
                # Fixed copy-paste: this used to log "collection".
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
430
dbdb1782 431
975060c9
OM
def main():
    """ Entry point for script being run as a command.

    Parses the command line, configures logging, and dispatches to the
    requested subcommand. Exits with status 1 if no subcommand is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space separated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()
    logging.basicConfig(level=getattr(logging, args.log_level.upper()))

    # Subcommands are mutually exclusive, so dispatch with elif.
    if args.subcommand == "collection":
        for collection in args.collections:
            Collection(args.owner, collection, args.directory).download()
    elif args.subcommand == "thing":
        for thing in args.things:
            Thing(thing).download(args.directory)
    elif args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory).download()
    elif args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    elif args.subcommand == "batch":
        do_batch(args.batch_file, args.directory)
480
975060c9
OM
481
# Script entry point: delegate to the CLI handler.
if __name__ == "__main__":
    main()