X-Git-Url: http://git.hcoop.net/clinton/thingy_grabber.git/blobdiff_plain/db8066ec3899ea35101918a39d3bffca7057fb97..4f75dd69431f52496c584a10a76bf227b7f1b49d:/thingy_grabber.py

diff --git a/thingy_grabber.py b/thingy_grabber.py
index bfa700f..63c929a 100755
--- a/thingy_grabber.py
+++ b/thingy_grabber.py
@@ -9,6 +9,7 @@ import os
 import argparse
 import unicodedata
 import requests
+import logging
 from shutil import copyfile
 from bs4 import BeautifulSoup
 
@@ -23,30 +24,33 @@ LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
 PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
 NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
 
-VERSION = "0.4.0"
+VERSION = "0.5.1"
 
-VERBOSE = False
 
 def strip_ws(value):
     """ Remove whitespace from a string """
     return str(NO_WHITESPACE_REGEX.sub('-', value))
 
+
 def slugify(value):
     """
     Normalizes string, converts to lowercase, removes non-alpha characters,
     and converts spaces to hyphens.
     """
-    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
+    value = unicodedata.normalize('NFKD', value).encode(
+        'ascii', 'ignore').decode()
     value = str(re.sub(r'[^\w\s-]', '', value).strip())
     value = str(NO_WHITESPACE_REGEX.sub('-', value))
     #value = str(re.sub(r'[-\s]+', '-', value))
     return value
 
+
 class Grouping:
     """ Holds details of a group of things for download
         This is effectively (although not actually) an abstract class
         - use Collection or Designs instead.
     """
+
     def __init__(self):
         self.things = []
         self.total = 0
@@ -61,8 +65,9 @@ class Grouping:
     def _get_small_grouping(self, req):
         """ Handle small groupings """
         soup = BeautifulSoup(req.text, features='lxml')
-        links = soup.find_all('a', {'class':'card-img-holder'})
+        links = soup.find_all('a', {'class': 'card-img-holder'})
         self.things = [x['href'].split(':')[1] for x in links]
+        self.total = len(self.things)
 
         return self.things
 
@@ -74,12 +79,11 @@ class Grouping:
 
         # Check for initialisation:
         if not self.url:
-            print("No URL set - object not initialised properly?")
+            logging.error("No URL set - object not initialised properly?")
             raise ValueError("No URL set - object not initialised properly?")
 
         # Get the internal details of the grouping.
-        if VERBOSE:
-            print("Querying {}".format(self.url))
+        logging.debug("Querying {}".format(self.url))
         c_req = requests.get(self.url)
         total = TOTAL_REGEX.search(c_req.text)
         if total is None:
@@ -90,16 +94,16 @@ class Grouping:
         self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
         self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
         parameters = {
-            'base_url':self.url,
-            'page':'1',
-            'per_page':'12',
-            'id':self.req_id
+            'base_url': self.url,
+            'page': '1',
+            'per_page': '12',
+            'id': self.req_id
         }
         for current_page in range(1, self.last_page + 1):
             parameters['page'] = current_page
             req = requests.post(self.collection_url, parameters)
             soup = BeautifulSoup(req.text, features='lxml')
-            links = soup.find_all('a', {'class':'card-img-holder'})
+            links = soup.find_all('a', {'class': 'card-img-holder'})
             self.things += [x['href'].split(':')[1] for x in links]
 
         return self.things
@@ -110,21 +114,24 @@ class Grouping:
             self.get()
 
         if not self.download_dir:
-            raise ValueError("No download_dir set - invalidly initialised object?")
+            raise ValueError(
+                "No download_dir set - invalidly initialised object?")
         base_dir = os.getcwd()
         try:
             os.mkdir(self.download_dir)
         except FileExistsError:
-            print("Target directory {} already exists. Assuming a resume."
-                  .format(self.download_dir))
-        if VERBOSE:
-            print("Downloading {} things.".format(self.total))
-        for thing in self.things:
+            logging.info("Target directory {} already exists. Assuming a resume."
+                         .format(self.download_dir))
+        logging.info("Downloading {} thing(s).".format(self.total))
+        for idx, thing in enumerate(self.things):
+            logging.info("Downloading thing {}".format(idx))
             Thing(thing).download(self.download_dir)
 
+
 class Collection(Grouping):
     """ Holds details of a collection. """
+
     def __init__(self, user, name, directory):
         Grouping.__init__(self)
         self.user = user
@@ -135,17 +142,22 @@ class Collection(Grouping):
                                          "{}-{}".format(slugify(self.user), slugify(self.name)))
         self.collection_url = URL_COLLECTION
 
+
 class Designs(Grouping):
     """ Holds details of all of a users' designs. """
+
     def __init__(self, user, directory):
         Grouping.__init__(self)
         self.user = user
         self.url = "{}/{}/designs".format(URL_BASE, self.user)
-        self.download_dir = os.path.join(directory, "{} designs".format(slugify(self.user)))
+        self.download_dir = os.path.join(
+            directory, "{} designs".format(slugify(self.user)))
         self.collection_url = USER_COLLECTION
 
+
 class Thing:
     """ An individual design on thingiverse. """
+
     def __init__(self, thing_id):
         self.thing_id = thing_id
         self.last_time = None
@@ -164,10 +176,13 @@ class Thing:
         req = requests.get(url)
         self.text = req.text
         soup = BeautifulSoup(self.text, features='lxml')
-
+        #import code
+        #code.interact(local=dict(globals(), **locals()))
         self.title = slugify(soup.find_all('h1')[0].text.strip())
         self.download_dir = os.path.join(base_dir, self.title)
 
+        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))
+
         if not os.path.exists(self.download_dir):
             # Not yet downloaded
             self._parsed = True
@@ -176,37 +191,36 @@ class Thing:
         timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
         if not os.path.exists(timestamp_file):
             # Old download from before
-            if VERBOSE:
-                print("Old-style download directory found. Assuming update required.")
+            logging.warning(
+                "Old-style download directory found. Assuming update required.")
             self._parsed = True
             return
 
         try:
             with open(timestamp_file, 'r') as timestamp_handle:
                 self.last_time = timestamp_handle.readlines()[0]
-            if VERBOSE:
-                print("last downloaded version: {}".format(self.last_time))
+            logging.info("last downloaded version: {}".format(self.last_time))
         except FileNotFoundError:
             # Not run on this thing before.
-            if VERBOSE:
-                print("Old-style download directory found. Assuming update required.")
+            logging.info(
+                "Old-style download directory found. Assuming update required.")
             self.last_time = None
             self._parsed = True
             return
 
         # OK, so we have a timestamp, lets see if there is anything new to get
-        file_links = soup.find_all('a', {'class':'file-download'})
+        file_links = soup.find_all('a', {'class': 'file-download'})
         for file_link in file_links:
             timestamp = file_link.find_all('time')[0]['datetime']
-            if VERBOSE:
-                print("Checking {} (updated {})".format(file_link["title"], timestamp))
+            logging.debug("Checking {} (updated {})".format(
+                file_link["title"], timestamp))
             if timestamp > self.last_time:
-                print("Found new/updated file {}".format(file_link["title"]))
+                logging.info(
+                    "Found new/updated file {}".format(file_link["title"]))
                 self._needs_download = True
                 self._parsed = True
                 return
+
         # Got here, so nope, no new files.
- print("Found no new files for {}".format(self.title)) self._needs_download = False self._parsed = True @@ -216,8 +230,7 @@ class Thing: self._parse(base_dir) if not self._needs_download: - if VERBOSE: - print("{} already downloaded - skipping.".format(self.title)) + print("{} already downloaded - skipping.".format(self.title)) return # Have we already downloaded some things? @@ -226,15 +239,17 @@ class Thing: if os.path.exists(self.download_dir): if not os.path.exists(timestamp_file): # edge case: old style dir w/out timestamp. - print("Old style download dir found for {}".format(self.title)) - os.rename(self.download_dir, "{}_old".format(self.download_dir)) + logging.warning( + "Old style download dir found for {}".format(self.title)) + os.rename(self.download_dir, + "{}_old".format(self.download_dir)) else: prev_dir = "{}_{}".format(self.download_dir, self.last_time) os.rename(self.download_dir, prev_dir) # Get the list of files to download soup = BeautifulSoup(self.text, features='lxml') - file_links = soup.find_all('a', {'class':'file-download'}) + file_links = soup.find_all('a', {'class': 'file-download'}) new_file_links = [] old_file_links = [] @@ -246,15 +261,15 @@ class Thing: new_last_time = file_links[0].find_all('time')[0]['datetime'] for file_link in file_links: timestamp = file_link.find_all('time')[0]['datetime'] - if VERBOSE: - print("Found file {} from {}".format(file_link["title"], timestamp)) + logging.debug("Found file {} from {}".format( + file_link["title"], timestamp)) if timestamp > new_last_time: new_last_time = timestamp else: for file_link in file_links: timestamp = file_link.find_all('time')[0]['datetime'] - if VERBOSE: - print("Checking {} (updated {})".format(file_link["title"], timestamp)) + logging.debug("Checking {} (updated {})".format( + file_link["title"], timestamp)) if timestamp > self.last_time: new_file_links.append(file_link) else: @@ -262,38 +277,93 @@ class Thing: if not new_last_time or timestamp > new_last_time: new_last_time = timestamp - if VERBOSE: - print("new timestamp {}".format(new_last_time)) + logging.debug("new timestamp {}".format(new_last_time)) # OK. Time to get to work. 
+ logging.debug("Generating download_dir") os.mkdir(self.download_dir) # First grab the cached files (if any) + logging.info("Copying {} unchanged files.".format(len(old_file_links))) for file_link in old_file_links: old_file = os.path.join(prev_dir, file_link["title"]) new_file = os.path.join(self.download_dir, file_link["title"]) try: - if VERBOSE: - print("Copying {} to {}".format(old_file, new_file)) + logging.debug("Copying {} to {}".format(old_file, new_file)) copyfile(old_file, new_file) except FileNotFoundError: - print("Unable to find {} in old archive, redownloading".format(file_link["title"])) + logging.warning( + "Unable to find {} in old archive, redownloading".format(file_link["title"])) new_file_links.append(file_link) # Now download the new ones - files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links] + files = [("{}{}".format(URL_BASE, x['href']), x["title"]) + for x in new_file_links] + logging.info("Downloading {} new files of {}".format( + len(new_file_links), len(file_links))) try: for url, name in files: file_name = os.path.join(self.download_dir, name) - if VERBOSE: - print("Downloading {} from {} to {}".format(name, url, file_name)) + logging.debug("Downloading {} from {} to {}".format( + name, url, file_name)) data_req = requests.get(url) with open(file_name, 'wb') as handle: handle.write(data_req.content) except Exception as exception: - print("Failed to download {} - {}".format(name, exception)) + logging.error("Failed to download {} - {}".format(name, exception)) os.rename(self.download_dir, "{}_failed".format(self.download_dir)) return + # People like images + image_dir = os.path.join(self.download_dir, 'images') + imagelinks = soup.find_all('span', {'class': 'gallery-slider'})[0] \ + .find_all('div', {'class': 'gallery-photo'}) + logging.info("Downloading {} images.".format(len(imagelinks))) + try: + os.mkdir(image_dir) + for imagelink in imagelinks: + url = next(filter(None,[imagelink[x] for x in ['data-full', + 'data-large', + 'data-medium', + 'data-thumb']]), None) + if not url: + logging.warning("Unable to find any urls for {}".format(imagelink)) + continue + + filename = os.path.basename(url) + if filename.endswith('stl'): + filename = "{}.png".format(filename) + image_req = requests.get(url) + with open(os.path.join(image_dir, filename), 'wb') as handle: + handle.write(image_req.content) + except Exception as exception: + print("Failed to download {} - {}".format(filename, exception)) + os.rename(self.download_dir, "{}_failed".format(self.download_dir)) + return + + # instructions are good too. + logging.info("Downloading readme") + try: + readme_txt = soup.find('meta', property='og:description')['content'] + with open(os.path.join(self.download_dir,'readme.txt'), 'w') as readme_handle: + readme_handle.write("{}\n".format(readme_txt)) + except (TypeError, KeyError) as exception: + logging.warning("No readme? {}".format(exception)) + except IOError as exception: + logging.warning("Failed to write readme! {}".format(exception)) + + # Best get some licenses + logging.info("Downloading license") + try: + license_txt = soup.find('div',{'class':'license-text'}).text + if license_txt: + with open(os.path.join(self.download_dir,'license.txt'), 'w') as license_handle: + license_handle.write("{}\n".format(license_txt)) + except AttributeError as exception: + logging.warning("No license? {}".format(exception)) + except IOError as exception: + logging.warning("Failed to write license! 
{}".format(exception)) + + try: # Now write the timestamp with open(timestamp_file, 'w') as timestamp_handle: @@ -303,23 +373,61 @@ class Thing: os.rename(self.download_dir, "{}_failed".format(self.download_dir)) return self._needs_download = False - if VERBOSE: - print("Download of {} finished".format(self.title)) + logging.debug("Download of {} finished".format(self.title)) + + +def do_batch(batch_file, download_dir): + """ Read a file in line by line, parsing each as a set of calls to this script.""" + with open(batch_file) as handle: + for line in handle: + line = line.strip() + logging.info("Handling instruction {}".format(line)) + command_arr = line.split() + if command_arr[0] == "thing": + logging.debug( + "Handling batch thing instruction: {}".format(line)) + Thing(command_arr[1]).download(download_dir) + continue + if command_arr[0] == "collection": + logging.debug( + "Handling batch collection instruction: {}".format(line)) + Collection(command_arr[1], command_arr[2], + download_dir).download() + continue + if command_arr[0] == "user": + logging.debug( + "Handling batch collection instruction: {}".format(line)) + Designs(command_arr[1], download_dir).download() + continue + logging.warning("Unable to parse current instruction. Skipping.") + def main(): """ Entry point for script being run as a command. """ parser = argparse.ArgumentParser() - parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true") - parser.add_argument("-d", "--directory", help="Target directory to download into") - subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand") - collection_parser = subparsers.add_parser('collection', help="Download an entire collection") - collection_parser.add_argument("owner", help="The owner of the collection to get") - collection_parser.add_argument("collection", help="The name of the collection to get") - thing_parser = subparsers.add_parser('thing', help="Download a single thing.") - thing_parser.add_argument("thing", help="Thing ID to download") - user_parser = subparsers.add_parser("user", help="Download all things by a user") - user_parser.add_argument("user", help="The user to get the designs of") - version_parser = subparsers.add_parser("version", help="Show the current version") + parser.add_argument("-l", "--log-level", choices=[ + 'debug', 'info', 'warning'], default='info', help="level of logging desired") + parser.add_argument("-d", "--directory", + help="Target directory to download into") + subparsers = parser.add_subparsers( + help="Type of thing to download", dest="subcommand") + collection_parser = subparsers.add_parser( + 'collection', help="Download one or more entire collection(s)") + collection_parser.add_argument( + "owner", help="The owner of the collection(s) to get") + collection_parser.add_argument( + "collections", nargs="+", help="Space seperated list of the name(s) of collection to get") + thing_parser = subparsers.add_parser( + 'thing', help="Download a single thing.") + thing_parser.add_argument("things", nargs="*", help="Space seperated list of thing ID(s) to download") + user_parser = subparsers.add_parser( + "user", help="Download all things by one or more users") + user_parser.add_argument("users", nargs="+", help="A space seperated list of the user(s) to get the designs of") + batch_parser = subparsers.add_parser( + "batch", help="Perform multiple actions written in a text file") + batch_parser.add_argument( + "batch_file", help="The name of the file to read.") + 
subparsers.add_parser("version", help="Show the current version") args = parser.parse_args() if not args.subcommand: @@ -327,21 +435,22 @@ def main(): sys.exit(1) if not args.directory: args.directory = os.getcwd() + logging.basicConfig(level=getattr(logging, args.log_level.upper())) - global VERBOSE - VERBOSE = args.verbose if args.subcommand.startswith("collection"): - collection = Collection(args.owner, args.collection, args.directory) - print(collection.get()) - collection.download() + for collection in args.collections: + Collection(args.owner, collection, args.directory).download() if args.subcommand == "thing": - Thing(args.thing).download(args.directory) + for thing in args.things: + Thing(thing).download(args.directory) if args.subcommand == "user": - designs = Designs(args.user, args.directory) - print(designs.get()) - designs.download() + for user in args.users: + Designs(user, args.directory).download() if args.subcommand == "version": print("thingy_grabber.py version {}".format(VERSION)) + if args.subcommand == "batch": + do_batch(args.batch_file, args.directory) + if __name__ == "__main__": main()