#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
from shutil import copyfile
from bs4 import BeautifulSoup

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

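# The grouping pages embed a JSON blob with the paging details; these regexes
# appear to pull values straight out of the raw page text rather than parsing
# that JSON properly.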
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes it would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

VERSION = "0.5.1"

def strip_ws(value):
    """ Remove whitespace from a string """
    return str(NO_WHITESPACE_REGEX.sub('-', value))


def slugify(value):
    """
    Normalizes string, removes non-alpha characters,
    and converts spaces to hyphens. Note: case is preserved.
    """
    value = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value

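
# A quick illustration of slugify's behaviour (hypothetical inputs):
#   slugify("Fancy Thing! (v2)")  ->  "Fancy-Thing-v2"
#   slugify("Überhänge")          ->  "Uberhange"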

class Grouping:
    """ Holds details of a group of things for download.

    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.
    """

    def __init__(self):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        # hrefs look like "/thing:1234567"; the ID is the part after the colon.
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13 items) grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection. """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {}".format(idx))
            Thing(thing).download(self.download_dir)


class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory):
        Grouping.__init__(self)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION


class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory):
        Grouping.__init__(self)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION


class Thing:
    """ An individual design on Thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            req = requests.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')
        try:
            self.title = slugify(soup.find_all('h1')[0].text.strip())
        except IndexError:
            logging.warning("No title found for thing {}".format(self.thing_id))
            self.title = self.thing_id

        if req.status_code == 404:
            logging.warning(
                "404 for thing {} - DMCA or invalid number?".format(self.thing_id))
            return

        if req.status_code > 299:
            logging.warning(
                "Bad status code {} for thing {} - try again later?".format(
                    req.status_code, self.thing_id))
            return

        self.old_download_dir = os.path.join(base_dir, self.title)
        self.download_dir = os.path.join(
            base_dir, "{} - {}".format(self.thing_id, self.title))

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            if os.path.exists(self.old_download_dir):
                logging.info("Found previous style download directory. Moving it")
                # copyfile() cannot copy a directory; rename actually moves it.
                os.rename(self.old_download_dir, self.download_dir)
            else:
                # Not yet downloaded
                self._parsed = True
                return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were recorded.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Timestamp file vanished between the existence check and the open.
            logging.info(
                "Missing timestamp file. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp; let's see if there is anything new to get.
        # ISO-8601 timestamps sort correctly under plain string comparison.
        file_links = soup.find_all('a', {'class': 'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            logging.debug("Checking {} (updated {})".format(
                file_link["title"], timestamp))
            if timestamp > self.last_time:
                logging.info(
                    "Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return

        if not self._needs_download:
            logging.info("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # Edge case: old style dir without timestamp.
                logging.warning(
                    "Old style download dir found for {}".format(self.title))
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            try:
                new_last_time = file_links[0].find_all('time')[0]['datetime']
            except (IndexError, KeyError) as exception:
                # This used to drop into an interactive console; fail cleanly instead.
                logging.error("Unable to read timestamps for thing {}: {}".format(
                    self.thing_id, exception))
                return

            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Found file {} from {}".format(
                    file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Checking {} (updated {})".format(
                    file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(
                        file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"])
                 for x in new_file_links]
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(file_links)))
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                logging.debug("Downloading {} from {} to {}".format(
                    name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # People like images
        image_dir = os.path.join(self.download_dir, 'images')
        imagelinks = soup.find_all('span', {'class': 'gallery-slider'})[0] \
                         .find_all('div', {'class': 'gallery-photo'})
        logging.info("Downloading {} images.".format(len(imagelinks)))
        try:
            os.mkdir(image_dir)
            for imagelink in imagelinks:
                # Prefer the largest variant present; .get() returns None for
                # missing attributes where indexing would raise KeyError.
                url = next(filter(None, [imagelink.get(x) for x in ['data-full',
                                                                    'data-large',
                                                                    'data-medium',
                                                                    'data-thumb']]), None)
                if not url:
                    logging.warning(
                        "Unable to find any urls for {}".format(imagelink))
                    continue

                filename = os.path.basename(url)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(url)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # Instructions are good too.
        logging.info("Downloading readme")
        try:
            readme_txt = soup.find('meta', property='og:description')['content']
            with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        # Best get some licenses
        logging.info("Downloading license")
        try:
            license_txt = soup.find('div', {'class': 'license-text'}).text
            if license_txt:
                with open(os.path.join(self.download_dir, 'license.txt'), 'w') as license_handle:
                    license_handle.write("{}\n".format(license_txt))
        except AttributeError as exception:
            logging.warning("No license? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))


def do_batch(batch_file, download_dir):
    """ Read a file in line by line, parsing each as a set of calls to this script. """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")


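# A batch file (format inferred from the parsing above) holds one instruction
# per line. Since lines are split on whitespace, multi-word collection names
# cannot be expressed. Hypothetical example:
#
#   thing 1234567
#   collection some_user favourites
#   user some_user
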
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space separated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

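    # Example invocations (illustrative names and IDs):
    #   thingy_grabber.py thing 1234567
    #   thingy_grabber.py -d ~/downloads user some_designer
    #   thingy_grabber.py -l debug collection some_user favourites
    #   thingy_grabber.py batch batch.txt
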
    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory).download()
    if args.subcommand == "thing":
        for thing in args.things:
            Thing(thing).download(args.directory)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory)


if __name__ == "__main__":
    main()