add error handling
[clinton/thingy_grabber.git] / thingy_grabber.py
CommitLineData
975060c9
OM
1#!/usr/bin/env python3
2"""
3Thingiverse bulk downloader
4"""
5
6import re
4a98996b 7import sys
975060c9
OM
8import os
9import argparse
10import unicodedata
11import requests
fa2f3251 12import logging
3c82f75b 13from shutil import copyfile
975060c9
OM
14from bs4 import BeautifulSoup
15
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

# Regexes that pull pagination metadata out of the JSON-ish payload embedded
# in thingiverse listing pages (used by Grouping.get()).
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')

# Collapses runs of whitespace and hyphens into a single hyphen.
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

VERSION = "0.5.1"
28
db8066ec 29
dd8c35f4
OM
def strip_ws(value):
    """ Remove whitespace from a string """
    collapsed = NO_WHITESPACE_REGEX.sub('-', value)
    return str(collapsed)
975060c9 33
dbdb1782 34
975060c9
OM
def slugify(value):
    """
    Normalizes string for use as a file/directory name: converts to ASCII,
    removes non-alphanumeric characters, and converts whitespace runs to
    hyphens.

    Note: this does NOT lowercase the input (case is preserved).
    """
    # Decompose accented characters and drop anything outside ASCII.
    value = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    # Keep only word characters, whitespace and hyphens.
    value = re.sub(r'[^\w\s-]', '', value).strip()
    # Collapse whitespace/hyphen runs to a single hyphen.
    value = NO_WHITESPACE_REGEX.sub('-', value)
    return value
46
dbdb1782 47
class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self):
        # Thing ids belonging to this grouping (filled in by get()).
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        # Small (<13 item) groupings list everything on the first page,
        # so no pagination is needed.
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        try:
            c_req = requests.get(self.url)
        except requests.exceptions.ConnectionError as error:
            # Consistent with Thing._parse: log and carry on rather than
            # letting a network blip kill the whole run.
            logging.error("Unable to connect for {}: {}".format(
                self.url, error))
            return self.things
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        # Walk every page of the AJAX listing, collecting thing ids.
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {}".format(idx))
            Thing(thing).download(self.download_dir)
975060c9 130
dbdb1782 131
3522a3bf
OM
class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory):
        super().__init__()
        self.user = user
        self.name = name
        # The public web page for the collection.
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        # Download target: "<user>-<collection>" under the given directory.
        subdir = "{}-{}".format(slugify(self.user), slugify(self.name))
        self.download_dir = os.path.join(directory, subdir)
        self.collection_url = URL_COLLECTION
3522a3bf 144
dbdb1782 145
3522a3bf
OM
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory):
        super().__init__()
        self.user = user
        # The public designs page for the user.
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        # Download target: "<user> designs" under the given directory.
        folder = "{} designs".format(slugify(self.user))
        self.download_dir = os.path.join(directory, folder)
        self.collection_url = USER_COLLECTION
975060c9 156
dbdb1782 157
3c82f75b
OM
class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        # Timestamp of the newest file in the previous download, if any.
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        # Raw HTML of the thing's /files page (filled in by _parse).
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            req = requests.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')

        try:
            self.title = slugify(soup.find_all('h1')[0].text.strip())
        except IndexError:
            logging.warning(
                "No title found for thing {}".format(self.thing_id))
            self.title = self.thing_id

        if req.status_code == 404:
            logging.warning(
                "404 for thing {} - DMCA or invalid number?".format(self.thing_id))
            return

        if req.status_code > 299:
            logging.warning(
                "bad status code {} for thing {} - try again later?".format(
                    req.status_code, self.thing_id))
            return

        self.download_dir = os.path.join(base_dir, self.title)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            # Not yet downloaded - everything is new.
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Directory from a pre-timestamp version of this script.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp, lets see if there is anything new to get
        # Timestamps are ISO-8601 strings, so lexical comparison works.
        file_links = soup.find_all('a', {'class': 'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            logging.debug("Checking {} (updated {})".format(
                file_link["title"], timestamp))
            if timestamp > self.last_time:
                logging.info(
                    "Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return

        if not self._needs_download:
            # Was print(); use logging so -l controls all output consistently.
            logging.info("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                logging.warning(
                    "Old style download dir found for {}".format(self.title))
                os.rename(self.download_dir,
                          "{}_old".format(self.download_dir))
            else:
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            try:
                new_last_time = file_links[0].find_all('time')[0]['datetime']
            except (IndexError, KeyError) as exception:
                # No files, or changed markup. (Replaces a leftover debug
                # console - bare except with code.interact.) The scan below
                # still collects whatever timestamps it can find.
                logging.warning(
                    "Unable to read timestamp of first file for {}: {}".format(
                        self.thing_id, exception))

            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Found file {} from {}".format(
                    file_link["title"], timestamp))
                # None-safe: new_last_time may not have been set above.
                if new_last_time is None or timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Checking {} (updated {})".format(
                    file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(
                        file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"])
                 for x in new_file_links]
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(file_links)))
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                logging.debug("Downloading {} from {} to {}".format(
                    name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # People like images
        image_dir = os.path.join(self.download_dir, 'images')
        # Guard: some pages have no gallery at all - don't crash on [0].
        galleries = soup.find_all('span', {'class': 'gallery-slider'})
        imagelinks = galleries[0].find_all(
            'div', {'class': 'gallery-photo'}) if galleries else []
        logging.info("Downloading {} images.".format(len(imagelinks)))
        # Initialised so the except handler below can't hit an unbound name
        # if os.mkdir fails before the loop runs.
        filename = None
        try:
            os.mkdir(image_dir)
            for imagelink in imagelinks:
                # Prefer the highest resolution available.
                url = next(filter(None, [imagelink[x] for x in ['data-full',
                                                                'data-large',
                                                                'data-medium',
                                                                'data-thumb']]), None)
                if not url:
                    logging.warning(
                        "Unable to find any urls for {}".format(imagelink))
                    continue

                filename = os.path.basename(url)
                if filename.endswith('stl'):
                    # STL previews are served as rendered PNGs.
                    filename = "{}.png".format(filename)
                image_req = requests.get(url)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            # Was print(); keep the _failed rename so a partial download is
            # never mistaken for a complete one.
            logging.error(
                "Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # instructions are good too.
        logging.info("Downloading readme")
        try:
            readme_txt = soup.find('meta', property='og:description')['content']
            with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        # Best get some licenses
        logging.info("Downloading license")
        try:
            license_txt = soup.find('div', {'class': 'license-text'}).text
            if license_txt:
                with open(os.path.join(self.download_dir, 'license.txt'), 'w') as license_handle:
                    license_handle.write("{}\n".format(license_txt))
        except AttributeError as exception:
            logging.warning("No license? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        try:
            # Now write the timestamp so a future run can resume from here.
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            # Was print(); the rename marks the directory as incomplete.
            logging.error(
                "Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
975060c9 404
dbdb1782 405
1ab49020
OM
def do_batch(batch_file, download_dir):
    """ Read a file in line by line, parsing each as a set of calls to this script.

    Blank lines are skipped; unrecognised instructions are logged and skipped.
    """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip blank lines instead of crashing on command_arr[0].
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir).download()
                continue
            if command_arr[0] == "user":
                # Fixed copy-paste: this used to log "collection".
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
430
dbdb1782 431
975060c9
OM
def main():
    """ Entry point for script being run as a command.

    Parses the command line, configures logging, and dispatches to the
    requested subcommand. Exits with status 1 if no subcommand is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space separated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()
    logging.basicConfig(level=getattr(logging, args.log_level.upper()))

    # Subcommands are mutually exclusive, so dispatch with elif.
    if args.subcommand == "collection":
        for collection in args.collections:
            Collection(args.owner, collection, args.directory).download()
    elif args.subcommand == "thing":
        for thing in args.things:
            Thing(thing).download(args.directory)
    elif args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory).download()
    elif args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    elif args.subcommand == "batch":
        do_batch(args.batch_file, args.directory)
480
975060c9
OM
481
# Script entry point: delegate to the CLI handler.
if __name__ == "__main__":
    main()