#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
from bs4 import BeautifulSoup

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"

ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it ever changes it will break the
# paging logic below.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

VERBOSE = False

def strip_ws(value):
    """ Replace runs of whitespace (and hyphens) in a string with single hyphens. """
    return str(NO_WHITESPACE_REGEX.sub('-', value))
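
# For illustration (hypothetical inputs): strip_ws("My Things") -> "My-Things";
# runs containing existing hyphens collapse too: strip_ws("a - b") -> "a-b".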

def slugify(value):
    """
    Normalises the string, strips non-alphanumeric characters, and
    converts runs of whitespace and hyphens to single hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value
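
# For illustration (hypothetical input): slugify("Mr Späcial's Design!")
# -> "Mr-Spacials-Design"; accents are flattened and case is preserved.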

class Grouping:
    """ Holds details of a group of things. """
    def __init__(self):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These two should be set by child classes.
        self.url = None
        self.download_dir = None

    def _get_small_grouping(self, req):
        """ Handle groupings small enough to fit on a single page (no AJAX paging). """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        # hrefs look like "/thing:1234567"; keep just the numeric ID.
        self.things = [x['href'].split(':')[1] for x in links]

        return self.things

    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        if VERBOSE:
            print("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small grouping (fewer than 13 things). Pull the list
            # straight from this request.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(URL_COLLECTION, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the things in this grouping. """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError("No download_dir set - object not initialised properly?")

        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            print("Target directory {} already exists. Assuming a resume."
                  .format(self.download_dir))
        os.chdir(self.download_dir)
        for thing in self.things:
            download_thing(thing)
        os.chdir(base_dir)

class Collection(Grouping):
    """ Holds details of a collection. """
    def __init__(self, user, name):
        Grouping.__init__(self)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(
            os.getcwd(), "{}-{}".format(slugify(self.user), slugify(self.name)))

class Designs(Grouping):
    """ Holds details of all of a user's designs. """
    def __init__(self, user):
        Grouping.__init__(self)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            os.getcwd(), "{} designs".format(slugify(self.user)))
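
# For illustration (hypothetical user and collection names):
#   Collection("some_user", "Things to Print").download()
#   Designs("some_user").download()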

def download_thing(thing):
    """ Downloads all the files for a given thing. """
    file_url = "{}/thing:{}/files".format(URL_BASE, thing)
    file_req = requests.get(file_url)
    file_soup = BeautifulSoup(file_req.text, features='lxml')

    title = slugify(file_soup.find_all('h1')[0].text.strip())
    base_dir = os.getcwd()
    try:
        os.mkdir(title)
    except FileExistsError:
        pass

    print("Downloading {} ({})".format(thing, title))
    os.chdir(title)
    last_time = None

    try:
        with open('timestamp.txt', 'r') as timestamp_handle:
            last_time = timestamp_handle.readlines()[0]
        if VERBOSE:
            print("last downloaded version: {}".format(last_time))
    except FileNotFoundError:
        # Not run on this thing before - no timestamp to compare against.
        if VERBOSE:
            print("No timestamp found, downloading all files.")
        last_time = None

    file_links = file_soup.find_all('a', {'class': 'file-download'})
    new_last_time = last_time
    new_file_links = []

    for file_link in file_links:
        timestamp = file_link.find_all('time')[0]['datetime']
        if VERBOSE:
            print("Checking {} (updated {})".format(file_link["title"], timestamp))
        # Relies on the datetime strings sorting lexicographically,
        # which holds for ISO-8601 timestamps.
        if not last_time or timestamp > last_time:
            new_file_links.append(file_link)
        if not new_last_time or timestamp > new_last_time:
            new_last_time = timestamp

    if last_time and new_last_time <= last_time:
        print("Thing already downloaded. Skipping.")
        os.chdir(base_dir)
        return
    files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]

    try:
        for url, name in files:
            if VERBOSE:
                print("Downloading {} from {}".format(name, url))
            data_req = requests.get(url)
            with open(name, 'wb') as handle:
                handle.write(data_req.content)
        # Record the newest timestamp seen so the next run can skip
        # unchanged files.
        if new_last_time:
            with open('timestamp.txt', 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
    except Exception as exception:
        print("Failed to download {} - {}".format(name, exception))
        os.chdir(base_dir)
        os.rename(title, "{}_failed".format(title))
        return

    os.chdir(base_dir)

def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
    collection_parser.add_argument("owner", help="The owner of the collection to get")
    collection_parser.add_argument("collection", help="The name of the collection to get")
    thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")
    user_parser = subparsers.add_parser("user", help="Download all things by a user")
    user_parser.add_argument("user", help="The user to get the designs of")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    global VERBOSE
    VERBOSE = args.verbose
    if args.subcommand == "collection":
        collection = Collection(args.owner, args.collection)
        print(collection.get())
        collection.download()
    elif args.subcommand == "thing":
        download_thing(args.thing)
    elif args.subcommand == "user":
        designs = Designs(args.user)
        print(designs.get())
        designs.download()


if __name__ == "__main__":
    main()