move failed downloads sideways
[clinton/thingy_grabber.git] / thingy_grabber.py
CommitLineData
975060c9
OM
1#!/usr/bin/env python3
2"""
3Thingiverse bulk downloader
4"""
5
6import re
7import os
8import argparse
9import unicodedata
10import requests
11from bs4 import BeautifulSoup
12
13URL_BASE = "https://www.thingiverse.com"
14URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
15
16ID_REGEX = re.compile(r'"id":(\d*),')
17TOTAL_REGEX = re.compile(r'"total":(\d*),')
18LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
19# This appears to be fixed at 12, but if it changes would screw the rest up.
20PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
dd8c35f4
OM
21NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
22
23VERBOSE = False
24
def strip_ws(value):
    """Collapse every run of whitespace or hyphens in *value* into one hyphen."""
    # Same pattern as NO_WHITESPACE_REGEX: any run of [-\s] becomes '-'.
    collapsed = re.sub(r'[-\s]+', '-', value)
    return str(collapsed)
975060c9
OM
28
def slugify(value):
    """
    Normalize *value* into a filesystem-safe slug: ASCII-fold accented
    characters, drop non-alphanumerics, and collapse whitespace runs
    into single hyphens.

    NOTE: unlike Django's slugify this does NOT lowercase, so existing
    download directories keep their exact names across runs.
    """
    # ASCII-fold: NFKD splits accents into combining marks, which the
    # ascii/'ignore' encode round-trip then discards.
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    # Keep only word characters, whitespace and hyphens, trimming the ends.
    value = re.sub(r'[^\w\s-]', '', value).strip()
    # Collapse whitespace/hyphen runs to single hyphens (re.sub returns str,
    # so no extra str() wrapping is needed).
    return re.sub(r'[-\s]+', '-', value)
39
class Collection:
    """ Holds details of a collection.

    Lazily scrapes a Thingiverse user's collection page (and its AJAX
    pagination endpoint) to build the list of thing ids, then downloads
    each thing into a per-collection directory.
    """
    def __init__(self, user, name):
        # Thingiverse username that owns the collection.
        self.user = user
        # Human-readable collection name (may contain spaces).
        self.name = name
        # Thing ids found so far; filled lazily by get_collection().
        self.things = []
        # Total thing count as scraped from the page (regex capture, str).
        self.total = 0
        # Internal collection id required by the AJAX list endpoint.
        self.req_id = None
        # Index of the last AJAX page (int); 0 until queried.
        self.last_page = 0
        # Page size reported by the site (str); appears fixed at 12.
        self.per_page = None

    def _get_small_collection(self, req):
        """ Handle small collections """
        # Collections under 13 things render every card in the initial
        # page response, so scrape the links straight from this request.
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class':'card-img-holder'})
        # hrefs end in ".../thing:<id>"; keep the id after the colon.
        self.things = [x['href'].split(':')[1] for x in links]

        return self.things

    def get_collection(self):
        """ retrieve the things of the collection. """
        if self.things:
            # We've already done it.
            return self.things

        # Get the internal details of the collection.
        c_url = "{}/{}/collections/{}".format(URL_BASE, self.user, strip_ws(self.name))
        if VERBOSE:
            print("Querying {}".format(c_url))
        c_req = requests.get(c_url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items collection. Pull the list from this req.
            return self._get_small_collection(c_req)
        # Pagination metadata is embedded as JSON-ish text in the page body.
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url':"{}/collections/{}".format(self.user, self.name),
            'page':'1',
            'per_page':'12',
            'id':self.req_id
        }
        # Walk every AJAX page, accumulating the thing ids from each.
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(URL_COLLECTION, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class':'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get_collection()
        base_dir = os.getcwd()
        # Target directory is "<user>-<collection>", slugified for safety.
        new_dir = "{}-{}".format(slugify(self.user), slugify(self.name))
        target_dir = os.path.join(base_dir, new_dir)
        try:
            os.mkdir(target_dir)
        except FileExistsError:
            # Existing directory is treated as a resumed earlier run.
            print("Target directory {} already exists. Assuming a resume.".format(new_dir))
        # NOTE(review): the process cwd stays inside target_dir afterwards;
        # download_thing() relies on being run from here.
        os.chdir(target_dir)
        for thing in self.things:
            download_thing(thing)
107
108
def download_thing(thing):
    """ Downloads all the files for a given thing.

    Creates a directory named after the thing's slugified title in the
    current working directory and saves every file listed on the thing's
    files page into it.  If any file fails to download, the directory is
    renamed "<title>_failed" so a later resume will retry it.
    """
    file_url = "{}/thing:{}/files".format(URL_BASE, thing)
    file_req = requests.get(file_url)
    file_soup = BeautifulSoup(file_req.text, features='lxml')

    title = slugify(file_soup.find_all('h1')[0].text.strip())
    try:
        os.mkdir(title)
    except FileExistsError:
        # An existing directory means a previous run already fetched this.
        print("Directory for {} ({}) already exists, skipping".format(thing, title))
        return
    print("Downloading {} ({})".format(thing, title))

    file_links = file_soup.find_all('a', {'class':'file-download'})
    files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in file_links]

    name = None  # so the failure message is defined even on an early error
    try:
        for url, name in files:
            data_req = requests.get(url)
            # Treat HTTP errors (404/500/...) as failures instead of
            # silently saving the error page's body as the model file.
            data_req.raise_for_status()
            # Write by joined path rather than chdir-ing into the
            # directory, so the process cwd can never be left dangling.
            with open(os.path.join(title, name), 'wb') as handle:
                handle.write(data_req.content)
    except Exception as exception:
        print("Failed to download {} - {}".format(name, exception))
        # Move the partial download sideways so a resume retries it.
        os.rename(title, "{}_failed".format(title))
        return
140
def main():
    """ Entry point for script being run as a command. """
    global VERBOSE

    argparser = argparse.ArgumentParser()
    argparser.add_argument("owner", help="The owner of the collection to get")
    argparser.add_argument("collection", help="The name of the collection to get")
    argparser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    parsed = argparser.parse_args()
    VERBOSE = parsed.verbose

    # Fetch the thing list, echo it, then pull everything down.
    target = Collection(parsed.owner, parsed.collection)
    print(target.get_collection())
    target.download()


if __name__ == "__main__":
    main()