[clinton/thingy_grabber.git] / thingy_grabber.py

#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
from bs4 import BeautifulSoup

# Root of the site; page and download URLs in this file are built from it.
URL_BASE = "https://www.thingiverse.com"
# Endpoint that Collection.get_collection() POSTs to, once per page of results.
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"

# Patterns searched for in the text of a collection page to extract its paging
# details. Each captures the run of digits following the quoted key; because
# the quantifier is '*', the captured group may be an empty string.
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
# Matches a run of whitespace and/or hyphens; used to collapse such a run
# into a single '-'.
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# Set from the -v/--verbose command line flag in main(); when true, extra
# progress messages are printed.
VERBOSE = False

def strip_ws(value):
    """ Collapse every run of whitespace and/or hyphens in value into one '-'. """
    collapsed = NO_WHITESPACE_REGEX.sub('-', value)
    return str(collapsed)

def slugify(value):
    """
    Reduce a string to an ASCII slug.

    The value is NFKD-normalised and anything that cannot be encoded as ASCII
    is dropped. Characters other than word characters, whitespace and hyphens
    are then removed, the ends are stripped, and each remaining run of
    whitespace and/or hyphens becomes a single hyphen. Case is left unchanged.
    """
    ascii_bytes = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
    cleaned = re.sub(r'[^\w\s-]', '', ascii_bytes.decode()).strip()
    return str(NO_WHITESPACE_REGEX.sub('-', cleaned))

class Collection:
    """ Holds details of a collection.

    A collection is identified by the user that owns it and its name. The
    ids of the things it contains are fetched lazily by get_collection()
    and kept in self.things.
    """
    def __init__(self, user, name):
        self.user = user
        self.name = name
        # Thing ids scraped from the collection pages.
        self.things = []
        # Paging details scraped from the collection page by get_collection();
        # they keep these initial values for small collections.
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None

    @staticmethod
    def _extract_thing_ids(html):
        """ Return the thing ids referenced by the card links in html. """
        soup = BeautifulSoup(html, features='lxml')
        links = soup.find_all('a', {'class':'card-img-holder'})
        # Keep the text after the first ':' of each href (presumably the
        # hrefs are of the form "/thing:<id>").
        return [x['href'].split(':')[1] for x in links]

    def _get_small_collection(self, req):
        """ Handle small collections: every thing is listed in req itself. """
        self.things = self._extract_thing_ids(req.text)

        return self.things

    def get_collection(self):
        """ Retrieve the things of the collection.

        Returns the list of thing ids. The list is cached in self.things,
        so a non-empty result is returned directly on later calls.
        """
        if self.things:
            # We've already done it.
            return self.things

        # Get the internal details of the collection.
        c_url = "{}/{}/collections/{}".format(URL_BASE, self.user, strip_ws(self.name))
        if VERBOSE:
            print("Querying {}".format(c_url))
        c_req = requests.get(c_url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items collection. Pull the list from this req.
            return self._get_small_collection(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        # NOTE: per_page is parsed above but the request still sends the
        # fixed value '12'.
        parameters = {
            'base_url':"{}/collections/{}".format(self.user, self.name),
            'page':'1',
            'per_page':'12',
            'id':self.req_id
        }
        # Pages are numbered from 1 up to and including last_page.
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(URL_COLLECTION, data=parameters)
            self.things += self._extract_thing_ids(req.text)

        return self.things

    def download(self):
        """ Downloads all the files in a collection.

        The things are saved below "<user>-<name>" (both slugified) in the
        current working directory. An existing directory is reused. The
        working directory is restored before returning, even on error.
        """
        if not self.things:
            self.get_collection()
        base_dir = os.getcwd()
        new_dir = "{}-{}".format(slugify(self.user), slugify(self.name))
        target_dir = os.path.join(base_dir, new_dir)
        try:
            os.mkdir(target_dir)
        except FileExistsError:
            print("Target directory {} already exists. Assuming a resume.".format(new_dir))
        # download_thing() works relative to the current directory.
        os.chdir(target_dir)
        try:
            for thing in self.things:
                download_thing(thing)
        finally:
            # Do not leave the process inside the collection directory.
            os.chdir(base_dir)


def download_thing(thing):
    """ Downloads all the files for a given thing.

    Fetches the thing's files page and creates (or reuses) a directory named
    after the slugified page title inside the current working directory.
    Every file whose timestamp is newer than the one stored in that
    directory's timestamp.txt is saved there, and timestamp.txt is then
    updated with the newest timestamp seen. If nothing is newer the thing is
    skipped. If a download fails the directory is renamed to
    "<title>_failed". The working directory is restored on every exit path.

    thing -- the thing id used to build the "/thing:<id>/files" URL.
    """
    file_url = "{}/thing:{}/files".format(URL_BASE, thing)
    file_req = requests.get(file_url)
    file_soup = BeautifulSoup(file_req.text, features='lxml')

    title = slugify(file_soup.find_all('h1')[0].text.strip())
    base_dir = os.getcwd()
    try:
        os.mkdir(title)
    except FileExistsError:
        if VERBOSE:
            print('Directory for thing already exists, checking for update.')

    print("Downloading {} ({})".format(thing, title))
    os.chdir(title)
    failed = False

    try:
        last_time = None
        try:
            with open('timestamp.txt', 'r') as timestamp_handle:
                # Only the first line matters; an empty file counts as
                # "never downloaded".
                last_time = timestamp_handle.readline().strip() or None
            if VERBOSE:
                print("last downloaded version: {}".format(last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            last_time = None

        file_links = file_soup.find_all('a', {'class':'file-download'})
        new_last_time = last_time
        new_file_links = []

        # Timestamps are compared as strings.
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            if VERBOSE:
                print("Checking {} (updated {})".format(file_link["title"], timestamp))
            if not last_time or timestamp > last_time:
                new_file_links.append(file_link)
            if not new_last_time or timestamp > new_last_time:
                new_last_time = timestamp

        if last_time and new_last_time <= last_time:
            print("Thing already downloaded. Skipping.")
            return

        # The title comes from the remote page; keep only its final path
        # component so a file can never be written outside this directory.
        files = [("{}{}".format(URL_BASE, x['href']), os.path.basename(x["title"]))
                 for x in new_file_links]

        # Initialised so the failure message works even when no file was started.
        name = None
        try:
            for url, name in files:
                if VERBOSE:
                    print("Downloading {} from {}".format(name, url))
                data_req = requests.get(url)
                # Do not save an HTTP error page as if it were the file.
                data_req.raise_for_status()
                with open(name, 'wb') as handle:
                    handle.write(data_req.content)
            # now write timestamp (there is none if the thing has no files)
            if new_last_time:
                with open('timestamp.txt', 'w') as timestamp_handle:
                    timestamp_handle.write(new_last_time)
        except Exception as exception:
            # Best effort: report the failure and mark the directory below so
            # the rest of a collection can still be processed.
            print("Failed to download {} - {}".format(name, exception))
            failed = True
    finally:
        os.chdir(base_dir)

    if failed:
        os.rename(title, "{}_failed".format(title))

def main():
    """ Command line entry point.

    Parses the arguments, records the verbosity flag in the module level
    VERBOSE, and then downloads either a whole collection or a single thing
    depending on the chosen sub-command. With no sub-command the help text
    is printed and the process exits with status 1.
    """
    global VERBOSE

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    commands = arg_parser.add_subparsers(help="Type of thing to download", dest="subcommand")

    collection_cmd = commands.add_parser('collection', help="Download an entire collection")
    collection_args = (
        ("owner", "The owner of the collection to get"),
        ("collection", "The name of the collection to get"),
    )
    for arg_name, arg_help in collection_args:
        collection_cmd.add_argument(arg_name, help=arg_help)

    thing_cmd = commands.add_parser('thing', help="Download a single thing.")
    thing_cmd.add_argument("thing", help="Thing ID to download")

    options = arg_parser.parse_args()
    chosen = options.subcommand
    if not chosen:
        arg_parser.print_help()
        sys.exit(1)

    VERBOSE = options.verbose

    if chosen.startswith("collection"):
        grabber = Collection(options.owner, options.collection)
        # Show the list of thing ids before fetching them.
        print(grabber.get_collection())
        grabber.download()
    if chosen == "thing":
        download_thing(options.thing)


# Run the command line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Commit	Line	Data
975060c9 OM	1	#!/usr/bin/env python3
	2	"""
	3	Thingiverse bulk downloader
	4	"""
	5
	6	import re
4a98996b	7	import sys
975060c9 OM	8	import os
	9	import argparse
	10	import unicodedata
	11	import requests
	12	from bs4 import BeautifulSoup
	13
	14	URL_BASE = "https://www.thingiverse.com"
	15	URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
	16
	17	ID_REGEX = re.compile(r'"id":(\d*),')
	18	TOTAL_REGEX = re.compile(r'"total":(\d*),')
	19	LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
	20	# This appears to be fixed at 12, but if it changes would screw the rest up.
	21	PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
dd8c35f4 OM	22	NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
	23
	24	VERBOSE = False
	25
	26	def strip_ws(value):
	27	""" Remove whitespace from a string """
	28	return str(NO_WHITESPACE_REGEX.sub('-', value))
975060c9 OM	29
	30	def slugify(value):
	31	"""
	32	Normalizes string, converts to lowercase, removes non-alpha characters,
	33	and converts spaces to hyphens.
	34	"""
	35	value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
	36	value = str(re.sub(r'[^\w\s-]', '', value).strip())
dd8c35f4 OM	37	value = str(NO_WHITESPACE_REGEX.sub('-', value))
dd8c35f4 OM	38	#value = str(re.sub(r'[-\s]+', '-', value))
975060c9 OM	39	return value
	40
	41	class Collection:
	42	""" Holds details of a collection. """
	43	def __init__(self, user, name):
	44	self.user = user
	45	self.name = name
	46	self.things = []
	47	self.total = 0
	48	self.req_id = None
	49	self.last_page = 0
	50	self.per_page = None
	51
	52	def _get_small_collection(self, req):
	53	""" Handle small collections """
	54	soup = BeautifulSoup(req.text, features='lxml')
	55	links = soup.find_all('a', {'class':'card-img-holder'})
	56	self.things = [x['href'].split(':')[1] for x in links]
	57
	58	return self.things
	59
	60	def get_collection(self):
	61	""" retrieve the things of the collection. """
	62	if self.things:
	63	# We've already done it.
	64	return self.things
	65
	66	# Get the internal details of the collection.
dd8c35f4 OM	67	c_url = "{}/{}/collections/{}".format(URL_BASE, self.user, strip_ws(self.name))
	68	if VERBOSE:
	69	print("Querying {}".format(c_url))
975060c9 OM	70	c_req = requests.get(c_url)
	71	total = TOTAL_REGEX.search(c_req.text)
	72	if total is None:
	73	# This is a small (<13) items collection. Pull the list from this req.
	74	return self._get_small_collection(c_req)
	75	self.total = total.groups()[0]
	76	self.req_id = ID_REGEX.search(c_req.text).groups()[0]
	77	self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
	78	self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
	79	parameters = {
	80	'base_url':"{}/collections/{}".format(self.user, self.name),
	81	'page':'1',
	82	'per_page':'12',
	83	'id':self.req_id
	84	}
	85	for current_page in range(1, self.last_page + 1):
	86	parameters['page'] = current_page
	87	req = requests.post(URL_COLLECTION, parameters)
	88	soup = BeautifulSoup(req.text, features='lxml')
	89	links = soup.find_all('a', {'class':'card-img-holder'})
	90	self.things += [x['href'].split(':')[1] for x in links]
	91
	92	return self.things
	93
	94	def download(self):
	95	""" Downloads all the files in a collection """
	96	if not self.things:
	97	self.get_collection()
	98	base_dir = os.getcwd()
	99	new_dir = "{}-{}".format(slugify(self.user), slugify(self.name))
	100	target_dir = os.path.join(base_dir, new_dir)
	101	try:
	102	os.mkdir(target_dir)
	103	except FileExistsError:
	104	print("Target directory {} already exists. Assuming a resume.".format(new_dir))
	105	os.chdir(target_dir)
	106	for thing in self.things:
	107	download_thing(thing)
	108
	109
	110	def download_thing(thing):
	111	""" Downloads all the files for a given thing. """
	112	file_url = "{}/thing:{}/files".format(URL_BASE, thing)
	113	file_req = requests.get(file_url)
	114	file_soup = BeautifulSoup(file_req.text, features='lxml')
	115
	116	title = slugify(file_soup.find_all('h1')[0].text.strip())
	117	base_dir = os.getcwd()
	118	try:
	119	os.mkdir(title)
	120	except FileExistsError:
e36c2a07 OM	121	pass
e36c2a07 OM	122
975060c9 OM	123	print("Downloading {} ({})".format(thing, title))
975060c9 OM	124	os.chdir(title)
e36c2a07 OM	125	last_time = None
	126
	127	try:
4a98996b OM	128	with open('timestamp.txt', 'r') as timestamp_handle:
4a98996b OM	129	last_time = timestamp_handle.readlines()[0]
e36c2a07 OM	130	if VERBOSE:
	131	print("last downloaded version: {}".format(last_time))
	132	except FileNotFoundError:
	133	# Not run on this thing before.
	134	if VERBOSE:
	135	print('Directory for thing already exists, checking for update.')
	136	last_time = None
975060c9 OM	137
975060c9 OM	138	file_links = file_soup.find_all('a', {'class':'file-download'})
e36c2a07 OM	139	new_last_time = last_time
	140	new_file_links = []
	141
	142	for file_link in file_links:
	143	timestamp = file_link.find_all('time')[0]['datetime']
	144	if VERBOSE:
	145	print("Checking {} (updated {})".format(file_link["title"], timestamp))
	146	if not last_time or timestamp > last_time:
	147	new_file_links.append(file_link)
	148	if not new_last_time or timestamp > new_last_time:
	149	new_last_time = timestamp
	150
	151	if last_time and new_last_time <= last_time:
	152	print("Thing already downloaded. Skipping.")
	153	files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]
975060c9	154
a7152c35 OM	155	try:
a7152c35 OM	156	for url, name in files:
e36c2a07 OM	157	if VERBOSE:
e36c2a07 OM	158	print("Downloading {} from {}".format(name, url))
a7152c35 OM	159	data_req = requests.get(url)
	160	with open(name, 'wb') as handle:
	161	handle.write(data_req.content)
e36c2a07	162	# now write timestamp
4a98996b OM	163	with open('timestamp.txt', 'w') as timestamp_handle:
4a98996b OM	164	timestamp_handle.write(new_last_time)
a7152c35 OM	165	except Exception as exception:
	166	print("Failed to download {} - {}".format(name, exception))
	167	os.chdir(base_dir)
	168	os.rename(title, "{}_failed".format(title))
	169	return
	170
e36c2a07	171
975060c9 OM	172	os.chdir(base_dir)
	173
	174	def main():
	175	""" Entry point for script being run as a command. """
	176	parser = argparse.ArgumentParser()
dd8c35f4	177	parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
4a98996b OM	178	subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")
	179	collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
	180	collection_parser.add_argument("owner", help="The owner of the collection to get")
	181	collection_parser.add_argument("collection", help="The name of the collection to get")
	182	thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
	183	thing_parser.add_argument("thing", help="Thing ID to download")
	184
975060c9	185	args = parser.parse_args()
4a98996b OM	186	if not args.subcommand:
	187	parser.print_help()
	188	sys.exit(1)
dd8c35f4 OM	189	global VERBOSE
dd8c35f4 OM	190	VERBOSE = args.verbose
4a98996b OM	191	if args.subcommand.startswith("collection"):
	192	collection = Collection(args.owner, args.collection)
	193	print(collection.get_collection())
	194	collection.download()
	195	if args.subcommand == "thing":
	196	download_thing(args.thing)
975060c9	197
975060c9 OM	198
	199	if __name__ == "__main__":
	200	main()