| 1 | #!/usr/bin/env python3 |
| 2 | """ |
| 3 | Thingiverse bulk downloader |
| 4 | """ |
| 5 | |
| 6 | import re |
| 7 | import sys |
| 8 | import os |
| 9 | import argparse |
| 10 | import unicodedata |
| 11 | import requests |
| 12 | from shutil import copyfile |
| 13 | from bs4 import BeautifulSoup |
| 14 | |
| 15 | URL_BASE = "https://www.thingiverse.com" |
| 16 | URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things" |
| 17 | USER_COLLECTION = URL_BASE + "/ajax/user/designs" |
| 18 | |
| 19 | ID_REGEX = re.compile(r'"id":(\d*),') |
| 20 | TOTAL_REGEX = re.compile(r'"total":(\d*),') |
| 21 | LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),') |
# This appears to be fixed at 12; if Thingiverse ever changes it, the pagination logic below will break.
| 23 | PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),') |
| 24 | NO_WHITESPACE_REGEX = re.compile(r'[-\s]+') |
| 25 | |
| 26 | VERSION = "0.4.0" |
| 27 | |
| 28 | VERBOSE = False |
| 29 | |
def strip_ws(value):
    """ Replace each run of whitespace and/or hyphens with a single hyphen.

    Note: despite the name, whitespace is replaced (with '-'), not removed.
    Used to build collection URLs from collection names.
    """
    # Same pattern as the module-level NO_WHITESPACE_REGEX; inlined so the
    # function is self-contained.  (re.sub already returns a str, so the
    # old str() wrapper was redundant.)
    return re.sub(r'[-\s]+', '-', value)
| 33 | |
def slugify(value):
    """ Normalise a string for use as a file/directory name.

    Applies NFKD unicode normalisation and drops non-ASCII characters,
    strips anything that is not alphanumeric, underscore, whitespace or
    hyphen, then collapses whitespace/hyphen runs into single hyphens.

    Note: unlike the classic Django slugify this does NOT lowercase the
    input - existing download directories depend on case being preserved.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = re.sub(r'[^\w\s-]', '', value).strip()
    # Same pattern as the module-level NO_WHITESPACE_REGEX; inlined so the
    # function is self-contained.
    return re.sub(r'[-\s]+', '-', value)
| 44 | |
class Grouping:
    """ Holds details of a group of things for download.

    Effectively (although not formally) an abstract base class - use
    Collection or Designs rather than instantiating this directly.
    """
    def __init__(self):
        # Thing ids in this grouping; populated lazily by get().
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Child classes are responsible for filling these in.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle groupings small enough to fit on a single page. """
        page = BeautifulSoup(req.text, features='lxml')
        cards = page.find_all('a', {'class':'card-img-holder'})
        self.things = [card['href'].split(':')[1] for card in cards]

        return self.things

    def get(self):
        """ Retrieve (and cache) the list of thing ids in this grouping. """
        if self.things:
            # Already fetched - use the cached list.
            return self.things

        # Guard against use of the (effectively abstract) base class.
        if not self.url:
            print("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Fetch the grouping's landing page to discover pagination details.
        if VERBOSE:
            print("Querying {}".format(self.url))
        response = requests.get(self.url)
        total_match = TOTAL_REGEX.search(response.text)
        if total_match is None:
            # Small (<13 item) groupings list everything on the first page.
            return self._get_small_grouping(response)
        self.total = total_match.groups()[0]
        self.req_id = ID_REGEX.search(response.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(response.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(response.text).groups()[0]
        parameters = {
            'base_url':self.url,
            'page':'1',
            'per_page':'12',
            'id':self.req_id
        }
        page_number = 1
        while page_number <= self.last_page:
            parameters['page'] = page_number
            page_req = requests.post(self.collection_url, parameters)
            page_soup = BeautifulSoup(page_req.text, features='lxml')
            cards = page_soup.find_all('a', {'class':'card-img-holder'})
            self.things.extend(card['href'].split(':')[1] for card in cards)
            page_number += 1

        return self.things

    def download(self):
        """ Downloads all the things in this grouping. """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError("No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            print("Target directory {} already exists. Assuming a resume."
                  .format(self.download_dir))
        if VERBOSE:
            print("Downloading {} things.".format(self.total))
        for thing in self.things:
            Thing(thing).download(self.download_dir)
| 125 | |
class Collection(Grouping):
    """ A single named collection belonging to a Thingiverse user. """
    def __init__(self, user, name, directory):
        super().__init__()
        self.user = user
        self.name = name
        # Collection pages live at /<user>/collections/<name-with-hyphens>.
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        target = "{}-{}".format(slugify(self.user), slugify(self.name))
        self.download_dir = os.path.join(directory, target)
        self.collection_url = URL_COLLECTION
| 137 | |
class Designs(Grouping):
    """ All of a single Thingiverse user's own designs. """
    def __init__(self, user, directory):
        super().__init__()
        self.user = user
        # Design listings live at /<user>/designs.
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        target = "{} designs".format(slugify(self.user))
        self.download_dir = os.path.join(directory, target)
        self.collection_url = USER_COLLECTION
| 146 | |
class Thing:
    """ An individual design on thingiverse.

    Handles deciding whether the thing has already been downloaded,
    resuming/updating a previous download, and recording a timestamp so
    later runs can detect new files.
    """
    def __init__(self, thing_id):
        self.thing_id = thing_id
        # Timestamp string recorded by the last successful download
        # (contents of timestamp.txt), or None if never downloaded.
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        # Raw HTML of the thing's /files page (set by _parse).
        self.text = None
        # Slugified title, used as the directory name (set by _parse).
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done.

        Fetches the thing's file-listing page, derives self.title and
        self.download_dir under base_dir, and sets self._needs_download
        by comparing the page's file timestamps against the locally
        recorded one.
        """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        req = requests.get(url)
        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')

        self.title = slugify(soup.find_all('h1')[0].text.strip())
        self.download_dir = os.path.join(base_dir, self.title)

        if not os.path.exists(self.download_dir):
            # Not yet downloaded - everything is new.
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were recorded.
            if VERBOSE:
                print("Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                # read().strip() rather than readlines()[0]: an empty
                # timestamp file then yields '' (which forces a
                # redownload below) instead of raising IndexError.
                self.last_time = timestamp_handle.read().strip()
            if VERBOSE:
                print("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            if VERBOSE:
                print("Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp, lets see if there is anything new to get.
        # Timestamps are ISO-8601-style strings, so lexical comparison works.
        file_links = soup.find_all('a', {'class':'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            if VERBOSE:
                print("Checking {} (updated {})".format(file_link["title"], timestamp))
            if timestamp > self.last_time:
                print("Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        print("Found no new files for {}".format(self.title))
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all (new) files for this thing into base_dir. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._needs_download:
            if VERBOSE:
                print("{} already downloaded - skipping.".format(self.title))
            return

        # Get the list of files to download *before* touching the
        # filesystem: if the page yields no parseable file links (e.g. a
        # site layout change), bail out here rather than crashing after
        # having renamed the existing download directory.  (Previously an
        # empty list raised IndexError on a fresh download, or wrote a
        # None timestamp - marking the directory "_failed" - on an update.)
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class':'file-download'})
        if not file_links:
            print("No file links found for {} - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                print("Old style download dir found for {}".format(self.title))
                os.rename(self.download_dir, "{}_old".format(self.download_dir))
            else:
                # Keep the previous download aside so unchanged files can
                # be copied from it instead of re-fetched.
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            new_last_time = file_links[0].find_all('time')[0]['datetime']
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Found file {} from {}".format(file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            # Sort files into already-downloaded vs new/updated, tracking
            # the newest timestamp seen for the new timestamp record.
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Checking {} (updated {})".format(file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        if VERBOSE:
            print("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                if VERBOSE:
                    print("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                print("Unable to find {} in old archive, redownloading".format(file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                if VERBOSE:
                    print("Downloading {} from {} to {}".format(name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            # Broad catch is deliberate: any failure marks the whole
            # directory as failed so a later run can retry it cleanly.
            print("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        if VERBOSE:
            print("Download of {} finished".format(self.title))
| 308 | |
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    parser.add_argument("-d", "--directory", help="Target directory to download into")
    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")

    collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
    collection_parser.add_argument("owner", help="The owner of the collection to get")
    collection_parser.add_argument("collection", help="The name of the collection to get")

    thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")

    user_parser = subparsers.add_parser("user", help="Download all things by a user")
    user_parser.add_argument("user", help="The user to get the designs of")

    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        # No subcommand given - show usage and bail.
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    global VERBOSE
    VERBOSE = args.verbose

    if args.subcommand == "collection":
        collection = Collection(args.owner, args.collection, args.directory)
        print(collection.get())
        collection.download()
    elif args.subcommand == "thing":
        Thing(args.thing).download(args.directory)
    elif args.subcommand == "user":
        designs = Designs(args.user, args.directory)
        print(designs.get())
        designs.download()
    elif args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))

if __name__ == "__main__":
    main()