HCoop Git - clinton/thingy_grabber.git/blame_incremental

... / ...

Commit	Line	Data
	1	#!/usr/bin/env python3
	2	"""
	3	Thingiverse bulk downloader
	4	"""
	5
	6	import re
	7	import sys
	8	import os
	9	import argparse
	10	import unicodedata
	11	import requests
	12	import logging
	13	import multiprocessing
	14	import enum
	15	import datetime
	16	from shutil import copyfile
	17	from dataclasses import dataclass
	18	import py7zr
	19	import glob
	20	import shutil
	21	from io import StringIO
	22	from html.parser import HTMLParser
	23
	24	SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]
	25
	26	# I don't think this is exported by datetime
	27	DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
	28	# Windows cannot handle : in filenames
	29	SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'
	30
	31	API_BASE = "https://api.thingiverse.com"
	32	ACCESS_QP = "access_token={}"
	33	PAGE_QP = "page={}"
	34	API_USER_DESIGNS = API_BASE + "/users/{}/things/?" + ACCESS_QP
	35	API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP
	36
	37	# Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS
	38	API_COLLECTION = API_BASE + "/collections/{}/?" + ACCESS_QP
	39	API_COLLECTION_THINGS = API_BASE + "/collections/{}/things/?" + ACCESS_QP
	40
	41	API_THING_DETAILS = API_BASE + "/things/{}/?" + ACCESS_QP
	42	API_THING_FILES = API_BASE + "/things/{}/files/?" + ACCESS_QP
	43	API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP
	44	API_THING_DOWNLOAD = "/download/?" + ACCESS_QP
	45
	46	DOWNLOADER_COUNT = 1
	47	RETRY_COUNT = 3
	48
	49	MAX_PATH_LENGTH = 250
	50
	51	VERSION = "0.10.5"
	52
	53	TIMESTAMP_FILE = "timestamp.txt"
	54
	55	SESSION = requests.Session()
	56
	57
	58	class MLStripper(HTMLParser):
	59	""" Turns HTML markup into plain text
	60	"""
	61
	62	def error(self, message):
	63	raise ValueError(message)
	64
	65	def __init__(self):
	66	super().__init__()
	67	self.reset()
	68	self.strict = False
	69	self.convert_charrefs = True
	70	self.text = StringIO()
	71
	72	def handle_data(self, d):
	73	self.text.write(d)
	74
	75	def get_data(self):
	76	return self.text.getvalue()
	77
	78	@staticmethod
	79	def strip_tags(html):
	80	s = MLStripper()
	81	s.feed(html)
	82	return s.get_data()
	83
	84
	85	@dataclass
	86	class ThingLink:
	87	thing_id: str
	88	name: str
	89	api_link: str
	90
	91
	92	@dataclass
	93	class FileLink:
	94	name: str
	95	last_update: datetime.datetime
	96	link: str
	97
	98
	99	@dataclass
	100	class ImageLink:
	101	name: str
	102	link: str
	103
	104
	105	class FileLinks:
	106	def __init__(self, initial_links=None):
	107	if initial_links is None:
	108	initial_links = []
	109	self.links = []
	110	self.last_update = None
	111	for link in initial_links:
	112	self.append(link)
	113
	114	def __iter__(self):
	115	return iter(self.links)
	116
	117	def __getitem__(self, item):
	118	return self.links[item]
	119
	120	def __len__(self):
	121	return len(self.links)
	122
	123	def append(self, link):
	124	try:
	125	self.last_update = max(self.last_update, link.last_update)
	126	except TypeError:
	127	self.last_update = link.last_update
	128	self.links.append(link)
	129
	130
	131	class State(enum.Enum):
	132	OK = enum.auto()
	133	FAILED = enum.auto()
	134	ALREADY_DOWNLOADED = enum.auto()
	135
	136
	137	def sanitise_url(url):
	138	""" remove api keys from an url
	139	"""
	140	return re.sub(r'access_token=\w*',
	141	'access_token=***',
	142	url)
	143
	144
	145	def strip_time(date_obj):
	146	""" Takes a datetime object and returns another with the time set to 00:00
	147	"""
	148	return datetime.datetime.combine(date_obj.date(), datetime.time())
	149
	150
	151	def rename_unique(dir_name, target_dir_name):
	152	""" Move a directory sideways to a new name, ensuring it is unique.
	153	"""
	154	target_dir = target_dir_name
	155	inc = 0
	156	while os.path.exists(target_dir):
	157	target_dir = "{}_{}".format(target_dir_name, inc)
	158	inc += 1
	159	os.rename(dir_name, target_dir)
	160	return target_dir
	161
	162
	163	def fail_dir(dir_name):
	164	""" When a download has failed, move it sideways.
	165	"""
	166	return rename_unique(dir_name, "{}_failed".format(dir_name))
	167
	168
	169	def truncate_name(file_name):
	170	""" Ensure the filename is not too long for, well windows basically.
	171	"""
	172	path = os.path.abspath(file_name)
	173	if len(path) <= MAX_PATH_LENGTH:
	174	return path
	175	base, extension = os.path.splitext(path)
	176	inc = 0
	177	new_path = "{}_{}{}".format(base, inc, extension)
	178	while os.path.exists(new_path):
	179	new_path = "{}_{}{}".format(base, inc, extension)
	180	inc += 1
	181	return new_path
	182
	183
	184	def slugify(value):
	185	"""
	186	Normalise string, removes invalid for filename charactersr
	187	and converts string to lowercase.
	188	"""
	189	logging.debug("Sluggyfying {}".format(value))
	190	value = unicodedata.normalize('NFKC', value).lower().strip()
	191	value = re.sub(r'[\\/<>:?*\|"]', '', value)
	192	value = re.sub(r'\.*$', '', value)
	193	return value.strip()
	194
	195
	196	class Downloader(multiprocessing.Process):
	197	"""
	198	Class to handle downloading the things we have found to get.
	199	"""
	200
	201	def __init__(self, thing_queue, download_directory, compress, api_key):
	202	multiprocessing.Process.__init__(self)
	203	# TODO: add parameters
	204	self.thing_queue = thing_queue
	205	self.download_directory = download_directory
	206	self.compress = compress
	207	self.api_key = api_key
	208
	209	def run(self):
	210	""" actual download loop.
	211	"""
	212	while True:
	213	thing_id = self.thing_queue.get()
	214	if thing_id is None:
	215	logging.info("Shutting download queue")
	216	self.thing_queue.task_done()
	217	break
	218	thing = None
	219	if isinstance(thing_id, str):
	220	thing = Thing.from_thing_id(thing_id)
	221	if isinstance(thing_id, ThingLink):
	222	thing = Thing(thing_id)
	223	if not thing:
	224	logging.error("Don't know how to handle thing_id {}".format(thing_id))
	225	else:
	226	logging.info("Handling id {}".format(thing_id))
	227	thing.download(self.download_directory, self.compress, self.api_key)
	228	self.thing_queue.task_done()
	229	return
	230
	231
	232	class Grouping:
	233	""" Holds details of a group of things for download
	234	This is effectively (although not actually) an abstract class
	235	- use Collection or Designs instead.
	236	"""
	237
	238	def __init__(self, quick, compress, api_key):
	239	self.things = []
	240	self.total = 0
	241	self.req_id = None
	242	self.last_page = 0
	243	self.per_page = None
	244	# Should we stop downloading when we hit a known datestamp?
	245	self.quick = quick
	246	self.compress = compress
	247	self.api_key = api_key
	248	# These should be set by child classes.
	249	self.url = None
	250	self.download_dir = None
	251
	252	@property
	253	def get(self):
	254	""" retrieve the things of the grouping. """
	255	if self.things:
	256	# We've already done it.
	257	return self.things
	258
	259	# Check for initialisation:
	260	if not self.url:
	261	logging.error("No URL set - object not initialised properly?")
	262	raise ValueError("No URL set - object not initialised properly?")
	263
	264	# Get the internal details of the grouping.
	265	logging.debug("Querying {}".format(sanitise_url(self.url)))
	266
	267	# follow next links until all items are found
	268	current_url = self.url
	269	while current_url != None:
	270	logging.info("requesting:{}".format(sanitise_url(current_url)))
	271	current_req = SESSION.get(current_url)
	272	current_url = current_req.links.get('next', {}).get('url')
	273	if current_req.status_code != 200:
	274	logging.error(
	275	"Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url),
	276	current_req.text))
	277	else:
	278	current_json = current_req.json()
	279	for thing in current_json:
	280	logging.debug(thing)
	281	self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
	282	logging.info("Found {} things.".format(len(self.things)))
	283	return self.things
	284
	285	def download(self):
	286	""" Downloads all the files in a collection """
	287	if not self.things:
	288	self.get
	289
	290	if not self.download_dir:
	291	raise ValueError(
	292	"No download_dir set - invalidly initialised object?")
	293
	294	try:
	295	os.mkdir(self.download_dir)
	296	except FileExistsError:
	297	logging.info("Target directory {} already exists. Assuming a resume."
	298	.format(self.download_dir))
	299	logging.info("Downloading {} thing(s).".format(self.total))
	300	for idx, thing in enumerate(self.things):
	301	logging.info("Downloading thing {} - {}".format(idx, thing))
	302	return_code = Thing(thing).download(self.download_dir, self.compress, self.api_key)
	303	if self.quick and return_code == State.ALREADY_DOWNLOADED:
	304	logging.info("Caught up, stopping.")
	305	return
	306
	307
	308	class Collection(Grouping):
	309	""" Holds details of a collection. """
	310
	311	def __init__(self, user, name, directory, quick, compress, api_key):
	312	Grouping.__init__(self, quick, compress, api_key)
	313	self.user = user
	314	self.name = name
	315	self.paginated = False
	316	# need to figure out the the ID for the collection
	317	collection_url = API_USER_COLLECTIONS.format(user, api_key)
	318	try:
	319	current_req = SESSION.get(collection_url)
	320	except requests.exceptions.ConnectionError as error:
	321	logging.error("Unable to connect for collections for user {}: {}".format(
	322	self.user, error))
	323	return
	324	if current_req.status_code != 200:
	325	logging.error(
	326	"Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(collection_url),
	327	current_req.text))
	328	return
	329	collection_list = current_req.json()
	330	try:
	331	# case insensitive to retain parity with previous behaviour
	332	collection = [x for x in collection_list if x['name'].casefold() == name.casefold()][0]
	333	except IndexError:
	334	logging.error("Unable to find collection {} for user {}".format(name, user))
	335	return
	336	self.collection_id = collection['id']
	337	self.url = API_COLLECTION_THINGS.format(self.collection_id, api_key)
	338
	339	self.download_dir = os.path.join(directory,
	340	"{}-{}".format(slugify(self.user), slugify(self.name)))
	341
	342
	343	class Designs(Grouping):
	344	""" Holds details of all of a users' designs. """
	345
	346	def __init__(self, user, directory, quick, compress, api_key):
	347	Grouping.__init__(self, quick, compress, api_key)
	348	self.user = user
	349	self.url = API_USER_DESIGNS.format(user, api_key)
	350	self.download_dir = os.path.join(
	351	directory, "{} designs".format(slugify(self.user)))
	352
	353
	354	class Thing:
	355	""" An individual design on thingiverse. """
	356
	357	def __init__(self, thing_link):
	358	self.thing_id = thing_link.thing_id
	359	self.name = thing_link.name
	360	self.last_time = None
	361	self._parsed = False
	362	self._needs_download = True
	363	self.text = None
	364	self.download_dir = None
	365	self.time_stamp = None
	366	self._file_links = FileLinks()
	367	self._image_links = []
	368
	369	@classmethod
	370	def from_thing_id(cls, thing_id):
	371	"""
	372	Factory method that looks up a thing by ID and creates a Thing object for it
	373	:param thing_id: to look up
	374	:return: Thing or None
	375	"""
	376	return Thing(ThingLink(thing_id, "", ""))
	377
	378	def _parse(self, base_dir, api_key):
	379	""" Work out what, if anything needs to be done. """
	380	if self._parsed:
	381	return
	382
	383	# First get the broad details
	384	url = API_THING_DETAILS.format(self.thing_id, api_key)
	385	try:
	386	current_req = SESSION.get(url)
	387	except requests.exceptions.ConnectionError as error:
	388	logging.error("Unable to connect for thing {}: {}".format(
	389	self.thing_id, error))
	390	return
	391	# Check for DMCA
	392	if current_req.status_code == 403:
	393	logging.error("Access to thing {} is forbidden".format(self.thing_id))
	394	return
	395	if current_req.status_code != 200:
	396	logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(url),
	397	current_req.text))
	398	return
	399
	400	thing_json = current_req.json()
	401	try:
	402	self._license = thing_json['license']
	403	except KeyError:
	404	logging.warning("No license found for thing {}?".format(self.thing_id))
	405
	406	details = None
	407	try:
	408	details = thing_json['details']
	409	except KeyError:
	410	logging.warning("No description found for thing {}?".format(self.thing_id))
	411
	412	if details:
	413	try:
	414	self._details = MLStripper.strip_tags(details)
	415	except ValueError as e:
	416	logging.warning("Unable to strip HTML from readme: {}".format(e))
	417	self._details = details
	418
	419	if not self.name:
	420	# Probably generated with factory method.
	421	try:
	422	self.name = thing_json['name']
	423	except KeyError:
	424	logging.warning("No name found for thing {}?".format(self.thing_id))
	425	self.name = self.thing_id
	426
	427	# Now get the file details
	428	file_url = API_THING_FILES.format(self.thing_id, api_key)
	429
	430	try:
	431	current_req = SESSION.get(file_url)
	432	except requests.exceptions.ConnectionError as error:
	433	logging.error("Unable to connect for thing {}: {}".format(
	434	self.thing_id, error))
	435	return
	436
	437	if current_req.status_code != 200:
	438	logging.error("Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(file_url),
	439	current_req.text))
	440	return
	441
	442	link_list = current_req.json()
	443
	444	if not link_list:
	445	logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(
	446	self.thing_id))
	447
	448	for link in link_list:
	449	logging.debug("Parsing link: {}".format(sanitise_url(link['url'])))
	450	try:
	451	datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
	452	self._file_links.append(
	453	FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(api_key)))
	454	except ValueError:
	455	logging.error(link['date'])
	456
	457	# Finally get the image links
	458	image_url = API_THING_IMAGES.format(self.thing_id, api_key)
	459
	460	try:
	461	current_req = SESSION.get(image_url)
	462	except requests.exceptions.ConnectionError as error:
	463	logging.error("Unable to connect for thing {}: {}".format(
	464	self.thing_id, error))
	465	return
	466
	467	if current_req.status_code != 200:
	468	logging.error(
	469	"Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(image_url),
	470	current_req.text))
	471	return
	472
	473	image_list = current_req.json()
	474
	475	if not image_list:
	476	logging.warning(
	477	"No images found for thing {} - probably thingiverse being iffy as this seems unlikely".format(
	478	self.thing_id))
	479
	480	for image in image_list:
	481	logging.debug("parsing image: {}".format(image))
	482	name = None
	483	try:
	484	name = slugify(image['name'])
	485	# TODO: fallback to other types
	486	url = [x for x in image['sizes'] if x['type'] == 'display' and x['size'] == 'large'][0]['url']
	487	except KeyError:
	488	logging.warning("Missing image for {}".format(name))
	489	self._image_links.append(ImageLink(name, url))
	490
	491	self.slug = "{} - {}".format(self.thing_id, slugify(self.name))
	492	self.download_dir = os.path.join(base_dir, self.slug)
	493
	494	self._handle_old_directory(base_dir)
	495
	496	logging.debug("Parsing {} ({})".format(self.thing_id, self.name))
	497	latest, self.last_time = self._find_last_download(base_dir)
	498
	499	if not latest:
	500	# Not yet downloaded
	501	self._parsed = True
	502	return
	503
	504	logging.info("last downloaded version: {}".format(self.last_time))
	505
	506	# OK, so we have a timestamp, lets see if there is anything new to get
	507	# First off, are we comparing an old download that threw away the timestamp?
	508	ignore_time = self.last_time == strip_time(self.last_time)
	509	try:
	510	# TODO: Allow for comparison at the exact time
	511	files_last_update = self._file_links.last_update
	512	if ignore_time:
	513	logging.info("Dropping time from comparison stamp as old-style download dir")
	514	files_last_update = strip_time(files_last_update)
	515
	516	if files_last_update > self.last_time:
	517	logging.info(
	518	"Found new/updated files {}".format(self._file_links.last_update))
	519	self._needs_download = True
	520	self._parsed = True
	521	return
	522	except TypeError:
	523	logging.warning("No files found for {}.".format(self.thing_id))
	524
	525	# Got here, so nope, no new files.
	526	self._needs_download = False
	527	self._parsed = True
	528
	529	def _handle_old_directory(self, base_dir):
	530	""" Deal with any old directories from previous versions of the code.
	531	"""
	532	old_dir = os.path.join(base_dir, slugify(self.name))
	533	if os.path.exists(old_dir):
	534	logging.warning("Found old style download_dir. Moving.")
	535	rename_unique(old_dir, self.download_dir)
	536
	537	def _handle_outdated_directory(self):
	538	""" Move the current download directory sideways if the thing has changed.
	539	"""
	540	if not os.path.exists(self.download_dir):
	541	# No old directory to move.
	542	return None
	543	timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
	544	if not os.path.exists(timestamp_file):
	545	# Old form of download directory
	546	target_dir_name = "{} - old".format(self.download_dir)
	547	else:
	548	target_dir_name = "{} - {}".format(self.download_dir, self.last_time.strftime(SAFE_DATETIME_FORMAT))
	549	return rename_unique(self.download_dir, target_dir_name)
	550
	551	def _find_last_download(self, base_dir):
	552	""" Look for the most recent previous download (if any) of the thing.
	553	"""
	554	logging.info("Looking for old things")
	555
	556	# First the DL directory itself.
	557	timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
	558
	559	latest = None
	560	latest_time = None
	561
	562	try:
	563	logging.debug("Checking for existing download in normal place.")
	564	with open(timestamp_file) as ts_fh:
	565	timestamp_text = ts_fh.read().strip()
	566	latest_time = datetime.datetime.strptime(timestamp_text, DEFAULT_DATETIME_FORMAT)
	567	latest = self.download_dir
	568	except FileNotFoundError:
	569	# No existing download directory. huh.
	570	pass
	571	except TypeError:
	572	logging.warning("Invalid timestamp file found in {}".format(self.download_dir))
	573
	574	# TODO: Maybe look for old download directories.
	575
	576	# Now look for 7z files
	577	candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
	578	# +3 to allow for ' - '
	579	leading_length = len(self.slug) + 3
	580	for path in candidates:
	581	candidate = os.path.basename(path)
	582	try:
	583	logging.debug("Examining '{}' - '{}'".format(candidate, candidate[leading_length:-3]))
	584	candidate_time = datetime.datetime.strptime(candidate[leading_length:-3], SAFE_DATETIME_FORMAT)
	585	except ValueError:
	586	logging.warning("There was an error finding the date in {}. Ignoring.".format(candidate))
	587	continue
	588	try:
	589	if candidate_time > latest_time:
	590	latest_time = candidate_time
	591	latest = candidate
	592	except TypeError:
	593	latest_time = candidate_time
	594	latest = candidate
	595	logging.info("Found last old thing: {} / {}".format(latest, latest_time))
	596	return latest, latest_time
	597
	598	def download(self, base_dir, compress, api_key):
	599	""" Download all files for a given thing.
	600	Returns True iff the thing is now downloaded (not iff it downloads the thing!)
	601	"""
	602	if not self._parsed:
	603	self._parse(base_dir, api_key)
	604
	605	if not self._parsed:
	606	logging.error(
	607	"Unable to parse {} - aborting download".format(self.thing_id))
	608	return State.FAILED
	609
	610	if not self._needs_download:
	611	logging.info("{} - {} already downloaded - skipping.".format(self.thing_id, self.name))
	612	return State.ALREADY_DOWNLOADED
	613
	614	if not self._file_links:
	615	logging.error(
	616	"{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.name))
	617	return State.FAILED
	618
	619	# Have we already downloaded some things?
	620	renamed_dir = self._handle_outdated_directory()
	621
	622	# Get the list of files to download
	623
	624	new_file_links = []
	625	old_file_links = []
	626	self.time_stamp = None
	627
	628	if not self.last_time:
	629	# If we don't have anything to copy from, then it is all new.
	630	logging.debug("No last time, downloading all files")
	631	new_file_links = self._file_links
	632	self.time_stamp = new_file_links[0].last_update
	633
	634	for file_link in new_file_links:
	635	self.time_stamp = max(self.time_stamp, file_link.last_update)
	636	logging.debug("New timestamp will be {}".format(self.time_stamp))
	637	else:
	638	self.time_stamp = self.last_time
	639	for file_link in self._file_links:
	640	if file_link.last_update > self.last_time:
	641	new_file_links.append(file_link)
	642	self.time_stamp = max(self.time_stamp, file_link.last_update)
	643	else:
	644	old_file_links.append(file_link)
	645
	646	logging.debug("new timestamp {}".format(self.time_stamp))
	647
	648	# OK. Time to get to work.
	649	logging.debug("Generating download_dir")
	650	os.mkdir(self.download_dir)
	651	filelist_file = os.path.join(self.download_dir, "filelist.txt")
	652	with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
	653	for fl in self._file_links:
	654	fl_handle.write("{},{},{}\n".format(fl.link, fl.name, fl.last_update))
	655
	656	# First grab the cached files (if any)
	657	logging.info("Copying {} unchanged files.".format(len(old_file_links)))
	658	if renamed_dir:
	659	for file_link in old_file_links:
	660	try:
	661	old_file = os.path.join(renamed_dir, file_link.name)
	662	new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
	663	logging.debug("Copying {} to {}".format(old_file, new_file))
	664	copyfile(old_file, new_file)
	665	except FileNotFoundError:
	666	logging.warning(
	667	"Unable to find {} in old archive, redownloading".format(file_link.name))
	668	new_file_links.append(file_link)
	669	except TypeError:
	670	# Not altogether sure how this could occur, possibly with some combination of the old file types
	671	logging.warning(
	672	"Typeerror looking for {} in {}".format(file_link.name, renamed_dir))
	673	new_file_links.append(file_link)
	674
	675	# Now download the new ones
	676	logging.info("Downloading {} new files of {}".format(
	677	len(new_file_links), len(self._file_links)))
	678	try:
	679	for file_link in new_file_links:
	680	file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
	681	logging.debug("Downloading {} from {} to {}".format(
	682	file_link.name, file_link.link, file_name))
	683	data_req = SESSION.get(file_link.link)
	684	if data_req.status_code != 200:
	685	logging.error("Unexpected status code {} for {}".format(data_req.status_code,
	686	sanitise_url(file_link.link)))
	687	logging.debug("Unexpected status code {} for {}: {}".format(data_req.status_code,
	688	sanitise_url(file_link.link),
	689	data_req.text))
	690	fail_dir(self.download_dir)
	691	return State.FAILED
	692
	693	with open(file_name, 'wb') as handle:
	694	handle.write(data_req.content)
	695	except Exception as exception:
	696	logging.error("Failed to download {} - {}".format(file_link.name, exception))
	697	fail_dir(self.download_dir)
	698	return State.FAILED
	699
	700	# People like images.
	701	image_dir = os.path.join(self.download_dir, 'images')
	702	logging.info("Downloading {} images.".format(len(self._image_links)))
	703	try:
	704	os.mkdir(image_dir)
	705	for imagelink in self._image_links:
	706	filename = os.path.join(image_dir, imagelink.name)
	707	image_req = SESSION.get(imagelink.link)
	708	if image_req.status_code != 200:
	709	logging.error("Unexpected status code {} for {}: {}".format(image_req.status_code,
	710	sanitise_url(imagelink.link),
	711	image_req.text))
	712	fail_dir(self.download_dir)
	713	return State.FAILED
	714	with open(truncate_name(filename), 'wb') as handle:
	715	handle.write(image_req.content)
	716	except Exception as exception:
	717	logging.error("Failed to download {} - {}".format(imagelink.name, exception))
	718	fail_dir(self.download_dir)
	719	return State.FAILED
	720
	721	# Best get some licenses
	722	logging.info("writing license file")
	723	try:
	724	if self._license:
	725	with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w',
	726	encoding="utf-8") as license_handle:
	727	license_handle.write("{}\n".format(self._license))
	728	except IOError as exception:
	729	logging.warning("Failed to write license! {}".format(exception))
	730
	731	logging.info("writing readme")
	732	try:
	733	if self._details:
	734	with open(truncate_name(os.path.join(self.download_dir, 'readme.txt')), 'w',
	735	encoding="utf-8") as readme_handle:
	736	readme_handle.write("{}\n".format(self._details))
	737	except IOError as exception:
	738	logging.warning("Failed to write readme! {}".format(exception))
	739
	740	try:
	741	# Now write the timestamp
	742	with open(os.path.join(self.download_dir, TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
	743	timestamp_handle.write(self.time_stamp.__str__())
	744	except Exception as exception:
	745	logging.error("Failed to write timestamp file - {}".format(exception))
	746	fail_dir(self.download_dir)
	747	return State.FAILED
	748	self._needs_download = False
	749	logging.debug("Download of {} finished".format(self.name))
	750	if not compress:
	751	return State.OK
	752
	753	thing_dir = "{} - {} - {}".format(self.thing_id,
	754	slugify(self.name),
	755	self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
	756	file_name = os.path.join(base_dir,
	757	"{}.7z".format(thing_dir))
	758	logging.debug("Compressing {} to {}".format(
	759	self.name,
	760	file_name))
	761	with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
	762	archive.writeall(self.download_dir, thing_dir)
	763	logging.debug("Compression of {} finished.".format(self.name))
	764	shutil.rmtree(self.download_dir)
	765	logging.debug("Removed temporary download dir of {}.".format(self.name))
	766	return State.OK
	767
	768
	769	def do_batch(batch_file, download_dir, quick, compress, api_key):
	770	""" Read a file in line by line, parsing each as a set of calls to this script."""
	771	with open(batch_file) as handle:
	772	for line in handle:
	773	line = line.strip()
	774	if not line:
	775	# Skip empty lines
	776	continue
	777	logging.info("Handling instruction {}".format(line))
	778	command_arr = line.split()
	779	if command_arr[0] == "thing":
	780	logging.debug(
	781	"Handling batch thing instruction: {}".format(line))
	782	Thing.from_thing_id(command_arr[1]).download(download_dir, compress, api_key)
	783	continue
	784	if command_arr[0] == "collection":
	785	logging.debug(
	786	"Handling batch collection instruction: {}".format(line))
	787	Collection(command_arr[1], command_arr[2],
	788	download_dir, quick, compress, api_key).download()
	789	continue
	790	if command_arr[0] == "user":
	791	logging.debug(
	792	"Handling batch collection instruction: {}".format(line))
	793	Designs(command_arr[1], download_dir, quick, compress, api_key).download()
	794	continue
	795	logging.warning("Unable to parse current instruction. Skipping.")
	796
	797
	798	def main():
	799	""" Entry point for script being run as a command. """
	800	parser = argparse.ArgumentParser()
	801	parser.add_argument("-l", "--log-level", choices=[
	802	'debug', 'info', 'warning'], default='info', help="level of logging desired")
	803	parser.add_argument("-d", "--directory",
	804	help="Target directory to download into")
	805	parser.add_argument("-f", "--log-file",
	806	help="Place to log debug information to")
	807	parser.add_argument("-q", "--quick", action="store_true",
	808	help="Assume date ordering on posts")
	809	parser.add_argument("-c", "--compress", action="store_true",
	810	help="Compress files")
	811	parser.add_argument("-a", "--api-key",
	812	help="API key for thingiverse")
	813
	814	subparsers = parser.add_subparsers(
	815	help="Type of thing to download", dest="subcommand")
	816	collection_parser = subparsers.add_parser(
	817	'collection', help="Download one or more entire collection(s)")
	818	collection_parser.add_argument(
	819	"owner", help="The owner of the collection(s) to get")
	820	collection_parser.add_argument(
	821	"collections", nargs="+", help="Space seperated list of the name(s) of collection to get")
	822	thing_parser = subparsers.add_parser(
	823	'thing', help="Download a single thing.")
	824	thing_parser.add_argument(
	825	"things", nargs="*", help="Space seperated list of thing ID(s) to download")
	826	user_parser = subparsers.add_parser(
	827	"user", help="Download all things by one or more users")
	828	user_parser.add_argument(
	829	"users", nargs="+", help="A space seperated list of the user(s) to get the designs of")
	830	batch_parser = subparsers.add_parser(
	831	"batch", help="Perform multiple actions written in a text file")
	832	batch_parser.add_argument(
	833	"batch_file", help="The name of the file to read.")
	834	subparsers.add_parser("version", help="Show the current version")
	835
	836	args = parser.parse_args()
	837	if not args.subcommand:
	838	parser.print_help()
	839	sys.exit(1)
	840	if not args.directory:
	841	args.directory = os.getcwd()
	842
	843	logger = logging.getLogger()
	844	formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
	845	logger.setLevel(logging.DEBUG)
	846	console_handler = logging.StreamHandler()
	847	console_handler.setLevel(args.log_level.upper())
	848
	849	if args.api_key:
	850	api_key = args.api_key
	851	else:
	852	try:
	853	with open("api.key") as fh:
	854	api_key = fh.read().strip()
	855	except Exception as e:
	856	logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
	857	logging.error("Exception: {}".format(e))
	858	return
	859
	860	logger.addHandler(console_handler)
	861	if args.log_file:
	862	file_handler = logging.FileHandler(args.log_file)
	863	file_handler.setLevel(logging.DEBUG)
	864	file_handler.setFormatter(formatter)
	865	logger.addHandler(file_handler)
	866
	867	# Start downloader
	868	thing_queue = multiprocessing.JoinableQueue()
	869	logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
	870	downloaders = [Downloader(thing_queue, args.directory, args.compress, api_key) for _ in range(DOWNLOADER_COUNT)]
	871	for downloader in downloaders:
	872	downloader.start()
	873
	874	if args.subcommand.startswith("collection"):
	875	for collection in args.collections:
	876	Collection(args.owner, collection, args.directory, args.quick, args.compress, api_key).download()
	877	if args.subcommand == "thing":
	878	for thing in args.things:
	879	thing_queue.put(thing)
	880	if args.subcommand == "user":
	881	for user in args.users:
	882	Designs(user, args.directory, args.quick, args.compress, api_key).download()
	883	if args.subcommand == "version":
	884	print("thingy_grabber.py version {}".format(VERSION))
	885	if args.subcommand == "batch":
	886	do_batch(args.batch_file, args.directory, args.quick, args.compress, api_key)
	887
	888	# Stop the downloader processes
	889	for _ in downloaders:
	890	thing_queue.put(None)
	891
	892
	893	if __name__ == "__main__":
	894	multiprocessing.freeze_support()
	895	main()