import py7zr
import glob
import shutil
+from io import StringIO
+from html.parser import HTMLParser
SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]
API_BASE = "https://api.thingiverse.com"
ACCESS_QP = "access_token={}"
PAGE_QP = "page={}"
-API_USER_DESIGNS = API_BASE + "/users/{}/things/"
+API_USER_DESIGNS = API_BASE + "/users/{}/things/?" + ACCESS_QP
API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP
# Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS
API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP
API_THING_DOWNLOAD = "/download/?" + ACCESS_QP
-API_KEY = None
-
DOWNLOADER_COUNT = 1
RETRY_COUNT = 3
MAX_PATH_LENGTH = 250
-VERSION = "0.10.2"
+VERSION = "0.10.5"
TIMESTAMP_FILE = "timestamp.txt"
SESSION = requests.Session()
class MLStripper(HTMLParser):
    """Feed HTML markup in, get the plain text back out.

    Every text node is accumulated into a StringIO buffer; tags are
    dropped and character references are converted to their characters
    (``convert_charrefs=True``).
    """

    def __init__(self):
        super().__init__()
        # reset() is also invoked by HTMLParser.__init__; kept so a
        # reused instance always starts from a clean parser state.
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = StringIO()

    def error(self, message):
        # Surface parser errors as ValueError so callers can catch them.
        raise ValueError(message)

    def handle_data(self, d):
        # Called for each run of text between tags; accumulate it.
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return self.text.getvalue()

    @staticmethod
    def strip_tags(html):
        """Return *html* with all markup removed.

        :param html: markup string to strip.
        :return: plain-text content of *html*.
        :raises ValueError: if the parser reports an error.
        """
        s = MLStripper()
        s.feed(html)
        return s.get_data()
@dataclass
class ThingLink:
thing_id: str
path = os.path.abspath(file_name)
if len(path) <= MAX_PATH_LENGTH:
return path
- to_cut = len(path) - (MAX_PATH_LENGTH + 3)
base, extension = os.path.splitext(path)
inc = 0
new_path = "{}_{}{}".format(base, inc, extension)
value = unicodedata.normalize('NFKC', value).lower().strip()
value = re.sub(r'[\\/<>:?*|"]', '', value)
value = re.sub(r'\.*$', '', value)
- return value
+ return value.strip()
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """

    def __init__(self, thing_queue, download_directory, compress, api_key):
        """Store the work queue and download settings for this worker.

        :param thing_queue: JoinableQueue of thing ids / ThingLinks to fetch.
        :param download_directory: base directory for downloaded things.
        :param compress: whether to 7z-compress completed downloads.
        :param api_key: thingiverse API access token.
        """
        multiprocessing.Process.__init__(self)
        self.thing_queue = thing_queue
        self.download_directory = download_directory
        self.compress = compress
        self.api_key = api_key

    def run(self):
        """ actual download loop.
        """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                # Sentinel value: no more work, shut this worker down.
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                break
            # The queue may carry either a bare id string or a full ThingLink.
            thing = None
            if isinstance(thing_id, str):
                thing = Thing.from_thing_id(thing_id)
            if isinstance(thing_id, ThingLink):
                thing = Thing(thing_id)
            if not thing:
                logging.error("Don't know how to handle thing_id {}".format(thing_id))
            else:
                logging.info("Handling id {}".format(thing_id))
                thing.download(self.download_directory, self.compress, self.api_key)
            self.thing_queue.task_done()
        return
- use Collection or Designs instead.
"""
- def __init__(self, quick, compress):
+ def __init__(self, quick, compress, api_key):
self.things = []
self.total = 0
self.req_id = None
# Should we stop downloading when we hit a known datestamp?
self.quick = quick
self.compress = compress
+ self.api_key = api_key
# These should be set by child classes.
self.url = None
self.download_dir = None
# Get the internal details of the grouping.
logging.debug("Querying {}".format(sanitise_url(self.url)))
- page = 0
- # self.url should already have been formatted as we don't need pagination
- logging.info("requesting:{}".format(sanitise_url(self.url)))
- current_req = SESSION.get(self.url)
- if current_req.status_code != 200:
- logging.error(
- "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url),
- current_req.text))
- else:
- current_json = current_req.json()
- for thing in current_json:
- logging.info(thing)
- self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
+ # follow next links until all items are found
+ current_url = self.url
+ while current_url != None:
+ logging.info("requesting:{}".format(sanitise_url(current_url)))
+ current_req = SESSION.get(current_url)
+ current_url = current_req.links.get('next', {}).get('url')
+ if current_req.status_code != 200:
+ logging.error(
+ "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url),
+ current_req.text))
+ else:
+ current_json = current_req.json()
+ for thing in current_json:
+ logging.debug(thing)
+ self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
logging.info("Found {} things.".format(len(self.things)))
return self.things
def download(self):
""" Downloads all the files in a collection """
if not self.things:
- self.get()
+ self.get
if not self.download_dir:
raise ValueError(
"No download_dir set - invalidly initialised object?")
- base_dir = os.getcwd()
try:
os.mkdir(self.download_dir)
except FileExistsError:
logging.info("Downloading {} thing(s).".format(self.total))
for idx, thing in enumerate(self.things):
logging.info("Downloading thing {} - {}".format(idx, thing))
- RC = Thing(thing).download(self.download_dir, self.compress)
- if self.quick and RC == State.ALREADY_DOWNLOADED:
+ return_code = Thing(thing).download(self.download_dir, self.compress, self.api_key)
+ if self.quick and return_code == State.ALREADY_DOWNLOADED:
logging.info("Caught up, stopping.")
return
class Collection(Grouping):
""" Holds details of a collection. """
- def __init__(self, user, name, directory, quick, compress):
- Grouping.__init__(self, quick, compress)
+ def __init__(self, user, name, directory, quick, compress, api_key):
+ Grouping.__init__(self, quick, compress, api_key)
self.user = user
self.name = name
self.paginated = False
# need to figure out the the ID for the collection
- collection_url = API_USER_COLLECTIONS.format(user, API_KEY)
+ collection_url = API_USER_COLLECTIONS.format(user, api_key)
try:
current_req = SESSION.get(collection_url)
except requests.exceptions.ConnectionError as error:
logging.error("Unable to find collection {} for user {}".format(name, user))
return
self.collection_id = collection['id']
- self.url = API_COLLECTION_THINGS.format(self.collection_id, API_KEY)
+ self.url = API_COLLECTION_THINGS.format(self.collection_id, api_key)
self.download_dir = os.path.join(directory,
"{}-{}".format(slugify(self.user), slugify(self.name)))
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory, quick, compress, api_key):
        """Build the design-listing URL and output directory for *user*.

        :param user: thingiverse username whose designs to fetch.
        :param directory: base output directory.
        :param quick: stop at the first already-downloaded thing.
        :param compress: whether to 7z-compress completed downloads.
        :param api_key: thingiverse API access token.
        """
        # Idiomatic cooperative initialisation instead of the explicit
        # Grouping.__init__(self, ...) call - same behaviour.
        super().__init__(quick, compress, api_key)
        self.user = user
        self.url = API_USER_DESIGNS.format(user, api_key)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
def __init__(self, thing_link):
self.thing_id = thing_link.thing_id
self.name = thing_link.name
- self.api_link = thing_link.api_link
self.last_time = None
self._parsed = False
self._needs_download = True
self._file_links = FileLinks()
self._image_links = []
- def _parse(self, base_dir):
+ @classmethod
+ def from_thing_id(cls, thing_id):
+ """
+ Factory method that looks up a thing by ID and creates a Thing object for it
+ :param thing_id: to look up
+ :return: Thing or None
+ """
+ return Thing(ThingLink(thing_id, "", ""))
+
+ def _parse(self, base_dir, api_key):
""" Work out what, if anything needs to be done. """
if self._parsed:
return
# First get the broad details
- url = API_THING_DETAILS.format(self.thing_id, API_KEY)
+ url = API_THING_DETAILS.format(self.thing_id, api_key)
try:
current_req = SESSION.get(url)
except requests.exceptions.ConnectionError as error:
except KeyError:
logging.warning("No license found for thing {}?".format(self.thing_id))
- # TODO: Get non-html version of this?
+ details = None
try:
- self._details = thing_json['details']
+ details = thing_json['details']
except KeyError:
logging.warning("No description found for thing {}?".format(self.thing_id))
+ if details:
+ try:
+ self._details = MLStripper.strip_tags(details)
+ except ValueError as e:
+ logging.warning("Unable to strip HTML from readme: {}".format(e))
+ self._details = details
+
+ if not self.name:
+ # Probably generated with factory method.
+ try:
+ self.name = thing_json['name']
+ except KeyError:
+ logging.warning("No name found for thing {}?".format(self.thing_id))
+ self.name = self.thing_id
+
# Now get the file details
- file_url = API_THING_FILES.format(self.thing_id, API_KEY)
+ file_url = API_THING_FILES.format(self.thing_id, api_key)
try:
current_req = SESSION.get(file_url)
try:
datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
self._file_links.append(
- FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(API_KEY)))
+ FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(api_key)))
except ValueError:
logging.error(link['date'])
# Finally get the image links
- image_url = API_THING_IMAGES.format(self.thing_id, API_KEY)
+ image_url = API_THING_IMAGES.format(self.thing_id, api_key)
try:
current_req = SESSION.get(image_url)
latest_time = candidate_time
latest = candidate
logging.info("Found last old thing: {} / {}".format(latest, latest_time))
- return (latest, latest_time)
+ return latest, latest_time
- def download(self, base_dir, compress):
+ def download(self, base_dir, compress, api_key):
""" Download all files for a given thing.
Returns True iff the thing is now downloaded (not iff it downloads the thing!)
"""
if not self._parsed:
- self._parse(base_dir)
+ self._parse(base_dir, api_key)
if not self._parsed:
logging.error(
file_link.name, file_link.link, file_name))
data_req = SESSION.get(file_link.link)
if data_req.status_code != 200:
- logging.error("Unexpected status code {} for {}: {}".format(data_req.status_code,
+ logging.error("Unexpected status code {} for {}".format(data_req.status_code,
+ sanitise_url(file_link.link)))
+ logging.debug("Unexpected status code {} for {}: {}".format(data_req.status_code,
sanitise_url(file_link.link),
data_req.text))
fail_dir(self.download_dir)
return State.OK
-def do_batch(batch_file, download_dir, quick, compress):
+def do_batch(batch_file, download_dir, quick, compress, api_key):
""" Read a file in line by line, parsing each as a set of calls to this script."""
with open(batch_file) as handle:
for line in handle:
if command_arr[0] == "thing":
logging.debug(
"Handling batch thing instruction: {}".format(line))
- Thing(command_arr[1]).download(download_dir, compress)
+ Thing.from_thing_id(command_arr[1]).download(download_dir, compress, api_key)
continue
if command_arr[0] == "collection":
logging.debug(
"Handling batch collection instruction: {}".format(line))
Collection(command_arr[1], command_arr[2],
- download_dir, quick, compress).download()
+ download_dir, quick, compress, api_key).download()
continue
if command_arr[0] == "user":
logging.debug(
"Handling batch collection instruction: {}".format(line))
- Designs(command_arr[1], download_dir, quick, compress).download()
+ Designs(command_arr[1], download_dir, quick, compress, api_key).download()
continue
logging.warning("Unable to parse current instruction. Skipping.")
console_handler = logging.StreamHandler()
console_handler.setLevel(args.log_level.upper())
- global API_KEY
if args.api_key:
- API_KEY = args.api_key
+ api_key = args.api_key
else:
try:
with open("api.key") as fh:
- API_KEY = fh.read().strip()
+ api_key = fh.read().strip()
except Exception as e:
logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
logging.error("Exception: {}".format(e))
# Start downloader
thing_queue = multiprocessing.JoinableQueue()
logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
- downloaders = [Downloader(thing_queue, args.directory, args.compress) for _ in range(DOWNLOADER_COUNT)]
+ downloaders = [Downloader(thing_queue, args.directory, args.compress, api_key) for _ in range(DOWNLOADER_COUNT)]
for downloader in downloaders:
downloader.start()
if args.subcommand.startswith("collection"):
for collection in args.collections:
- Collection(args.owner, collection, args.directory, args.quick, args.compress).download()
+ Collection(args.owner, collection, args.directory, args.quick, args.compress, api_key).download()
if args.subcommand == "thing":
for thing in args.things:
thing_queue.put(thing)
if args.subcommand == "user":
for user in args.users:
- Designs(user, args.directory, args.quick, args.compress).download()
+ Designs(user, args.directory, args.quick, args.compress, api_key).download()
if args.subcommand == "version":
print("thingy_grabber.py version {}".format(VERSION))
if args.subcommand == "batch":
- do_batch(args.batch_file, args.directory, args.quick, args.compress)
+ do_batch(args.batch_file, args.directory, args.quick, args.compress, api_key)
# Stop the downloader processes
for _ in downloaders: