Only output file download error text when logging is turned up.
[clinton/thingy_grabber.git] / thingy_grabber.py
index a53b164..c4fb963 100755 (executable)
@@ -18,6 +18,8 @@ from dataclasses import dataclass
 import py7zr
 import glob
 import shutil
+from io import StringIO
+from html.parser import HTMLParser
 
 SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]
 
@@ -29,7 +31,7 @@ SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'
 API_BASE = "https://api.thingiverse.com"
 ACCESS_QP = "access_token={}"
 PAGE_QP = "page={}"
-API_USER_DESIGNS = API_BASE + "/users/{}/things/"
+API_USER_DESIGNS = API_BASE + "/users/{}/things/?" + ACCESS_QP
 API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP
 
 # Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS
@@ -41,20 +43,45 @@ API_THING_FILES = API_BASE + "/things/{}/files/?" + ACCESS_QP
 API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP
 API_THING_DOWNLOAD = "/download/?" + ACCESS_QP
 
-API_KEY = None
-
 DOWNLOADER_COUNT = 1
 RETRY_COUNT = 3
 
 MAX_PATH_LENGTH = 250
 
-VERSION = "0.10.2"
+VERSION = "0.10.5"
 
 TIMESTAMP_FILE = "timestamp.txt"
 
 SESSION = requests.Session()
 
 
+class MLStripper(HTMLParser):
+    """ Turns HTML markup into plain text
+    """
+
+    def error(self, message):
+        raise ValueError(message)
+
+    def __init__(self):
+        super().__init__()
+        self.reset()
+        self.strict = False
+        self.convert_charrefs = True
+        self.text = StringIO()
+
+    def handle_data(self, d):
+        self.text.write(d)
+
+    def get_data(self):
+        return self.text.getvalue()
+
+    @staticmethod
+    def strip_tags(html):
+        s = MLStripper()
+        s.feed(html)
+        return s.get_data()
+
+
 @dataclass
 class ThingLink:
     thing_id: str
@@ -145,7 +172,6 @@ def truncate_name(file_name):
     path = os.path.abspath(file_name)
     if len(path) <= MAX_PATH_LENGTH:
         return path
-    to_cut = len(path) - (MAX_PATH_LENGTH + 3)
     base, extension = os.path.splitext(path)
     inc = 0
     new_path = "{}_{}{}".format(base, inc, extension)
@@ -164,7 +190,7 @@ def slugify(value):
     value = unicodedata.normalize('NFKC', value).lower().strip()
     value = re.sub(r'[\\/<>:?*|"]', '', value)
     value = re.sub(r'\.*$', '', value)
-    return value
+    return value.strip()
 
 
 class Downloader(multiprocessing.Process):
@@ -172,24 +198,33 @@ class Downloader(multiprocessing.Process):
     Class to handle downloading the things we have found to get.
     """
 
-    def __init__(self, thing_queue, download_directory, compress):
+    def __init__(self, thing_queue, download_directory, compress, api_key):
         multiprocessing.Process.__init__(self)
         # TODO: add parameters
         self.thing_queue = thing_queue
         self.download_directory = download_directory
         self.compress = compress
+        self.api_key = api_key
 
     def run(self):
         """ actual download loop.
         """
         while True:
-            thing_id = self.thing_queue.get
+            thing_id = self.thing_queue.get()
             if thing_id is None:
                 logging.info("Shutting download queue")
                 self.thing_queue.task_done()
                 break
-            logging.info("Handling id {}".format(thing_id))
-            Thing(thing_id).download(self.download_directory, self.compress)
+            thing = None
+            if isinstance(thing_id, str):
+                thing = Thing.from_thing_id(thing_id)
+            if isinstance(thing_id, ThingLink):
+                thing = Thing(thing_id)
+            if not thing:
+                logging.error("Don't know how to handle thing_id {}".format(thing_id))
+            else:
+                logging.info("Handling id {}".format(thing_id))
+                thing.download(self.download_directory, self.compress, self.api_key)
             self.thing_queue.task_done()
         return
 
@@ -200,7 +235,7 @@ class Grouping:
         - use Collection or Designs instead.
     """
 
-    def __init__(self, quick, compress):
+    def __init__(self, quick, compress, api_key):
         self.things = []
         self.total = 0
         self.req_id = None
@@ -209,6 +244,7 @@ class Grouping:
         # Should we stop downloading when we hit a known datestamp?
         self.quick = quick
         self.compress = compress
+        self.api_key = api_key
         # These should be set by child classes.
         self.url = None
         self.download_dir = None
@@ -227,33 +263,34 @@ class Grouping:
 
         # Get the internal details of the grouping.
         logging.debug("Querying {}".format(sanitise_url(self.url)))
-        page = 0
 
-        # self.url should already have been formatted as we don't need pagination
-        logging.info("requesting:{}".format(sanitise_url(self.url)))
-        current_req = SESSION.get(self.url)
-        if current_req.status_code != 200:
-            logging.error(
-                "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url),
-                                                                current_req.text))
-        else:
-            current_json = current_req.json()
-            for thing in current_json:
-                logging.info(thing)
-                self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
+        # follow next links until all items are found
+        current_url = self.url
+        while current_url is not None:
+            logging.info("requesting:{}".format(sanitise_url(current_url)))
+            current_req = SESSION.get(current_url)
+            current_url = current_req.links.get('next', {}).get('url')
+            if current_req.status_code != 200:
+                logging.error(
+                    "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(current_req.url),
+                                                                    current_req.text))
+            else:
+                current_json = current_req.json()
+                for thing in current_json:
+                    logging.debug(thing)
+                    self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
         logging.info("Found {} things.".format(len(self.things)))
         return self.things
 
     def download(self):
         """ Downloads all the files in a collection """
         if not self.things:
-            self.get()
+            self.get()
 
         if not self.download_dir:
             raise ValueError(
                 "No download_dir set - invalidly initialised object?")
 
-        base_dir = os.getcwd()
         try:
             os.mkdir(self.download_dir)
         except FileExistsError:
@@ -262,8 +299,8 @@ class Grouping:
         logging.info("Downloading {} thing(s).".format(self.total))
         for idx, thing in enumerate(self.things):
             logging.info("Downloading thing {} - {}".format(idx, thing))
-            RC = Thing(thing).download(self.download_dir, self.compress)
-            if self.quick and RC == State.ALREADY_DOWNLOADED:
+            return_code = Thing(thing).download(self.download_dir, self.compress, self.api_key)
+            if self.quick and return_code == State.ALREADY_DOWNLOADED:
                 logging.info("Caught up, stopping.")
                 return
 
@@ -271,13 +308,13 @@ class Grouping:
 class Collection(Grouping):
     """ Holds details of a collection. """
 
-    def __init__(self, user, name, directory, quick, compress):
-        Grouping.__init__(self, quick, compress)
+    def __init__(self, user, name, directory, quick, compress, api_key):
+        Grouping.__init__(self, quick, compress, api_key)
         self.user = user
         self.name = name
         self.paginated = False
         # need to figure out the the ID for the collection
-        collection_url = API_USER_COLLECTIONS.format(user, API_KEY)
+        collection_url = API_USER_COLLECTIONS.format(user, api_key)
         try:
             current_req = SESSION.get(collection_url)
         except requests.exceptions.ConnectionError as error:
@@ -297,7 +334,7 @@ class Collection(Grouping):
             logging.error("Unable to find collection {} for user {}".format(name, user))
             return
         self.collection_id = collection['id']
-        self.url = API_COLLECTION_THINGS.format(self.collection_id, API_KEY)
+        self.url = API_COLLECTION_THINGS.format(self.collection_id, api_key)
 
         self.download_dir = os.path.join(directory,
                                          "{}-{}".format(slugify(self.user), slugify(self.name)))
@@ -306,11 +343,10 @@ class Collection(Grouping):
 class Designs(Grouping):
     """ Holds details of all of a users' designs. """
 
-    def __init__(self, user, directory, quick, compress):
-        Grouping.__init__(self, quick, compress)
+    def __init__(self, user, directory, quick, compress, api_key):
+        Grouping.__init__(self, quick, compress, api_key)
         self.user = user
-        self.url = API_USER_DESIGNS.format(user)
-        self.paginated = True
+        self.url = API_USER_DESIGNS.format(user, api_key)
         self.download_dir = os.path.join(
             directory, "{} designs".format(slugify(self.user)))
 
@@ -321,7 +357,6 @@ class Thing:
     def __init__(self, thing_link):
         self.thing_id = thing_link.thing_id
         self.name = thing_link.name
-        self.api_link = thing_link.api_link
         self.last_time = None
         self._parsed = False
         self._needs_download = True
@@ -331,13 +366,22 @@ class Thing:
         self._file_links = FileLinks()
         self._image_links = []
 
-    def _parse(self, base_dir):
+    @classmethod
+    def from_thing_id(cls, thing_id):
+        """
+        Factory method that looks up a thing by ID and creates a Thing object for it
+        :param thing_id: to look up
+        :return: Thing or None
+        """
+        return cls(ThingLink(thing_id, "", ""))
+
+    def _parse(self, base_dir, api_key):
         """ Work out what, if anything needs to be done. """
         if self._parsed:
             return
 
         # First get the broad details
-        url = API_THING_DETAILS.format(self.thing_id, API_KEY)
+        url = API_THING_DETAILS.format(self.thing_id, api_key)
         try:
             current_req = SESSION.get(url)
         except requests.exceptions.ConnectionError as error:
@@ -359,14 +403,29 @@ class Thing:
         except KeyError:
             logging.warning("No license found for thing {}?".format(self.thing_id))
 
-        # TODO: Get non-html version of this?
+        details = None
         try:
-            self._details = thing_json['details']
+            details = thing_json['details']
         except KeyError:
             logging.warning("No description found for thing {}?".format(self.thing_id))
 
+        if details:
+            try:
+                self._details = MLStripper.strip_tags(details)
+            except ValueError as e:
+                logging.warning("Unable to strip HTML from readme: {}".format(e))
+                self._details = details
+
+        if not self.name:
+            # Probably generated with factory method.
+            try:
+                self.name = thing_json['name']
+            except KeyError:
+                logging.warning("No name found for thing {}?".format(self.thing_id))
+                self.name = self.thing_id
+
         # Now get the file details
-        file_url = API_THING_FILES.format(self.thing_id, API_KEY)
+        file_url = API_THING_FILES.format(self.thing_id, api_key)
 
         try:
             current_req = SESSION.get(file_url)
@@ -391,12 +450,12 @@ class Thing:
             try:
                 datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
                 self._file_links.append(
-                    FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(API_KEY)))
+                    FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(api_key)))
             except ValueError:
                 logging.error(link['date'])
 
         # Finally get the image links
-        image_url = API_THING_IMAGES.format(self.thing_id, API_KEY)
+        image_url = API_THING_IMAGES.format(self.thing_id, api_key)
 
         try:
             current_req = SESSION.get(image_url)
@@ -534,14 +593,14 @@ class Thing:
                 latest_time = candidate_time
                 latest = candidate
         logging.info("Found last old thing: {} / {}".format(latest, latest_time))
-        return (latest, latest_time)
+        return latest, latest_time
 
-    def download(self, base_dir, compress):
+    def download(self, base_dir, compress, api_key):
         """ Download all files for a given thing. 
             Returns True iff the thing is now downloaded (not iff it downloads the thing!)
         """
         if not self._parsed:
-            self._parse(base_dir)
+            self._parse(base_dir, api_key)
 
         if not self._parsed:
             logging.error(
@@ -623,7 +682,9 @@ class Thing:
                     file_link.name, file_link.link, file_name))
                 data_req = SESSION.get(file_link.link)
                 if data_req.status_code != 200:
-                    logging.error("Unexpected status code {} for {}: {}".format(data_req.status_code,
+                    logging.error("Unexpected status code {} for {}".format(data_req.status_code,
+                                                                                sanitise_url(file_link.link)))
+                    logging.debug("Unexpected status code {} for {}: {}".format(data_req.status_code,
                                                                                 sanitise_url(file_link.link),
                                                                                 data_req.text))
                     fail_dir(self.download_dir)
@@ -705,7 +766,7 @@ class Thing:
         return State.OK
 
 
-def do_batch(batch_file, download_dir, quick, compress):
+def do_batch(batch_file, download_dir, quick, compress, api_key):
     """ Read a file in line by line, parsing each as a set of calls to this script."""
     with open(batch_file) as handle:
         for line in handle:
@@ -718,18 +779,18 @@ def do_batch(batch_file, download_dir, quick, compress):
             if command_arr[0] == "thing":
                 logging.debug(
                     "Handling batch thing instruction: {}".format(line))
-                Thing(command_arr[1]).download(download_dir, compress)
+                Thing.from_thing_id(command_arr[1]).download(download_dir, compress, api_key)
                 continue
             if command_arr[0] == "collection":
                 logging.debug(
                     "Handling batch collection instruction: {}".format(line))
                 Collection(command_arr[1], command_arr[2],
-                           download_dir, quick, compress).download()
+                           download_dir, quick, compress, api_key).download()
                 continue
             if command_arr[0] == "user":
                 logging.debug(
                     "Handling batch collection instruction: {}".format(line))
-                Designs(command_arr[1], download_dir, quick, compress).download()
+                Designs(command_arr[1], download_dir, quick, compress, api_key).download()
                 continue
             logging.warning("Unable to parse current instruction. Skipping.")
 
@@ -785,13 +846,12 @@ def main():
     console_handler = logging.StreamHandler()
     console_handler.setLevel(args.log_level.upper())
 
-    global API_KEY
     if args.api_key:
-        API_KEY = args.api_key
+        api_key = args.api_key
     else:
         try:
             with open("api.key") as fh:
-                API_KEY = fh.read().strip()
+                api_key = fh.read().strip()
         except Exception as e:
             logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
             logging.error("Exception: {}".format(e))
@@ -807,23 +867,23 @@ def main():
     # Start downloader
     thing_queue = multiprocessing.JoinableQueue()
     logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
-    downloaders = [Downloader(thing_queue, args.directory, args.compress) for _ in range(DOWNLOADER_COUNT)]
+    downloaders = [Downloader(thing_queue, args.directory, args.compress, api_key) for _ in range(DOWNLOADER_COUNT)]
     for downloader in downloaders:
         downloader.start()
 
     if args.subcommand.startswith("collection"):
         for collection in args.collections:
-            Collection(args.owner, collection, args.directory, args.quick, args.compress).download()
+            Collection(args.owner, collection, args.directory, args.quick, args.compress, api_key).download()
     if args.subcommand == "thing":
         for thing in args.things:
             thing_queue.put(thing)
     if args.subcommand == "user":
         for user in args.users:
-            Designs(user, args.directory, args.quick, args.compress).download()
+            Designs(user, args.directory, args.quick, args.compress, api_key).download()
     if args.subcommand == "version":
         print("thingy_grabber.py version {}".format(VERSION))
     if args.subcommand == "batch":
-        do_batch(args.batch_file, args.directory, args.quick, args.compress)
+        do_batch(args.batch_file, args.directory, args.quick, args.compress, api_key)
 
     # Stop the downloader processes
     for _ in downloaders: