Only output file download error text when logging is turned up.
[clinton/thingy_grabber.git] / thingy_grabber.py
index a53b164..c4fb963 100755 (executable)
@@ -18,6 +18,8 @@ from dataclasses import dataclass
 import py7zr
 import glob
 import shutil
+from io import StringIO
+from html.parser import HTMLParser
 
 SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]
 
@@ -29,7 +31,7 @@ SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'
 API_BASE = "https://api.thingiverse.com"
 ACCESS_QP = "access_token={}"
 PAGE_QP = "page={}"
-API_USER_DESIGNS = API_BASE + "/users/{}/things/"
+API_USER_DESIGNS = API_BASE + "/users/{}/things/?" + ACCESS_QP
 API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP
 
 # Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS
@@ -41,20 +43,45 @@ API_THING_FILES = API_BASE + "/things/{}/files/?" + ACCESS_QP
 API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP
 API_THING_DOWNLOAD = "/download/?" + ACCESS_QP
 
-API_KEY = None
-
 DOWNLOADER_COUNT = 1
 RETRY_COUNT = 3
 
 MAX_PATH_LENGTH = 250
 
-VERSION = "0.10.2"
+VERSION = "0.10.5"
 
 TIMESTAMP_FILE = "timestamp.txt"
 
 SESSION = requests.Session()
 
 
+class MLStripper(HTMLParser):
+    """ Turns HTML markup into plain text
+    """
+
+    def error(self, message):
+        raise ValueError(message)
+
+    def __init__(self):
+        super().__init__()
+        self.reset()
+        self.strict = False
+        self.convert_charrefs = True
+        self.text = StringIO()
+
+    def handle_data(self, d):
+        self.text.write(d)
+
+    def get_data(self):
+        return self.text.getvalue()
+
+    @staticmethod
+    def strip_tags(html):
+        s = MLStripper()
+        s.feed(html)
+        return s.get_data()
+
+
 @dataclass
 class ThingLink:
     thing_id: str
@@ -145,7 +172,6 @@ def truncate_name(file_name):
     path = os.path.abspath(file_name)
     if len(path) <= MAX_PATH_LENGTH:
         return path
-    to_cut = len(path) - (MAX_PATH_LENGTH + 3)
     base, extension = os.path.splitext(path)
     inc = 0
     new_path = "{}_{}{}".format(base, inc, extension)
@@ -164,7 +190,7 @@ def slugify(value):
     value = unicodedata.normalize('NFKC', value).lower().strip()
     value = re.sub(r'[\\/<>:?*|"]', '', value)
     value = re.sub(r'\.*$', '', value)
-    return value
+    return value.strip()
 
 
 class Downloader(multiprocessing.Process):
@@ -172,24 +198,33 @@ class Downloader(multiprocessing.Process):
     Class to handle downloading the things we have found to get.
     """
 
-    def __init__(self, thing_queue, download_directory, compress):
+    def __init__(self, thing_queue, download_directory, compress, api_key):
         multiprocessing.Process.__init__(self)
         # TODO: add parameters
         self.thing_queue = thing_queue
         self.download_directory = download_directory
         self.compress = compress
+        self.api_key = api_key
 
     def run(self):
         """ actual download loop.
         """
         while True:
-            thing_id = self.thing_queue.get
+            thing_id = self.thing_queue.get()
             if thing_id is None:
                 logging.info("Shutting download queue")
                 self.thing_queue.task_done()
                 break
-            logging.info("Handling id {}".format(thing_id))
-            Thing(thing_id).download(self.download_directory, self.compress)
+            thing = None
+            if isinstance(thing_id, str):
+                thing = Thing.from_thing_id(thing_id)
+            if isinstance(thing_id, ThingLink):
+                thing = Thing(thing_id)
+            if not thing:
+                logging.error("Don't know how to handle thing_id {}".format(thing_id))
+            else:
+                logging.info("Handling id {}".format(thing_id))
+                thing.download(self.download_directory, self.compress, self.api_key)
             self.thing_queue.task_done()
         return
 
@@ -200,7 +235,7 @@ class Grouping:
         - use Collection or Designs instead.
     """
 
-    def __init__(self, quick, compress):
+    def __init__(self, quick, compress, api_key):
         self.things = []
         self.total = 0
         self.req_id = None
@@ -209,6 +244,7 @@ class Grouping:
         # Should we stop downloading when we hit a known datestamp?
         self.quick = quick
         self.compress = compress
+        self.api_key = api_key
         # These should be set by child classes.
         self.url = None
         self.download_dir = None
@@ -227,33 +263,34 @@ class Grouping:
 
         # Get the internal details of the grouping.
         logging.debug("Querying {}".format(sanitise_url(self.url)))
-        page = 0
 
-        # self.url should already have been formatted as we don't need pagination
-        logging.info("requesting:{}".format(sanitise_url(self.url)))
-        current_req = SESSION.get(self.url)
-        if current_req.status_code != 200:
-            logging.error(
-                "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url),
-                                                                current_req.text))
-        else:
-            current_json = current_req.json()
-            for thing in current_json:
-                logging.info(thing)
-                self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
+        # follow next links until all items are found
+        current_url = self.url
+        while current_url is not None:
+            logging.info("requesting:{}".format(sanitise_url(current_url)))
+            current_req = SESSION.get(current_url)
+            current_url = current_req.links.get('next', {}).get('url')
+            if current_req.status_code != 200:
+                logging.error(
+                    "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(current_req.url),
+                                                                    current_req.text))
+            else:
+                current_json = current_req.json()
+                for thing in current_json:
+                    logging.debug(thing)
+                    self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
         logging.info("Found {} things.".format(len(self.things)))
         return self.things
 
     def download(self):
         """ Downloads all the files in a collection """
         if not self.things:
-            self.get()
+            self.get()
 
         if not self.download_dir:
             raise ValueError(
                 "No download_dir set - invalidly initialised object?")
 
-        base_dir = os.getcwd()
         try:
             os.mkdir(self.download_dir)
         except FileExistsError:
@@ -262,8 +299,8 @@ class Grouping:
         logging.info("Downloading {} thing(s).".format(self.total))
         for idx, thing in enumerate(self.things):
             logging.info("Downloading thing {} - {}".format(idx, thing))
-            RC = Thing(thing).download(self.download_dir, self.compress)
-            if self.quick and RC == State.ALREADY_DOWNLOADED:
+            return_code = Thing(thing).download(self.download_dir, self.compress, self.api_key)
+            if self.quick and return_code == State.ALREADY_DOWNLOADED:
                 logging.info("Caught up, stopping.")
                 return
 
@@ -271,13 +308,13 @@ class Grouping:
 class Collection(Grouping):
     """ Holds details of a collection. """
 
-    def __init__(self, user, name, directory, quick, compress):
-        Grouping.__init__(self, quick, compress)
+    def __init__(self, user, name, directory, quick, compress, api_key):
+        Grouping.__init__(self, quick, compress, api_key)
         self.user = user
         self.name = name
         self.paginated = False
         # need to figure out the the ID for the collection
-        collection_url = API_USER_COLLECTIONS.format(user, API_KEY)
+        collection_url = API_USER_COLLECTIONS.format(user, api_key)
         try:
             current_req = SESSION.get(collection_url)
         except requests.exceptions.ConnectionError as error:
@@ -297,7 +334,7 @@ class Collection(Grouping):
             logging.error("Unable to find collection {} for user {}".format(name, user))
             return
         self.collection_id = collection['id']
-        self.url = API_COLLECTION_THINGS.format(self.collection_id, API_KEY)
+        self.url = API_COLLECTION_THINGS.format(self.collection_id, api_key)
 
         self.download_dir = os.path.join(directory,
                                          "{}-{}".format(slugify(self.user), slugify(self.name)))
@@ -306,11 +343,10 @@ class Collection(Grouping):
 class Designs(Grouping):
     """ Holds details of all of a users' designs. """
 
-    def __init__(self, user, directory, quick, compress):
-        Grouping.__init__(self, quick, compress)
+    def __init__(self, user, directory, quick, compress, api_key):
+        Grouping.__init__(self, quick, compress, api_key)
         self.user = user
-        self.url = API_USER_DESIGNS.format(user)
-        self.paginated = True
+        self.url = API_USER_DESIGNS.format(user, api_key)
         self.download_dir = os.path.join(
             directory, "{} designs".format(slugify(self.user)))
 
@@ -321,7 +357,6 @@ class Thing:
     def __init__(self, thing_link):
         self.thing_id = thing_link.thing_id
         self.name = thing_link.name
-        self.api_link = thing_link.api_link
         self.last_time = None
         self._parsed = False
         self._needs_download = True
@@ -331,13 +366,22 @@ class Thing:
         self._file_links = FileLinks()
         self._image_links = []
 
-    def _parse(self, base_dir):
+    @classmethod
+    def from_thing_id(cls, thing_id):
+        """
+        Factory method that looks up a thing by ID and creates a Thing object for it
+        :param thing_id: to look up
+        :return: Thing or None
+        """
+        return cls(ThingLink(thing_id, "", ""))
+
+    def _parse(self, base_dir, api_key):
         """ Work out what, if anything needs to be done. """
         if self._parsed:
             return
 
         # First get the broad details
-        url = API_THING_DETAILS.format(self.thing_id, API_KEY)
+        url = API_THING_DETAILS.format(self.thing_id, api_key)
         try:
             current_req = SESSION.get(url)
         except requests.exceptions.ConnectionError as error:
@@ -359,14 +403,29 @@ class Thing:
         except KeyError:
             logging.warning("No license found for thing {}?".format(self.thing_id))
 
-        # TODO: Get non-html version of this?
+        details = None
         try:
-            self._details = thing_json['details']
+            details = thing_json['details']
         except KeyError:
             logging.warning("No description found for thing {}?".format(self.thing_id))
 
+        if details:
+            try:
+                self._details = MLStripper.strip_tags(details)
+            except ValueError as e:
+                logging.warning("Unable to strip HTML from readme: {}".format(e))
+                self._details = details
+
+        if not self.name:
+            # Probably generated with factory method.
+            try:
+                self.name = thing_json['name']
+            except KeyError:
+                logging.warning("No name found for thing {}?".format(self.thing_id))
+                self.name = self.thing_id
+
         # Now get the file details
-        file_url = API_THING_FILES.format(self.thing_id, API_KEY)
+        file_url = API_THING_FILES.format(self.thing_id, api_key)
 
         try:
             current_req = SESSION.get(file_url)
@@ -391,12 +450,12 @@ class Thing:
             try:
                 datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
                 self._file_links.append(
-                    FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(API_KEY)))
+                    FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(api_key)))
             except ValueError:
                 logging.error(link['date'])
 
         # Finally get the image links
-        image_url = API_THING_IMAGES.format(self.thing_id, API_KEY)
+        image_url = API_THING_IMAGES.format(self.thing_id, api_key)
 
         try:
             current_req = SESSION.get(image_url)
@@ -534,14 +593,14 @@ class Thing:
                 latest_time = candidate_time
                 latest = candidate
         logging.info("Found last old thing: {} / {}".format(latest, latest_time))
-        return (latest, latest_time)
+        return latest, latest_time
 
-    def download(self, base_dir, compress):
+    def download(self, base_dir, compress, api_key):
         """ Download all files for a given thing. 
             Returns True iff the thing is now downloaded (not iff it downloads the thing!)
         """
         if not self._parsed:
-            self._parse(base_dir)
+            self._parse(base_dir, api_key)
 
         if not self._parsed:
             logging.error(
@@ -623,7 +682,9 @@ class Thing:
                     file_link.name, file_link.link, file_name))
                 data_req = SESSION.get(file_link.link)
                 if data_req.status_code != 200:
-                    logging.error("Unexpected status code {} for {}: {}".format(data_req.status_code,
+                    logging.error("Unexpected status code {} for {}".format(data_req.status_code,
+                                                                                sanitise_url(file_link.link)))
+                    logging.debug("Unexpected status code {} for {}: {}".format(data_req.status_code,
                                                                                 sanitise_url(file_link.link),
                                                                                 data_req.text))
                     fail_dir(self.download_dir)
@@ -705,7 +766,7 @@ class Thing:
         return State.OK
 
 
-def do_batch(batch_file, download_dir, quick, compress):
+def do_batch(batch_file, download_dir, quick, compress, api_key):
     """ Read a file in line by line, parsing each as a set of calls to this script."""
     with open(batch_file) as handle:
         for line in handle:
@@ -718,18 +779,18 @@ def do_batch(batch_file, download_dir, quick, compress):
             if command_arr[0] == "thing":
                 logging.debug(
                     "Handling batch thing instruction: {}".format(line))
-                Thing(command_arr[1]).download(download_dir, compress)
+                Thing.from_thing_id(command_arr[1]).download(download_dir, compress, api_key)
                 continue
             if command_arr[0] == "collection":
                 logging.debug(
                     "Handling batch collection instruction: {}".format(line))
                 Collection(command_arr[1], command_arr[2],
-                           download_dir, quick, compress).download()
+                           download_dir, quick, compress, api_key).download()
                 continue
             if command_arr[0] == "user":
                 logging.debug(
                     "Handling batch collection instruction: {}".format(line))
-                Designs(command_arr[1], download_dir, quick, compress).download()
+                Designs(command_arr[1], download_dir, quick, compress, api_key).download()
                 continue
             logging.warning("Unable to parse current instruction. Skipping.")
 
@@ -785,13 +846,12 @@ def main():
     console_handler = logging.StreamHandler()
     console_handler.setLevel(args.log_level.upper())
 
-    global API_KEY
     if args.api_key:
-        API_KEY = args.api_key
+        api_key = args.api_key
     else:
         try:
             with open("api.key") as fh:
-                API_KEY = fh.read().strip()
+                api_key = fh.read().strip()
         except Exception as e:
             logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
             logging.error("Exception: {}".format(e))
@@ -807,23 +867,23 @@ def main():
     # Start downloader
     thing_queue = multiprocessing.JoinableQueue()
     logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
-    downloaders = [Downloader(thing_queue, args.directory, args.compress) for _ in range(DOWNLOADER_COUNT)]
+    downloaders = [Downloader(thing_queue, args.directory, args.compress, api_key) for _ in range(DOWNLOADER_COUNT)]
     for downloader in downloaders:
         downloader.start()
 
     if args.subcommand.startswith("collection"):
         for collection in args.collections:
-            Collection(args.owner, collection, args.directory, args.quick, args.compress).download()
+            Collection(args.owner, collection, args.directory, args.quick, args.compress, api_key).download()
     if args.subcommand == "thing":
         for thing in args.things:
             thing_queue.put(thing)
     if args.subcommand == "user":
         for user in args.users:
-            Designs(user, args.directory, args.quick, args.compress).download()
+            Designs(user, args.directory, args.quick, args.compress, api_key).download()
     if args.subcommand == "version":
         print("thingy_grabber.py version {}".format(VERSION))
     if args.subcommand == "batch":
-        do_batch(args.batch_file, args.directory, args.quick, args.compress)
+        do_batch(args.batch_file, args.directory, args.quick, args.compress, api_key)
 
     # Stop the downloader processes
     for _ in downloaders: