Make readmes text files

author Oliver Matthews <oliver@codersoffortune.net>

Mon, 14 Sep 2020 15:09:35 +0000 (16:09 +0100)

committer Oliver Matthews <oliver@codersoffortune.net>

Mon, 14 Sep 2020 15:09:35 +0000 (16:09 +0100)
author Oliver Matthews <oliver@codersoffortune.net>
Mon, 14 Sep 2020 15:09:35 +0000 (16:09 +0100)
committer Oliver Matthews <oliver@codersoffortune.net>
Mon, 14 Sep 2020 15:09:35 +0000 (16:09 +0100)
diff --git a/README.md b/README.md

index 522e243..16e5273 100644 (file)
--- a/README.md
+++ b/README.md
@@ -124,6 +124,9 @@ python3, requests, py7xr (>=0.8.2)
  - If there is an updated file, the old directory will be moved to `name_timestamp` where `timestamp` is the last upload time of the old files. The code will then copy unchanged files across and download any new ones.
  
  ## Changelog
+* v0.10.4
+  - Readme.txt files are now text files, not HTML files.
+  - Removed some debug print statements that were accidentally left in the last release (oops).
  * v0.10.3
    - Handle trailing whitespace in thing names
    - Fix raw thing grabbing
diff --git a/thingy_grabber.py b/thingy_grabber.py

index 66b3674..a9177a2 100755 (executable)
--- a/thingy_grabber.py
+++ b/thingy_grabber.py
@@ -18,6 +18,8 @@ from dataclasses import dataclass
  import py7zr
  import glob
  import shutil
+from io import StringIO
+from html.parser import HTMLParser
  
  SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]
  
@@ -46,13 +48,39 @@ RETRY_COUNT = 3
  
  MAX_PATH_LENGTH = 250
  
-VERSION = "0.10.3"
+VERSION = "0.10.4"
  
  TIMESTAMP_FILE = "timestamp.txt"
  
  SESSION = requests.Session()
  
  
+class MLStripper(HTMLParser):  # HTMLParser subclass that collects only the text content fed to it
+    """ Turns HTML markup into plain text
+    """
+
+    def error(self, message):  # report parser errors as ValueError
+        raise ValueError(message)
+
+    def __init__(self):
+        super().__init__()
+        self.reset()  # NOTE(review): presumably redundant right after HTMLParser.__init__ - confirm
+        self.strict = False  # NOTE(review): looks like a legacy flag - confirm the targeted Python versions still read it
+        self.convert_charrefs= True  # presumably makes the parser decode character references before handle_data - confirm
+        self.text = StringIO()  # buffer that handle_data appends every text chunk to
+
+    def handle_data(self, d):  # parser callback: d is a chunk of text; append it to the buffer
+        self.text.write(d)
+
+    def get_data(self):  # return everything collected so far as a single string
+        return self.text.getvalue()
+
+    @staticmethod
+    def strip_tags(html):  # one-shot helper: feed html through a fresh stripper and return the collected text
+        s = MLStripper()
+        s.feed(html)  # NOTE(review): close() is never called, so text the parser is still buffering may be missing - confirm
+        return s.get_data()
+
  @dataclass
  class ThingLink:
      thing_id: str
@@ -353,7 +381,6 @@ class Thing:
  
          # First get the broad details
          url = API_THING_DETAILS.format(self.thing_id, api_key)
-        logging.error(url)
          try:
              current_req = SESSION.get(url)
          except requests.exceptions.ConnectionError as error:
@@ -375,12 +402,21 @@ class Thing:
          except KeyError:
              logging.warning("No license found for thing {}?".format(self.thing_id))
  
-        # TODO: Get non-html version of this?
+        details = None
          try:
-            self._details = thing_json['details']
+            details = thing_json['details']
          except KeyError:
              logging.warning("No description found for thing {}?".format(self.thing_id))
  
+
+        if details:
+            try:
+                self._details = MLStripper.strip_tags(details)
+            except ValueError as e:
+                logging.warning("Unable to strip HTML from readme: {}".format(e))
+                self._details = details
+
+
          if not self.name:
              # Probably generated with factory method.
              try:
author	Oliver Matthews <oliver@codersoffortune.net>
	Mon, 14 Sep 2020 15:09:35 +0000 (16:09 +0100)
committer	Oliver Matthews <oliver@codersoffortune.net>
	Mon, 14 Sep 2020 15:09:35 +0000 (16:09 +0100)
README.md		patch \| blob \| blame \| history
thingy_grabber.py		patch \| blob \| blame \| history