class MLStripper(HTMLParser):
    """Turn HTML markup into plain text.

    Tags are discarded and the text between them is accumulated in an
    in-memory buffer.  Character references (``&amp;``, ``&#38;`` ...) are
    converted to the characters they stand for before being stored.
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset()
        # is needed.  The legacy ``strict`` flag is not set any more: the
        # parser stopped supporting it in Python 3.5.
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def error(self, message):
        """Surface a parser failure as ``ValueError``.

        Kept so that callers have one exception type to catch if the
        underlying parser reports an error through this hook.

        Raises:
            ValueError: always, carrying *message*.
        """
        raise ValueError(message)

    def handle_data(self, d):
        """Append a run of text found between tags to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all the text collected so far as a single string."""
        return self.text.getvalue()

    @staticmethod
    def strip_tags(html):
        """Return *html* with all markup removed.

        Args:
            html: the HTML source to convert.  ``None`` or an empty string
                yields an empty result.

        Returns:
            The plain-text content of *html*.

        Raises:
            ValueError: if the parser reports an error via ``error()``.
        """
        if not html:
            return ""
        s = MLStripper()
        s.feed(html)
        # With convert_charrefs enabled the parser holds back trailing text
        # that contains a bare '&' (it might be the start of a character
        # reference split across feed() calls).  close() tells it that no
        # more input is coming so the held-back text is flushed instead of
        # being silently dropped.
        s.close()
        return s.get_data()
details url = API_THING_DETAILS.format(self.thing_id, api_key) - logging.error(url) try: current_req = SESSION.get(url) except requests.exceptions.ConnectionError as error: @@ -375,12 +402,21 @@ class Thing: except KeyError: logging.warning("No license found for thing {}?".format(self.thing_id)) - # TODO: Get non-html version of this? + details = None try: - self._details = thing_json['details'] + details = thing_json['details'] except KeyError: logging.warning("No description found for thing {}?".format(self.thing_id)) + + if details: + try: + self._details = MLStripper.strip_tags(details) + except ValueError as e: + logging.warning("Unable to strip HTML from readme: {}".format(e)) + self._details = details + + if not self.name: # Probably generated with factory method. try: -- 2.20.1