From e0e69fc6c9e95f91814e742d1adb95c1c8d0cd2e Mon Sep 17 00:00:00 2001 From: Oliver Matthews Date: Tue, 28 Jan 2020 09:52:48 +0000 Subject: [PATCH] add error handling --- README.md | 2 ++ thingy_grabber.py | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2d9a103..9063329 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,8 @@ python3, beautifulsoup4, requests, lxml - If there is an updated file, the old directory will be moved to `name_timestamp` where `timestamp` is the last upload time of the old files. The code will then copy unchanged files across and download any new ones. ## Changelog +* v0.6.2 + - Added catches for 404s, 504s and malformed pages * v0.6.1 - now downloads readme.txt and licence details * v0.6.0 diff --git a/thingy_grabber.py b/thingy_grabber.py index 63c929a..bdddb8a 100755 --- a/thingy_grabber.py +++ b/thingy_grabber.py @@ -173,12 +173,30 @@ class Thing: return url = "{}/thing:{}/files".format(URL_BASE, self.thing_id) - req = requests.get(url) + try: + req = requests.get(url) + except requests.exceptions.ConnectionError as error: + logging.error("Unable to connect for thing {}: {}".format(self.thing_id, error)) + return + self.text = req.text soup = BeautifulSoup(self.text, features='lxml') #import code #code.interact(local=dict(globals(), **locals())) - self.title = slugify(soup.find_all('h1')[0].text.strip()) + try: + self.title = slugify(soup.find_all('h1')[0].text.strip()) + except IndexError: + logging.warning("No title found for thing {}".format(self.thing_id)) + self.title = self.thing_id + + if req.status_code == 404: + logging.warning("404 for thing {} - DMCA or invalid number?".format(self.thing_id)) + return + + if req.status_code > 299: + logging.warning("bad status code {} for thing {} - try again later?".format(req.status_code, self.thing_id)) + return + self.download_dir = os.path.join(base_dir, self.title) logging.debug("Parsing {} ({})".format(self.thing_id, self.title)) @@ -229,6 +247,10 @@ class Thing: if not self._parsed: self._parse(base_dir) + if not self._parsed: + logging.error("Unable to parse {} - aborting download".format(self.thing_id)) + return + if not self._needs_download: print("{} already downloaded - skipping.".format(self.title)) return @@ -258,7 +280,12 @@ class Thing: if not self.last_time: # If we don't have anything to copy from, then it is all new. new_file_links = file_links - new_last_time = file_links[0].find_all('time')[0]['datetime'] + try: + new_last_time = file_links[0].find_all('time')[0]['datetime'] + except: + import code + code.interact(local=dict(globals(), **locals())) + for file_link in file_links: timestamp = file_link.find_all('time')[0]['datetime'] logging.debug("Found file {} from {}".format( -- 2.20.1