add error handling
author Oliver Matthews <oliver@codersoffortune.net>
Tue, 28 Jan 2020 09:52:48 +0000 (09:52 +0000)
committer Oliver Matthews <oliver@codersoffortune.net>
Tue, 28 Jan 2020 09:52:48 +0000 (09:52 +0000)
README.md
thingy_grabber.py

index 2d9a103..9063329 100644 (file)
--- a/README.md
+++ b/README.md
@@ -75,6 +75,8 @@ python3, beautifulsoup4, requests, lxml
 - If there is an updated file, the old directory will be moved to `name_timestamp` where `timestamp` is the last upload time of the old files. The code will then copy unchanged files across and download any new ones.
 
 ## Changelog
+* v0.6.2
+  - Added catches for 404s, 504s and malformed pages
 * v0.6.1
   - now downloads readme.txt and licence details
 * v0.6.0
index 63c929a..bdddb8a 100755 (executable)
--- a/thingy_grabber.py
+++ b/thingy_grabber.py
@@ -173,12 +173,30 @@ class Thing:
             return
 
         url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
-        req = requests.get(url)
+        try:
+          req = requests.get(url)
+        except requests.exceptions.ConnectionError as error:
+          logging.error("Unable to connect for thing {}: {}".format(self.thing_id, error))
+          return
+
         self.text = req.text
         soup = BeautifulSoup(self.text, features='lxml')
         #import code
         #code.interact(local=dict(globals(), **locals()))
-        self.title = slugify(soup.find_all('h1')[0].text.strip())
+        try:
+          self.title = slugify(soup.find_all('h1')[0].text.strip())
+        except IndexError:
+          logging.warning("No title found for thing {}".format(self.thing_id))
+          self.title = self.thing_id
+
+        if req.status_code == 404:
+          logging.warning("404 for thing {} - DMCA or invalid number?".format(self.thing_id))
+          return
+
+        if req.status_code > 299:
+          logging.warning("bad status code {}  for thing {} - try again later?".format(req.status_code, self.thing_id))
+          return
+
         self.download_dir = os.path.join(base_dir, self.title)
 
         logging.debug("Parsing {} ({})".format(self.thing_id, self.title))
@@ -229,6 +247,10 @@ class Thing:
         if not self._parsed:
             self._parse(base_dir)
 
+        if not self._parsed:
+          logging.error("Unable to parse {} - aborting download".format(self.thing_id))
+          return
+
         if not self._needs_download:
             print("{} already downloaded - skipping.".format(self.title))
             return
@@ -258,7 +280,12 @@ class Thing:
         if not self.last_time:
             # If we don't have anything to copy from, then it is all new.
             new_file_links = file_links
-            new_last_time = file_links[0].find_all('time')[0]['datetime']
+            try:
+              new_last_time = file_links[0].find_all('time')[0]['datetime']
+            except:
+              import code
+              code.interact(local=dict(globals(), **locals()))
+
             for file_link in file_links:
                 timestamp = file_link.find_all('time')[0]['datetime']
                 logging.debug("Found file {} from {}".format(