return
url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
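+ # A dead network or DNS hiccup shouldn't abort the whole run; log and skip this thing.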
- req = requests.get(url)
+ try:
+     req = requests.get(url)
+ except requests.exceptions.ConnectionError as error:
+     logging.error("Unable to connect for thing {}: {}".format(self.thing_id, error))
+     return
+
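+ # Bail out on HTTP errors before parsing: a 404 (DMCA'd or invalid thing id) or any
+ # other non-2xx response won't contain a usable files page.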
+ if req.status_code == 404:
+     logging.warning("404 for thing {} - DMCA or invalid number?".format(self.thing_id))
+     return
+
+ if req.status_code > 299:
+     logging.warning("bad status code {} for thing {} - try again later?".format(req.status_code, self.thing_id))
+     return
+
self.text = req.text
soup = BeautifulSoup(self.text, features='lxml')
- #import code
- #code.interact(local=dict(globals(), **locals()))
- self.title = slugify(soup.find_all('h1')[0].text.strip())
+ try:
+     self.title = slugify(soup.find_all('h1')[0].text.strip())
+ except IndexError:
+     logging.warning("No title found for thing {}".format(self.thing_id))
+     self.title = str(self.thing_id)
+
self.download_dir = os.path.join(base_dir, self.title)
logging.debug("Parsing {} ({})".format(self.thing_id, self.title))
if not self._parsed:
    self._parse(base_dir)
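+ # _parse returns early on connection or HTTP errors, leaving _parsed unset.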
+ if not self._parsed:
+     logging.error("Unable to parse {} - aborting download".format(self.thing_id))
+     return
+
if not self._needs_download:
    print("{} already downloaded - skipping.".format(self.title))
    return
if not self.last_time:
    # If we don't have anything to copy from, then it is all new.
    new_file_links = file_links
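+     # The baseline timestamp comes from the first file link's <time> tag; without it
+     # there is nothing to compare against, so treat its absence as a hard error.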
-     new_last_time = file_links[0].find_all('time')[0]['datetime']
+     try:
+         new_last_time = file_links[0].find_all('time')[0]['datetime']
+     except (IndexError, KeyError):
+         logging.error("Unable to find a timestamp for thing {} - aborting.".format(self.thing_id))
+         return
+
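+ # Pull the timestamp off each file link for comparison against last_time.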
for file_link in file_links:
    timestamp = file_link.find_all('time')[0]['datetime']
    logging.debug("Found file {} from {}".format(