From 73695baf344cdd5d57a9bd2c5683d6f4682bdfaf Mon Sep 17 00:00:00 2001
From: Oliver Matthews
Date: Mon, 7 Sep 2020 09:52:59 +0100
Subject: [PATCH] cleanup in progress

---
 thingy_grabber.py | 190 +++++++++++++++++++++-------------------------
 1 file changed, 88 insertions(+), 102 deletions(-)

diff --git a/thingy_grabber.py b/thingy_grabber.py
index 59b5ff5..a53b164 100755
--- a/thingy_grabber.py
+++ b/thingy_grabber.py
@@ -15,7 +15,6 @@ import enum
 import datetime
 from shutil import copyfile
 from dataclasses import dataclass
-import atexit
 import py7zr
 import glob
 import shutil
@@ -27,9 +26,9 @@ DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
 # Windows cannot handle : in filenames
 SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'
 
-API_BASE="https://api.thingiverse.com"
-ACCESS_QP="access_token={}"
-PAGE_QP="page={}"
+API_BASE = "https://api.thingiverse.com"
+ACCESS_QP = "access_token={}"
+PAGE_QP = "page={}"
 
 API_USER_DESIGNS = API_BASE + "/users/{}/things/"
 API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP
@@ -55,28 +54,34 @@ TIMESTAMP_FILE = "timestamp.txt"
 
 SESSION = requests.Session()
 
+
 @dataclass
 class ThingLink:
     thing_id: str
     name: str
     api_link: str
 
+
 @dataclass
 class FileLink:
     name: str
     last_update: datetime.datetime
     link: str
 
+
 @dataclass
 class ImageLink:
     name: str
     link: str
 
+
 class FileLinks:
-    def __init__(self, initial_links=[]):
+    def __init__(self, initial_links=None):
+        if initial_links is None:
+            initial_links = []
         self.links = []
         self.last_update = None
-        for link in initial_links: 
+        for link in initial_links:
             self.append(link)
 
     def __iter__(self):
@@ -101,6 +106,7 @@ class State(enum.Enum):
     FAILED = enum.auto()
     ALREADY_DOWNLOADED = enum.auto()
 
+
 def sanitise_url(url):
     """ remove api keys from an url
     """
@@ -108,18 +114,20 @@ def sanitise_url(url):
                   'access_token=***',
                   url)
 
+
 def strip_time(date_obj):
     """ Takes a datetime object and returns another with the time set to 00:00
     """
     return datetime.datetime.combine(date_obj.date(), datetime.time())
 
+
 def rename_unique(dir_name, target_dir_name):
     """ Move a directory sideways to a new name, ensuring it is unique.
     """
     target_dir = target_dir_name
     inc = 0
     while os.path.exists(target_dir):
-        target_dir = "{}_{}".format(target_dir_name, inc) 
-        inc += 1 
+        target_dir = "{}_{}".format(target_dir_name, inc)
+        inc += 1
     os.rename(dir_name, target_dir)
     return target_dir
@@ -128,7 +136,7 @@
 def fail_dir(dir_name):
     """ When a download has failed, move it sideways.
     """
-    return rename_unique(dir_name,"{}_failed".format(dir_name))
+    return rename_unique(dir_name, "{}_failed".format(dir_name))
 
 
 def truncate_name(file_name):
@@ -147,11 +155,6 @@ def truncate_name(file_name):
     return new_path
 
 
-def strip_ws(value):
-    """ Remove whitespace from a string """
-    return str(NO_WHITESPACE_REGEX.sub('-', value))
-
-
 def slugify(value):
     """
     Normalise string, removes invalid for filename charactersr
@@ -159,7 +162,7 @@ def slugify(value):
     and converts to lowercase
     """
     logging.debug("Sluggyfying {}".format(value))
     value = unicodedata.normalize('NFKC', value).lower().strip()
-    value = re.sub(r'[\\/<>:\?\*\|"]', '', value)
+    value = re.sub(r'[\\/<>:?*|"]', '', value)
     value = re.sub(r'\.*$', '', value)
     return value
@@ -180,7 +183,7 @@ class Downloader(multiprocessing.Process):
         """ actual download loop.
""" while True: - thing_id = self.thing_queue.get() + thing_id = self.thing_queue.get if thing_id is None: logging.info("Shutting download queue") self.thing_queue.task_done() @@ -191,9 +194,6 @@ class Downloader(multiprocessing.Process): return - - - class Grouping: """ Holds details of a group of things for download This is effectively (although not actually) an abstract class @@ -207,12 +207,13 @@ class Grouping: self.last_page = 0 self.per_page = None # Should we stop downloading when we hit a known datestamp? - self.quick = quick + self.quick = quick self.compress = compress # These should be set by child classes. self.url = None self.download_dir = None + @property def get(self): """ retrieve the things of the grouping. """ if self.things: @@ -227,35 +228,19 @@ class Grouping: # Get the internal details of the grouping. logging.debug("Querying {}".format(sanitise_url(self.url))) page = 0 - # TODO:: Must be a way to refactor this cleanly - if self.paginated: - # Slightly nasty, but afaik python lacks a clean way to do partial string formatting. - page_url = self.url + "?" + ACCESS_QP + "&" + PAGE_QP - while True: - page += 1 - current_url = page_url.format(API_KEY, page) - logging.info("requesting:{}".format(sanitise_url(current_url))) - current_req = SESSION.get(current_url) - if current_req.status_code != 200: - logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(current_url), current_req.text)) - break - current_json = current_req.json() - if not current_json: - # No more! - break - for thing in current_json: - self.things.append(ThingLink(thing['id'], thing['name'], thing['url'])) + + # self.url should already have been formatted as we don't need pagination + logging.info("requesting:{}".format(sanitise_url(self.url))) + current_req = SESSION.get(self.url) + if current_req.status_code != 200: + logging.error( + "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url), + current_req.text)) else: - # self.url should already have been formatted as we don't need pagination - logging.info("requesting:{}".format(sanitise_url(self.url))) - current_req = SESSION.get(self.url) - if current_req.status_code != 200: - logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(current_url), current_req.text)) - else: - current_json = current_req.json() - for thing in current_json: - logging.info(thing) - self.things.append(ThingLink(thing['id'], thing['name'], thing['url'])) + current_json = current_req.json() + for thing in current_json: + logging.info(thing) + self.things.append(ThingLink(thing['id'], thing['name'], thing['url'])) logging.info("Found {} things.".format(len(self.things))) return self.things @@ -278,10 +263,11 @@ class Grouping: for idx, thing in enumerate(self.things): logging.info("Downloading thing {} - {}".format(idx, thing)) RC = Thing(thing).download(self.download_dir, self.compress) - if self.quick and RC==State.ALREADY_DOWNLOADED: + if self.quick and RC == State.ALREADY_DOWNLOADED: logging.info("Caught up, stopping.") return + class Collection(Grouping): """ Holds details of a collection. 
""" @@ -295,11 +281,13 @@ class Collection(Grouping): try: current_req = SESSION.get(collection_url) except requests.exceptions.ConnectionError as error: - logging.error("Unable to connect for thing {}: {}".format( - self.thing_id, error)) + logging.error("Unable to connect for collections for user {}: {}".format( + self.user, error)) return if current_req.status_code != 200: - logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(collection_url), current_req.text)) + logging.error( + "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(collection_url), + current_req.text)) return collection_list = current_req.json() try: @@ -348,7 +336,6 @@ class Thing: if self._parsed: return - # First get the broad details url = API_THING_DETAILS.format(self.thing_id, API_KEY) try: @@ -362,7 +349,8 @@ class Thing: logging.error("Access to thing {} is forbidden".format(self.thing_id)) return if current_req.status_code != 200: - logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(url), current_req.text)) + logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(url), + current_req.text)) return thing_json = current_req.json() @@ -377,8 +365,6 @@ class Thing: except KeyError: logging.warning("No description found for thing {}?".format(self.thing_id)) - - # Now get the file details file_url = API_THING_FILES.format(self.thing_id, API_KEY) @@ -390,19 +376,22 @@ class Thing: return if current_req.status_code != 200: - logging.error("Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(file_url), current_req.text)) + logging.error("Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(file_url), + current_req.text)) return link_list = current_req.json() if not link_list: - logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(self.thing_id)) + logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format( + self.thing_id)) for link in link_list: logging.debug("Parsing link: {}".format(sanitise_url(link['url']))) try: datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT) - self._file_links.append(FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(API_KEY))) + self._file_links.append( + FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(API_KEY))) except ValueError: logging.error(link['date']) @@ -417,20 +406,25 @@ class Thing: return if current_req.status_code != 200: - logging.error("Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(image_url), current_req.text)) + logging.error( + "Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(image_url), + current_req.text)) return image_list = current_req.json() if not image_list: - logging.warning("No images found for thing {} - probably thingiverse being iffy as this seems unlikely".format(self.thing_id)) + logging.warning( + "No images found for thing {} - probably thingiverse being iffy as this seems unlikely".format( + self.thing_id)) for image in image_list: logging.debug("parsing image: {}".format(image)) + name = None try: name = slugify(image['name']) # TODO: fallback to other types - url = [x for x in image['sizes'] if x['type']=='display' and x['size']=='large'][0]['url'] + url = [x for x in image['sizes'] if 
+                       x['type'] == 'display' and x['size'] == 'large'][0]['url']
             except KeyError:
                 logging.warning("Missing image for {}".format(name))
             self._image_links.append(ImageLink(name, url))
@@ -444,10 +438,9 @@
         latest, self.last_time = self._find_last_download(base_dir)
 
         if not latest:
-            # Not yet downloaded
-            self._parsed = True
-            return
-
+            # Not yet downloaded
+            self._parsed = True
+            return
 
         logging.info("last downloaded version: {}".format(self.last_time))
@@ -461,7 +454,6 @@
             logging.info("Dropping time from comparison stamp as old-style download dir")
             files_last_update = strip_time(files_last_update)
 
-
         if files_last_update > self.last_time:
             logging.info(
                 "Found new/updated files {}".format(self._file_links.last_update))
@@ -483,7 +475,7 @@
             logging.warning("Found old style download_dir. Moving.")
             rename_unique(old_dir, self.download_dir)
 
-    def _handle_outdated_directory(self, base_dir):
+    def _handle_outdated_directory(self):
         """ Move the current download directory sideways if the thing has changed.
         """
         if not os.path.exists(self.download_dir):
@@ -522,11 +514,10 @@
 
         # TODO: Maybe look for old download directories.
 
-
         # Now look for 7z files
         candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
         # +3 to allow for ' - '
-        leading_length =len(self.slug)+3
+        leading_length = len(self.slug) + 3
         for path in candidates:
             candidate = os.path.basename(path)
             try:
@@ -542,11 +533,9 @@
             except TypeError:
                 latest_time = candidate_time
                 latest = candidate
-        logging.info("Found last old thing: {} / {}".format(latest,latest_time))
+        logging.info("Found last old thing: {} / {}".format(latest, latest_time))
         return (latest, latest_time)
 
-
-
     def download(self, base_dir, compress):
         """ Download all files for a given thing.
             Returns True iff the thing is now downloaded (not iff it downloads the thing!)
@@ -564,11 +553,12 @@
             return State.ALREADY_DOWNLOADED
 
         if not self._file_links:
-            logging.error("{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.name))
+            logging.error(
+                "{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.name))
             return State.FAILED
 
         # Have we already downloaded some things?
-        renamed_dir = self._handle_outdated_directory(base_dir)
+        renamed_dir = self._handle_outdated_directory()
 
         # Get the list of files to download
@@ -581,7 +571,7 @@
             logging.debug("No last time, downloading all files")
             new_file_links = self._file_links
             self.time_stamp = new_file_links[0].last_update
-        
+
         for file_link in new_file_links:
             self.time_stamp = max(self.time_stamp, file_link.last_update)
         logging.debug("New timestamp will be {}".format(self.time_stamp))
@@ -602,8 +592,7 @@
         filelist_file = os.path.join(self.download_dir, "filelist.txt")
         with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
             for fl in self._file_links:
-                fl_handle.write("{},{},{}\n".format(fl.link, fl.name, fl.last_update)) 
-
+                fl_handle.write("{},{},{}\n".format(fl.link, fl.name, fl.last_update))
 
         # First grab the cached files (if any)
         logging.info("Copying {} unchanged files.".format(len(old_file_links)))
@@ -624,7 +613,6 @@
                     "Typeerror looking for {} in {}".format(file_link.name, renamed_dir))
                 new_file_links.append(file_link)
 
-
         # Now download the new ones
         logging.info("Downloading {} new files of {}".format(
             len(new_file_links), len(self._file_links)))
@@ -635,10 +623,11 @@
                 file_link.name, file_link.link, file_name))
             data_req = SESSION.get(file_link.link)
             if data_req.status_code != 200:
-                logging.error("Unexpected status code {} for {}: {}".format(data_req.status_code, sanitise_url(file_link.link), data_req.text))
+                logging.error("Unexpected status code {} for {}: {}".format(data_req.status_code,
+                                                                            sanitise_url(file_link.link),
+                                                                            data_req.text))
                 fail_dir(self.download_dir)
                 return State.FAILED
-
             with open(file_name, 'wb') as handle:
                 handle.write(data_req.content)
 
@@ -647,7 +636,6 @@
             fail_dir(self.download_dir)
             return State.FAILED
 
-
         # People like images.
         image_dir = os.path.join(self.download_dir, 'images')
         logging.info("Downloading {} images.".format(len(self._image_links)))
@@ -657,7 +645,9 @@
                 filename = os.path.join(image_dir, imagelink.name)
                 image_req = SESSION.get(imagelink.link)
                 if image_req.status_code != 200:
-                    logging.error("Unexpected status code {} for {}: {}".format(image_req.status_code, sanitise_url(file_link.link), image_req.text))
+                    logging.error("Unexpected status code {} for {}: {}".format(image_req.status_code,
+                                                                                sanitise_url(imagelink.link),
+                                                                                image_req.text))
                     fail_dir(self.download_dir)
                     return State.FAILED
                 with open(truncate_name(filename), 'wb') as handle:
@@ -671,7 +661,8 @@
         logging.info("writing license file")
         try:
             if self._license:
-                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w', encoding="utf-8") as license_handle:
+                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w',
+                          encoding="utf-8") as license_handle:
                     license_handle.write("{}\n".format(self._license))
         except IOError as exception:
             logging.warning("Failed to write license! {}".format(exception))
@@ -679,14 +670,15 @@
         logging.info("writing readme")
         try:
             if self._details:
-                with open(truncate_name(os.path.join(self.download_dir, 'readme.txt')), 'w', encoding="utf-8") as readme_handle:
+                with open(truncate_name(os.path.join(self.download_dir, 'readme.txt')), 'w',
+                          encoding="utf-8") as readme_handle:
                     readme_handle.write("{}\n".format(self._details))
         except IOError as exception:
             logging.warning("Failed to write readme! {}".format(exception))
 
         try:
             # Now write the timestamp
-            with open(os.path.join(self.download_dir,TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
+            with open(os.path.join(self.download_dir, TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
                 timestamp_handle.write(self.time_stamp.__str__())
         except Exception as exception:
             logging.error("Failed to write timestamp file - {}".format(exception))
@@ -697,12 +689,11 @@
         if not compress:
             return State.OK
 
-
         thing_dir = "{} - {} - {}".format(self.thing_id,
-                                            slugify(self.name),
-                                            self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
+                                          slugify(self.name),
+                                          self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
         file_name = os.path.join(base_dir,
-                                    "{}.7z".format(thing_dir))
+                                 "{}.7z".format(thing_dir))
         logging.debug("Compressing {} to {}".format(
             self.name, file_name))
 
@@ -714,8 +705,6 @@
 
     return State.OK
 
-
-
 def do_batch(batch_file, download_dir, quick, compress):
     """ Read a file in line by line, parsing each as a set of calls to this script."""
     with open(batch_file) as handle:
@@ -749,7 +738,7 @@ def main():
     """ Entry point for script being run as a command. """
     parser = argparse.ArgumentParser()
     parser.add_argument("-l", "--log-level", choices=[
-        'debug', 'info', 'warning'], default='info', help="level of logging desired")
+                        'debug', 'info', 'warning'], default='info', help="level of logging desired")
     parser.add_argument("-d", "--directory",
                         help="Target directory to download into")
     parser.add_argument("-f", "--log-file",
@@ -760,7 +749,6 @@ def main():
                         help="Compress files")
     parser.add_argument("-a", "--api-key",
                         help="API key for thingiverse")
-
     subparsers = parser.add_subparsers(
         help="Type of thing to download", dest="subcommand")
@@ -769,13 +757,13 @@
     collection_parser = subparsers.add_parser(
         'collection', help="Download an entire collection.")
     collection_parser.add_argument(
         "owner", help="The owner of the collection(s) to get")
     collection_parser.add_argument(
-        "collections", nargs="+", help="Space seperated list of the name(s) of collection to get")
+        "collections", nargs="+", help="Space seperated list of the name(s) of collection to get")
     thing_parser = subparsers.add_parser(
         'thing', help="Download a single thing.")
     thing_parser.add_argument(
         "things", nargs="*", help="Space seperated list of thing ID(s) to download")
     user_parser = subparsers.add_parser(
-        "user", help="Download all things by one or more users")
+        "user", help="Download all things by one or more users")
     user_parser.add_argument(
         "users", nargs="+", help="A space seperated list of the user(s) to get the designs of")
     batch_parser = subparsers.add_parser(
@@ -799,11 +787,11 @@
 
     global API_KEY
     if args.api_key:
-        API_KEY=args.api_key
+        API_KEY = args.api_key
     else:
         try:
             with open("api.key") as fh:
-                API_KEY=fh.read().strip()
+                API_KEY = fh.read().strip()
         except Exception as e:
             logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
             logging.error("Exception: {}".format(e))
@@ -816,7 +804,6 @@
         file_handler.setFormatter(formatter)
         logger.addHandler(file_handler)
 
-
     # Start downloader
     thing_queue = multiprocessing.JoinableQueue()
     logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
@@ -824,7 +811,6 @@
     for downloader in downloaders:
         downloader.start()
 
-
     if args.subcommand.startswith("collection"):
         for collection in args.collections:
             Collection(args.owner, collection, args.directory, args.quick, args.compress).download()
@@ -840,10 +826,10 @@
         do_batch(args.batch_file, args.directory, args.quick, args.compress)
 
     # Stop the downloader processes
-    for downloader in downloaders:
+    for _ in downloaders:
        thing_queue.put(None)
 
 
-if __name__ == "__main__":
+if __name__ == "__main__":
     multiprocessing.freeze_support()
     main()
-- 
2.20.1
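
Note on the FileLinks.__init__ hunk: replacing the initial_links=[] default with a None
sentinel avoids Python's shared-mutable-default pitfall, where the default list is built
once at function definition time and then shared by every call. A minimal standalone
sketch of the difference (class names are illustrative, not from the patch):

    class Buggy:
        def __init__(self, links=[]):    # one list object shared by every instance
            self.links = links

    class Fixed:
        def __init__(self, links=None):  # sentinel; a fresh list per instance
            if links is None:
                links = []
            self.links = links

    a, b = Buggy(), Buggy()
    a.links.append("x")
    assert b.links == ["x"]  # surprise: b sees a's append via the shared default

    c, d = Fixed(), Fixed()
    c.links.append("x")
    assert d.links == []     # each instance owns its own list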
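Note on the @property added to Grouping.get: it turns the fetch into a lazy,
attribute-style accessor - the first access does the network work and fills
self.things, later accesses return the cached list. One consequence worth checking:
any remaining call site written as grouping.get() would now raise
TypeError ("'list' object is not callable"). A standalone sketch of the pattern
(illustrative names; the real method performs the HTTP requests and error logging):

    class Catalogue:
        def __init__(self):
            self._things = None

        @property
        def things(self):
            # do the expensive work only once, then serve the cache
            if self._things is None:
                self._things = ["thing-1", "thing-2"]  # stand-in for the API call
            return self._things

    c = Catalogue()
    first = c.things    # triggers the fetch
    second = c.things   # served from the cache; no second fetch
    assert first is second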
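Note on the shutdown loop at the end of main(): one None sentinel is queued per
downloader process ("for _ in downloaders"), and each Downloader.run exits when it
dequeues a sentinel. This only works because thing_queue.get() is actually called -
the unparenthesised form seen in the "-" line binds the method object, which is never
None, so the workers would never stop. A self-contained sketch of the idiom
(illustrative worker, not the real Downloader class):

    import multiprocessing

    def worker(queue):
        while True:
            item = queue.get()        # blocking call; note the parentheses
            if item is None:          # sentinel: time to shut down
                queue.task_done()
                break
            print("processing", item)
            queue.task_done()

    if __name__ == "__main__":
        queue = multiprocessing.JoinableQueue()
        workers = [multiprocessing.Process(target=worker, args=(queue,))
                   for _ in range(2)]
        for w in workers:
            w.start()
        for item in ("a", "b", "c"):
            queue.put(item)
        for _ in workers:             # exactly one sentinel per worker
            queue.put(None)
        queue.join()                  # returns once every put() has been task_done()'d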