From 8ed15058a30ef052d78d3dfd0f72bde5bd03d313 Mon Sep 17 00:00:00 2001
From: Oliver Matthews <oliver@codersoffortune.net>
Date: Thu, 16 Jul 2020 11:25:17 +0100
Subject: [PATCH] add compression support

---
 README.md         |   9 ++-
 requirements.yml  |   1 +
 thingy_grabber.py | 168 ++++++++++++++++++++++++++++------------------
 3 files changed, 110 insertions(+), 68 deletions(-)

diff --git a/README.md b/README.md
index e78abff..0243605 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ Script for archiving thingiverse things. Due to this being a glorified webscrape
 
 ## Usage:
 ````
-usage: thingy_grabber.py [-h] [-l {debug,info,warning}] [-d DIRECTORY] [-f LOG_FILE] [-q] {collection,thing,user,batch,version} ...
+usage: thingy_grabber.py [-h] [-l {debug,info,warning}] [-d DIRECTORY] [-f LOG_FILE] [-q] [-c] {collection,thing,user,batch,version} ...
 
 positional arguments:
   {collection,thing,user,batch,version}
@@ -23,6 +23,7 @@ optional arguments:
   -f LOG_FILE, --log-file LOG_FILE
                         Place to log debug information to
   -q, --quick           Assume date ordering on posts
+  -c, --compress        Compress files
 ````
 
 ### Things
@@ -77,7 +78,7 @@ Download all designs by jim and bob into directories under `c:\downloads`, give
 `
 
 ## Requirements
-python3, beautifulsoup4, requests, lxml
+python3, beautifulsoup4, requests, lxml, py7xr (>=0.8.2)
 
 ## Current features:
 - can download an entire collection, creating seperate subdirs for each thing in the collection
@@ -85,6 +86,10 @@ python3, beautifulsoup4, requests, lxml
 - If there is an updated file, the old directory will be moved to `name_timestamp` where `timestamp` is the last upload time of the old files. The code will then copy unchanged files across and download any new ones.
 
 ## Changelog
+* v0.9.0
+  - Compression! New -c option will use 7z to create an archival copy of the file once downloaded. 
+    Note that although it will use the presence of 7z files to determine if a file has been updated, it currently _won't_ read old files from inside the 7z for handling updates, resulting in marginally larger bandwidth usage when dealing with partially updated things. This will be fixed later.
+  - Internal tidying of how old directories are handled - I've tested this fairly heavily, but do let me know if there are issues.
 * v0.8.7
   - Always, Always generate a valid time stamp.
 * v0.8.6
diff --git a/requirements.yml b/requirements.yml
index 0f92ad0..ff03206 100644
--- a/requirements.yml
+++ b/requirements.yml
@@ -5,3 +5,4 @@ dependencies:
   - beautifulsoup4
   - lxml
   - requests
+  - py7xr
diff --git a/thingy_grabber.py b/thingy_grabber.py
index 5c018bd..cda3939 100755
--- a/thingy_grabber.py
+++ b/thingy_grabber.py
@@ -24,9 +24,14 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.firefox.options import Options
 import atexit
 import py7zr
+import glob
+import shutil
 
 SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]
 
+# I don't think this is exported by datetime
+DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
+
 URL_BASE = "https://www.thingiverse.com"
 URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
 USER_COLLECTION = URL_BASE + "/ajax/user/designs"
@@ -45,6 +50,7 @@ MAX_PATH_LENGTH = 250
 
 VERSION = "0.9.0"
 
+TIMESTAMP_FILE = "timestamp.txt"
 
 #BROWSER = webdriver.PhantomJS('./phantomjs')
 options = Options()
@@ -82,23 +88,29 @@ class FileLinks:
         except TypeError:
             self.last_update = link.last_update
         self.links.append(link)
-    
+
 
 class State(enum.Enum):
     OK = enum.auto()
     FAILED = enum.auto()
     ALREADY_DOWNLOADED = enum.auto()
 
-
-def fail_dir(dir_name):
-    """ When a download has failed, move it sideways.
+def rename_unique(dir_name, target_dir_name):
+    """ Move a directory sideways to a new name, ensuring it is unique.
     """
-    target_dir = "{}_failed".format(dir_name)
+    target_dir = target_dir_name
     inc = 0
     while os.path.exists(target_dir):
-      target_dir = "{}_failed_{}".format(dir_name, inc)
+      target_dir = "{}_{}".format(target_dir_name, inc)
       inc += 1
     os.rename(dir_name, target_dir)
+    return target_dir
+
+
+def fail_dir(dir_name):
+    """ When a download has failed, move it sideways.
+    """
+    return rename_unique(dir_name,"{}_failed".format(dir_name))
 
 
 def truncate_name(file_name):
@@ -392,55 +404,21 @@ class Thing:
         self.pc = pc
 
 
-        self.old_download_dir = os.path.join(base_dir, slugify(self.title))
-        self.download_dir = os.path.join(base_dir, "{} - {}".format(self.thing_id, slugify(self.title)))
+        self.slug = "{} - {}".format(self.thing_id, slugify(self.title))
+        self.download_dir = os.path.join(base_dir, self.slug)
+
+        self._handle_old_directory(base_dir)
 
         logging.debug("Parsing {} ({})".format(self.thing_id, self.title))
+        latest, self.last_time = self._find_last_download(base_dir)
 
-        if not os.path.exists(self.download_dir):
-            logging.info("Looking for old dir at {}".format(self.old_download_dir))
-            if os.path.exists(self.old_download_dir):
-                logging.warning("Found previous style download directory. Moving it from {} to {}".format(self.old_download_dir, self.download_dir))
-                os.rename(self.old_download_dir, self.download_dir)
-            else:
+        if not latest:
                 # Not yet downloaded
                 self._parsed = True
                 return
 
-        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
-        if not os.path.exists(timestamp_file):
-            # Old download from before
-            logging.warning(
-                "Old-style download directory found. Assuming update required.")
-            self._parsed = True
-            return
 
-        try:
-            with open(timestamp_file, 'r') as timestamp_handle:
-                # add the .split(' ')[0] to remove the timestamp from the old style timestamps
-                last_bits = [int(x) for x in timestamp_handle.readlines()[0].split(' ')[0].split("-")]
-                logging.warning(last_bits)
-                if last_bits[0] == 0:
-                    last_bits[0] = 1
-                if last_bits[1] == 0:
-                    last_bits[1] = 1
-                if last_bits[2] == 0:
-                    last_bits[2] = 1980
-                try:
-                    self.last_time = datetime.datetime(last_bits[0], last_bits[1], last_bits[2])
-                except ValueError:
-                    # This one appears to be M D Y
-                    self.last_time = datetime.datetime(last_bits[2], last_bits[0], last_bits[1])
-
-            logging.info("last downloaded version: {}".format(self.last_time))
-        except FileNotFoundError:
-            # Not run on this thing before.
-            logging.info(
-                "Old-style download directory found. Assuming update required.")
-            self.last_time = None
-            self._needs_download = True
-            self._parsed = True
-            return
+        logging.info("last downloaded version: {}".format(self.last_time))
 
         # OK, so we have a timestamp, lets see if there is anything new to get
         try:
@@ -457,6 +435,78 @@ class Thing:
         self._needs_download = False
         self._parsed = True
 
+    def _handle_old_directory(self, base_dir):
+        """ Deal with any old directories from previous versions of the code.
+        """
+        old_dir = os.path.join(base_dir, slugify(self.title))
+        if os.path.exists(old_dir):
+            logging.warning("Found old style download_dir. Moving.")
+            rename_unique(old_dir, self.download_dir)
+
+    def _handle_outdated_directory(self, base_dir):
+        """ Move the current download directory sideways if the thing has changed.
+        """
+        if not os.path.exists(self.download_dir):
+            # No old directory to move.
+            return None
+        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
+        if not os.path.exists(timestamp_file):
+            # Old form of download directory
+            target_dir_name = "{} - old".format(self.download_dir)
+        else:
+            target_dir_name = "{} - {}".format(self.download_dir, slugify(self.last_time.__str__()))
+        return rename_unique(self.download_dir, target_dir_name)
+
+    def _find_last_download(self, base_dir):
+        """ Look for the most recent previous download (if any) of the thing.
+        """
+        logging.info("Looking for old things")
+
+        # First the DL directory itself.
+        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
+
+        latest = None
+        latest_time = None
+
+        try:
+            logging.debug("Checking for existing download in normal place.")
+            with open(timestamp_file) as ts_fh:
+                timestamp_text = ts_fh.read().strip()
+            latest_time = datetime.datetime.strptime(timestamp_text, DEFAULT_DATETIME_FORMAT)
+            latest = self.download_dir
+        except FileNotFoundError:
+            # No existing download directory. huh.
+            pass
+        except TypeError:
+            logging.warning("Invalid timestamp file found in {}".format(self.download_dir))
+
+        # TODO:  Maybe look for old download directories.
+
+
+        # Now look for 7z files
+        candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
+        # +3 to allow for ' - '
+        leading_length =len(self.slug)+3
+        for path in candidates:
+            candidate = os.path.basename(path)
+            try:
+                logging.debug("Examining '{}' - '{}'".format(candidate, candidate[leading_length:-3]))
+                candidate_time = datetime.datetime.strptime(candidate[leading_length:-3], DEFAULT_DATETIME_FORMAT)
+            except ValueError:
+                logging.warning("There was an error finding the date in {}. Ignoring.".format(candidate))
+                continue
+            try:
+                if candidate_time > latest_time:
+                    latest_time = candidate_time
+                    latest = candidate
+            except TypeError:
+                latest_time = candidate_time
+                latest = candidate
+        logging.info("Found last old thing: {} / {}".format(latest,latest_time))
+        return (latest, latest_time)
+
+
+
     def download(self, base_dir, compress):
         """ Download all files for a given thing. 
             Returns True iff the thing is now downloaded (not iff it downloads the thing!)
@@ -478,21 +528,7 @@ class Thing:
             return State.FAILED
 
         # Have we already downloaded some things?
-        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
-        prev_dir = None
-        if os.path.exists(self.download_dir):
-            if not os.path.exists(timestamp_file):
-                # edge case: old style dir w/out timestamp.
-                logging.warning("Old style download dir found at {}".format(self.title))
-                prev_count = 0
-                target_dir = "{}_old".format(self.download_dir)
-                while os.path.exists(target_dir):
-                    prev_count = prev_count + 1
-                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
-                os.rename(self.download_dir, target_dir)
-            else:
-                prev_dir = "{}_{}".format(self.download_dir, slugify(self.last_time.__str__()))
-                os.rename(self.download_dir, prev_dir)
+        renamed_dir = self._handle_outdated_directory(base_dir)
 
         # Get the list of files to download
 
@@ -539,7 +575,7 @@ class Thing:
         # First grab the cached files (if any)
         logging.info("Copying {} unchanged files.".format(len(old_file_links)))
         for file_link in old_file_links:
-            old_file = os.path.join(prev_dir, file_link.name)
+            old_file = os.path.join(renamed_dir, file_link.name)
             new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
             try:
                 logging.debug("Copying {} to {}".format(old_file, new_file))
@@ -608,7 +644,7 @@ class Thing:
 
         try:
             # Now write the timestamp
-            with open(timestamp_file, 'w', encoding="utf-8") as timestamp_handle:
+            with open(os.path.join(self.download_dir,TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
                 timestamp_handle.write(self.time_stamp.__str__())
         except Exception as exception:
             print("Failed to write timestamp file - {}".format(exception))
@@ -628,11 +664,11 @@ class Thing:
         logging.debug("Compressing {} to {}".format(
             self.title,
             file_name))
-        #with libarchive.file_writer(filename, 'lzma', '7z') as archive:
         with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
-        #with py7zr.SevenZipFile(file_name, 'w' ) as archive:
             archive.writeall(self.download_dir, thing_dir)
         logging.debug("Compression of {} finished.".format(self.title))
+        shutil.rmtree(self.download_dir)
+        logging.debug("Removed temporary download dir of {}.".format(self.title))
         return State.OK
 
 
-- 
2.20.1