Move old downloads sideways v0.2
author Oliver Matthews <oliver@codersoffortune.net>
Tue, 26 Nov 2019 16:21:07 +0000 (16:21 +0000)
committer Oliver Matthews <oliver@codersoffortune.net>
Tue, 26 Nov 2019 16:21:07 +0000 (16:21 +0000)
README.md
thingy_grabber.py

index f6d7d90..58d525a 100644 (file)
--- a/README.md
+++ b/README.md
@@ -44,11 +44,10 @@ python3, beautifulsoup4, requests, lxml
 ## Current features:
 - can download an entire collection, creating separate subdirs for each thing in the collection
 - If you run it again with the same settings, it will check for updated files and only update what has changed. This should make it suitable for syncing a collection on a cron job
-CAVEAT: This script will *not delete files*. So if there has been an update and some files have been moved or renamed, they will be mixed in with the old stuff.
+- If a thing has updated files, the old directory will be moved to `name_timestamp`, where `timestamp` is the last upload time of the old files. The code then copies unchanged files across and downloads any new ones.
 
 
 ## Todo features (maybe):
 - less perfunctory error checking / handling
 - attempt to use -failed dirs for resuming
 - pull down images as well
-- handle old/deleted files on update
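For illustration, a minimal sketch of the sideways move the new README entry describes, using a made-up directory name and timestamp; the real logic lives in `Thing.download()` below:

```python
import os

# Made-up thing directory and last-upload timestamp, purely illustrative.
download_dir = "alien-skull"
last_time = "2019-11-20T10:15:00+00:00"
os.mkdir(download_dir)  # stand-in for a previous download

# The stale copy is moved sideways to name_timestamp before re-downloading;
# unchanged files are then copied across and only new files are fetched.
prev_dir = "{}_{}".format(download_dir, last_time)
os.rename(download_dir, prev_dir)
print(prev_dir)  # alien-skull_2019-11-20T10:15:00+00:00
```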
index 6f9919c..896e4d6 100755 (executable)
--- a/thingy_grabber.py
+++ b/thingy_grabber.py
@@ -9,6 +9,7 @@ import os
 import argparse
 import unicodedata
 import requests
+from shutil import copyfile
 from bs4 import BeautifulSoup
 
 URL_BASE = "https://www.thingiverse.com"
@@ -39,7 +40,10 @@ def slugify(value):
     return value
 
 class Grouping:
-    """ Holds details of a group of things. """
+    """ Holds details of a group of things.
+        This is effectively (although not actually) an abstract class
+        - use Collection or Designs instead.
+    """
     def __init__(self):
         self.things = []
         self.total = 0
@@ -108,11 +112,10 @@ class Grouping:
         try:
             os.mkdir(self.download_dir)
         except FileExistsError:
-            print("Target directory {} already exists. Assuming a resume.".format(self.download_dir))
-        os.chdir(self.download_dir)
+            print("Target directory {} already exists. Assuming a resume."
+                  .format(self.download_dir))
         for thing in self.things:
-            download_thing(thing)
-        os.chdir(base_dir)
+            Thing(thing).download(self.download_dir)
 
 class Collection(Grouping):
     """ Holds details of a collection. """
@@ -120,8 +123,10 @@ class Collection(Grouping):
         Grouping.__init__(self)
         self.user = user
         self.name = name
-        self.url = "{}/{}/collections/{}".format(URL_BASE, self.user, strip_ws(self.name))
-        self.download_dir = os.path.join(os.getcwd(), "{}-{}".format(slugify(self.user), slugify(self.name)))
+        self.url = "{}/{}/collections/{}".format(
+            URL_BASE, self.user, strip_ws(self.name))
+        self.download_dir = os.path.join(os.getcwd(),
+                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
 
 class Designs(Grouping):
     """ Holds details of all of a users' designs. """
@@ -131,69 +136,167 @@ class Designs(Grouping):
         self.url = "{}/{}/designs".format(URL_BASE, self.user)
         self.download_dir = os.path.join(os.getcwd(), "{} designs".format(slugify(self.user)))
 
-def download_thing(thing):
-    """ Downloads all the files for a given thing. """
-    file_url = "{}/thing:{}/files".format(URL_BASE, thing)
-    file_req = requests.get(file_url)
-    file_soup = BeautifulSoup(file_req.text, features='lxml')
-
-    title = slugify(file_soup.find_all('h1')[0].text.strip())
-    base_dir = os.getcwd()
-    try:
-        os.mkdir(title)
-    except FileExistsError:
-        pass
-
-    print("Downloading {} ({})".format(thing, title))
-    os.chdir(title)
-    last_time = None
-
-    try:
-        with open('timestamp.txt', 'r') as timestamp_handle:
-            last_time = timestamp_handle.readlines()[0]
-        if VERBOSE:
-            print("last downloaded version: {}".format(last_time))
-    except FileNotFoundError:
-        # Not run on this thing before.
-        if VERBOSE:
-            print('Directory for thing already exists, checking for update.')
-        last_time = None
+class Thing:
+    """ An individual design on thingiverse. """
+    def __init__(self, thing_id):
+        self.thing_id = thing_id
+        self.last_time = None
+        self._parsed = False
+        self._needs_download = True
+        self.text = None
+        self.title = None
+        self.download_dir = None
 
-    file_links = file_soup.find_all('a', {'class':'file-download'})
-    new_last_time = last_time
-    new_file_links = []
+    def _parse(self, base_dir):
+        """ Work out what, if anything needs to be done. """
+        if self._parsed:
+            return
 
-    for file_link in file_links:
-        timestamp = file_link.find_all('time')[0]['datetime']
-        if VERBOSE:
-            print("Checking {} (updated {})".format(file_link["title"], timestamp))
-        if not last_time or timestamp > last_time:
-            new_file_links.append(file_link)
-        if not new_last_time or timestamp > new_last_time:
-            new_last_time = timestamp
-
-    if last_time and new_last_time <= last_time:
-        print("Thing already downloaded. Skipping.")
-    files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]
-
-    try:
-        for url, name in files:
+        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
+        req = requests.get(url)
+        self.text = req.text
+        soup = BeautifulSoup(self.text, features='lxml')
+
+        self.title = slugify(soup.find_all('h1')[0].text.strip())
+        self.download_dir = os.path.join(base_dir, self.title)
+
+        if not os.path.exists(self.download_dir):
+            # Not yet downloaded
+            self._parsed = True
+            return
+
+        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
+        if not os.path.exists(timestamp_file):
+            # Old download from before
+            if VERBOSE:
+                print("Old-style download directory found. Assuming update required.")
+            self._parsed = True
+            return
+
+        try:
+            with open(timestamp_file, 'r') as timestamp_handle:
+                self.last_time = timestamp_handle.readlines()[0]
+            if VERBOSE:
+                print("last downloaded version: {}".format(self.last_time))
+        except FileNotFoundError:
+            # The timestamp file vanished between the existence check and the read.
+            if VERBOSE:
+                print("Timestamp file unreadable. Assuming update required.")
+            self.last_time = None
+            self._parsed = True
+            return
+
+        # OK, so we have a timestamp; let's see if there is anything new to get.
+        file_links = soup.find_all('a', {'class':'file-download'})
+        for file_link in file_links:
+            timestamp = file_link.find_all('time')[0]['datetime']
             if VERBOSE:
-                print("Downloading {} from {}".format(name, url))
-            data_req = requests.get(url)
-            with open(name, 'wb') as handle:
-                handle.write(data_req.content)
-        # now write timestamp
-        with open('timestamp.txt', 'w') as timestamp_handle:
-            timestamp_handle.write(new_last_time)
-    except Exception as exception:
-        print("Failed to download {} - {}".format(name, exception))
-        os.chdir(base_dir)
-        os.rename(title, "{}_failed".format(title))
-        return
-
-
-    os.chdir(base_dir)
+                print("Checking {} (updated {})".format(file_link["title"], timestamp))
+            if timestamp > self.last_time:
+                print("Found new/updated file {}".format(file_link["title"]))
+                self._needs_download = True
+                self._parsed = True
+                return
+        # Got here, so nope, no new files.
+        print("Found no new files for {}".format(self.title))
+        self._needs_download = False
+        self._parsed = True
+
+    def download(self, base_dir):
+        """ Download all files for a given thing. """
+        if not self._parsed:
+            self._parse(base_dir)
+
+        if not self._needs_download:
+            if VERBOSE:
+                print("{} already downloaded - skipping.".format(self.title))
+            return
+
+        # Have we already downloaded some things?
+        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
+        prev_dir = None
+        if os.path.exists(self.download_dir):
+            if not os.path.exists(timestamp_file):
+                # Edge case: old-style dir without a timestamp.
+                print("Old-style download dir found for {}".format(self.title))
+                os.rename(self.download_dir, "{}_old".format(self.download_dir))
+            else:
+                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
+                os.rename(self.download_dir, prev_dir)
+
+        # Get the list of files to download
+        soup = BeautifulSoup(self.text, features='lxml')
+        file_links = soup.find_all('a', {'class':'file-download'})
+
+        new_file_links = []
+        old_file_links = []
+        new_last_time = None
+
+        if not self.last_time:
+            # If we don't have anything to copy from, then it is all new.
+            new_file_links = file_links
+            new_last_time = file_links[0].find_all('time')[0]['datetime']
+            for file_link in file_links:
+                timestamp = file_link.find_all('time')[0]['datetime']
+                if VERBOSE:
+                    print("Found file {} from {}".format(file_link["title"], timestamp))
+                if timestamp > new_last_time:
+                    new_last_time = timestamp
+        else:
+            for file_link in file_links:
+                timestamp = file_link.find_all('time')[0]['datetime']
+                if VERBOSE:
+                    print("Checking {} (updated {})".format(file_link["title"], timestamp))
+                if timestamp > self.last_time:
+                    new_file_links.append(file_link)
+                else:
+                    old_file_links.append(file_link)
+                if not new_last_time or timestamp > new_last_time:
+                    new_last_time = timestamp
+
+        if VERBOSE:
+            print("new timestamp {}".format(new_last_time))
+
+        # OK. Time to get to work.
+        os.mkdir(self.download_dir)
+        # First grab the cached files (if any)
+        for file_link in old_file_links:
+            old_file = os.path.join(prev_dir, file_link["title"])
+            new_file = os.path.join(self.download_dir, file_link["title"])
+            try:
+                if VERBOSE:
+                    print("Copying {} to {}".format(old_file, new_file))
+                copyfile(old_file, new_file)
+            except FileNotFoundError:
+                print("Unable to find {} in old archive, redownloading".format(file_link["title"]))
+                new_file_links.append(file_link)
+
+        # Now download the new ones
+        files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]
+        try:
+            for url, name in files:
+                file_name = os.path.join(self.download_dir, name)
+                if VERBOSE:
+                    print("Downloading {} from {} to {}".format(name, url, file_name))
+                data_req = requests.get(url)
+                with open(file_name, 'wb') as handle:
+                    handle.write(data_req.content)
+        except Exception as exception:
+            print("Failed to download {} - {}".format(name, exception))
+            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
+            return
+
+        try:
+            # Now write the timestamp
+            with open(timestamp_file, 'w') as timestamp_handle:
+                timestamp_handle.write(new_last_time)
+        except Exception as exception:
+            print("Failed to write timestamp file - {}".format(exception))
+            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
+            return
+        self._needs_download = False
+        if VERBOSE:
+            print("Download of {} finished".format(self.title))
 
 def main():
     """ Entry point for script being run as a command. """
@@ -219,7 +322,7 @@ def main():
         print(collection.get())
         collection.download()
     if args.subcommand == "thing":
-        download_thing(args.thing)
+        Thing(args.thing).download(os.getcwd())
     if args.subcommand == "user":
         designs = Designs(args.user)
         print(designs.get())
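Finally, hypothetical programmatic equivalents of the subcommands wired up in `main()`, assuming `thingy_grabber.py`'s namespace (the thing ID and user name are made up):

```python
import os

Thing("1234567").download(os.getcwd())  # the "thing" subcommand

designs = Designs("bobsmith")           # the "user" subcommand
print(designs.get())                    # fetch the list of things first
designs.download()
```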