add thing downloading

[clinton/thingy_grabber.git] / thingy_grabber.py
diff --git a/thingy_grabber.py b/thingy_grabber.py

index e9bdebb..570a1df 100755 (executable)
--- a/thingy_grabber.py
+++ b/thingy_grabber.py
@@ -4,6 +4,7 @@ Thingiverse bulk downloader
  """
  
  import re
+import sys
  import os
  import argparse
  import unicodedata
@@ -18,6 +19,13 @@ TOTAL_REGEX = re.compile(r'"total":(\d*),')
  LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
  # This appears to be fixed at 12, but if it changes would screw the rest up.
  PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
+NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
+
+VERBOSE = False
+
+def strip_ws(value):
+    """ Replace each run of whitespace and/or hyphens in a string with a single hyphen """
+    return str(NO_WHITESPACE_REGEX.sub('-', value))
  
  def slugify(value):
      """
@@ -26,7 +34,8 @@ def slugify(value):
      """
      value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
      value = str(re.sub(r'[^\w\s-]', '', value).strip())
-    value = str(re.sub(r'[-\s]+', '-', value))
+    value = str(NO_WHITESPACE_REGEX.sub('-', value))
+    #value = str(re.sub(r'[-\s]+', '-', value))
      return value
  
  class Collection:
@@ -55,7 +64,9 @@ class Collection:
              return self.things
  
          # Get the internal details of the collection.
-        c_url = "{}/{}/collections/{}".format(URL_BASE, self.user, self.name)
+        c_url = "{}/{}/collections/{}".format(URL_BASE, self.user, strip_ws(self.name))
+        if VERBOSE:
+            print("Querying {}".format(c_url))
          c_req = requests.get(c_url)
          total = TOTAL_REGEX.search(c_req.text)
          if total is None:
@@ -107,30 +118,83 @@ def download_thing(thing):
      try:
          os.mkdir(title)
      except FileExistsError:
-        print("Directory for {} ({}) already exists, skipping".format(thing, title))
-        return
+        pass
+
      print("Downloading {} ({})".format(thing, title))
      os.chdir(title)
+    last_time = None
+
+    try:
+        with open('timestamp.txt', 'r') as timestamp_handle:
+            last_time = timestamp_handle.readlines()[0]
+        if VERBOSE:
+            print("last downloaded version: {}".format(last_time))
+    except FileNotFoundError:
+        # Not run on this thing before.
+        if VERBOSE:
+            print('Directory for thing already exists, checking for update.')
+        last_time = None
  
      file_links = file_soup.find_all('a', {'class':'file-download'})
-    files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in file_links]
+    new_last_time = last_time
+    new_file_links = []
+
+    for file_link in file_links:
+        timestamp = file_link.find_all('time')[0]['datetime']
+        if VERBOSE:
+            print("Checking {} (updated {})".format(file_link["title"], timestamp))
+        if not last_time or timestamp > last_time:
+            new_file_links.append(file_link)
+        if not new_last_time or timestamp > new_last_time:
+            new_last_time = timestamp
+
+    if last_time and new_last_time <= last_time:
+        print("Thing already downloaded. Skipping.")
+    files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]
+
+    try:
+        for url, name in files:
+            if VERBOSE:
+                print("Downloading {} from {}".format(name, url))
+            data_req = requests.get(url)
+            with open(name, 'wb') as handle:
+                handle.write(data_req.content)
+        # now write timestamp
+        with open('timestamp.txt', 'w') as timestamp_handle:
+            timestamp_handle.write(new_last_time)
+    except Exception as exception:
+        print("Failed to download {} - {}".format(name, exception))
+        os.chdir(base_dir)
+        os.rename(title, "{}_failed".format(title))
+        return
+
  
-    for url, name in files:
-        data_req = requests.get(url)
-        with open(name, 'wb') as handle:
-            handle.write(data_req.content)
      os.chdir(base_dir)
  
  def main():
      """ Entry point for script being run as a command. """
      parser = argparse.ArgumentParser()
-    parser.add_argument("owner", help="The owner of the collection to get")
-    parser.add_argument("collection", help="The name of the collection to get")
+    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
+    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")
+    collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
+    collection_parser.add_argument("owner", help="The owner of the collection to get")
+    collection_parser.add_argument("collection", help="The name of the collection to get")
+    thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
+    thing_parser.add_argument("thing", help="Thing ID to download")
+
      args = parser.parse_args()
+    if not args.subcommand:
+        parser.print_help()
+        sys.exit(1)
+    global VERBOSE
+    VERBOSE = args.verbose
+    if args.subcommand.startswith("collection"):
+        collection = Collection(args.owner, args.collection)
+        print(collection.get_collection())
+        collection.download()
+    if args.subcommand == "thing":
+        download_thing(args.thing)
  
-    collection = Collection(args.owner, args.collection)
-    print(collection.get_collection())
-    collection.download()
  
  if __name__ == "__main__":
      main()