"""
import re
+import sys
import os
import argparse
import unicodedata
# Captures the digits that follow a `"last_page":` key, up to the next comma.
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# Captures the digits that follow a `"per_page":` key, up to the next comma.
# The value appears to be fixed at 12; if it ever changes, the code that
# relies on it will break.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
# Matches a run of one or more hyphens and/or whitespace characters.
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# Module-wide switch for extra diagnostic output; main() sets it from -v/--verbose.
VERBOSE = False
+
def strip_ws(value):
    """Collapse each run of whitespace and/or hyphens in *value* to one hyphen.

    Despite the name, nothing is deleted outright: every match of
    NO_WHITESPACE_REGEX is replaced with a single '-'.

    value -- the string to process.
    Returns the resulting string.
    """
    # Pattern.sub already returns a str for str input, so no str() wrapper.
    return NO_WHITESPACE_REGEX.sub('-', value)
def slugify(value):
    """Reduce *value* to an ASCII, hyphen-separated slug.

    The string is transformed in three steps:
      1. NFKD-normalise it and drop every character that cannot be encoded
         as ASCII.
      2. Delete every character that is not a word character, whitespace or
         a hyphen, then trim surrounding whitespace.
      3. Collapse each remaining run of whitespace/hyphens into a single '-'.

    value -- the string to convert.
    Returns the slug as a string.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = re.sub(r'[^\w\s-]', '', value).strip()
    return NO_WHITESPACE_REGEX.sub('-', value)
class Collection:
return self.things
# Get the internal details of the collection.
- c_url = "{}/{}/collections/{}".format(URL_BASE, self.user, self.name)
+ c_url = "{}/{}/collections/{}".format(URL_BASE, self.user, strip_ws(self.name))
+ if VERBOSE:
+ print("Querying {}".format(c_url))
c_req = requests.get(c_url)
total = TOTAL_REGEX.search(c_req.text)
if total is None:
try:
os.mkdir(title)
except FileExistsError:
- print("Directory for {} ({}) already exists, skipping".format(thing, title))
- return
+ pass
+
print("Downloading {} ({})".format(thing, title))
os.chdir(title)
+ last_time = None
+
+ try:
+ with open('timestamp.txt', 'r') as timestamp_handle:
+ last_time = timestamp_handle.readlines()[0]
+ if VERBOSE:
+ print("last downloaded version: {}".format(last_time))
+ except FileNotFoundError:
+ # Not run on this thing before.
+ if VERBOSE:
+ print('Directory for thing already exists, checking for update.')
+ last_time = None
file_links = file_soup.find_all('a', {'class':'file-download'})
- files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in file_links]
+ new_last_time = last_time
+ new_file_links = []
+
+ for file_link in file_links:
+ timestamp = file_link.find_all('time')[0]['datetime']
+ if VERBOSE:
+ print("Checking {} (updated {})".format(file_link["title"], timestamp))
+ if not last_time or timestamp > last_time:
+ new_file_links.append(file_link)
+ if not new_last_time or timestamp > new_last_time:
+ new_last_time = timestamp
+
+ if last_time and new_last_time <= last_time:
+ print("Thing already downloaded. Skipping.")
+ files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]
+
+ try:
+ for url, name in files:
+ if VERBOSE:
+ print("Downloading {} from {}".format(name, url))
+ data_req = requests.get(url)
+ with open(name, 'wb') as handle:
+ handle.write(data_req.content)
+ # now write timestamp
+ with open('timestamp.txt', 'w') as timestamp_handle:
+ timestamp_handle.write(new_last_time)
+ except Exception as exception:
+ print("Failed to download {} - {}".format(name, exception))
+ os.chdir(base_dir)
+ os.rename(title, "{}_failed".format(title))
+ return
+
- for url, name in files:
- data_req = requests.get(url)
- with open(name, 'wb') as handle:
- handle.write(data_req.content)
os.chdir(base_dir)
def main():
    """ Entry point for script being run as a command.

    Parses the command line, which takes an optional -v/--verbose flag
    followed by one of two subcommands:

      collection OWNER COLLECTION   download an entire collection
      thing THING                   download a single thing by ID

    When no subcommand is given the help text is printed and the process
    exits with status 1. The module-level VERBOSE flag is set from
    --verbose before any download work starts.
    """
    global VERBOSE

    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
    collection_parser.add_argument("owner", help="The owner of the collection to get")
    collection_parser.add_argument("collection", help="The name of the collection to get")
    thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")

    args = parser.parse_args()
    if not args.subcommand:
        # Subcommands are optional as far as argparse is concerned, so a
        # missing one has to be rejected by hand.
        parser.print_help()
        sys.exit(1)

    VERBOSE = args.verbose

    # The subcommands are mutually exclusive; compare both the same way.
    if args.subcommand == "collection":
        collection = Collection(args.owner, args.collection)
        print(collection.get_collection())
        collection.download()
    elif args.subcommand == "thing":
        download_thing(args.thing)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()