From 65bd8b43699246737f2d85d882acc43c9f48e41e Mon Sep 17 00:00:00 2001 From: Oliver Matthews Date: Fri, 17 Apr 2020 13:55:54 +0100 Subject: [PATCH] Couple of minor filename handling fixes for windows - resolves #10, resolves #11 --- README.md | 4 ++++ thingy_grabber.py | 50 +++++++++++++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index d4a81c6..eec2397 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,10 @@ python3, beautifulsoup4, requests, lxml - If there is an updated file, the old directory will be moved to `name_timestamp` where `timestamp` is the last upload time of the old files. The code will then copy unchanged files across and download any new ones. ## Changelog +* v0.8.5 + - Strip '.'s from the end of filenames + - If you fail a download for an already failed download it no longer throws an exception + - Truncates paths that are too long for windows * v0.8.4 - Just use unicode filenames - puts the unicode characters back in! - Force selenium to shutdown firefox on assert and normal exit diff --git a/thingy_grabber.py b/thingy_grabber.py index 8a9780c..e9aeb06 100755 --- a/thingy_grabber.py +++ b/thingy_grabber.py @@ -38,7 +38,9 @@ NO_WHITESPACE_REGEX = re.compile(r'[-\s]+') DOWNLOADER_COUNT = 1 RETRY_COUNT = 3 -VERSION = "0.8.4" +MAX_PATH_LENGTH = 250 + +VERSION = "0.8.5" #BROWSER = webdriver.PhantomJS('./phantomjs') @@ -62,19 +64,47 @@ class State(enum.Enum): ALREADY_DOWNLOADED = enum.auto() +def fail_dir(dir_name): + """ When a download has failed, move it sideways. + """ + target_dir = "{}_failed".format(dir_name) + inc = 0 + while os.path.exists(target_dir): + target_dir = "{}_failed_{}".format(dir_name, inc) + inc += 1 + os.rename(dir_name, target_dir) + + +def truncate_name(file_name): + """ Ensure the filename is not too long for, well windows basically. 
+ """ + path = os.path.abspath(file_name) + if len(path) <= MAX_PATH_LENGTH: + return path + to_cut = len(path) - (MAX_PATH_LENGTH + 3) + base, extension = os.path.splitext(path) + inc = 0 + new_path = "{}_{}{}".format(base, inc, extension) + while os.path.exists(new_path): + new_path = "{}_{}{}".format(base, inc, extension) + inc += 1 + return new_path + + def strip_ws(value): """ Remove whitespace from a string """ return str(NO_WHITESPACE_REGEX.sub('-', value)) - def slugify(value): """ Normalise string, removes invalid for filename charactersr and converts string to lowercase. """ value = unicodedata.normalize('NFKC', value).lower().strip() - return re.sub(r'[\\/<>:\?\*\|"]', '', value) + value = re.sub(r'[\\/<>:\?\*\|"]', '', value) + value = re.sub(r'\.*$', '', value) + return value class PageChecker(object): def __init__(self): @@ -464,7 +494,7 @@ class Thing: logging.info("Copying {} unchanged files.".format(len(old_file_links))) for file_link in old_file_links: old_file = os.path.join(prev_dir, file_link.name) - new_file = os.path.join(self.download_dir, file_link.name) + new_file = truncate_name(os.path.join(self.download_dir, file_link.name)) try: logging.debug("Copying {} to {}".format(old_file, new_file)) copyfile(old_file, new_file) @@ -478,7 +508,7 @@ class Thing: len(new_file_links), len(self._file_links))) try: for file_link in new_file_links: - file_name = os.path.join(self.download_dir, file_link.name) + file_name = truncate_name(os.path.join(self.download_dir, file_link.name)) logging.debug("Downloading {} from {} to {}".format( file_link.name, file_link.link, file_name)) data_req = requests.get(file_link.link) @@ -486,7 +516,7 @@ class Thing: handle.write(data_req.content) except Exception as exception: logging.error("Failed to download {} - {}".format(file_link.name, exception)) - os.rename(self.download_dir, "{}_failed".format(self.download_dir)) + fail_dir(self.download_dir) return State.FAILED @@ -500,11 +530,11 @@ class Thing: if 
filename.endswith('stl'): filename = "{}.png".format(filename) image_req = requests.get(imagelink) - with open(os.path.join(image_dir, filename), 'wb') as handle: + with open(truncate_name(os.path.join(image_dir, filename)), 'wb') as handle: handle.write(image_req.content) except Exception as exception: print("Failed to download {} - {}".format(filename, exception)) - os.rename(self.download_dir, "{}_failed".format(self.download_dir)) + fail_dir(self.download_dir) return State.FAILED """ @@ -525,7 +555,7 @@ class Thing: logging.info("Downloading license") try: if self._license: - with open(os.path.join(self.download_dir, 'license.txt'), 'w', encoding="utf-8") as license_handle: + with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w', encoding="utf-8") as license_handle: license_handle.write("{}\n".format(self._license)) except IOError as exception: logging.warning("Failed to write license! {}".format(exception)) @@ -536,7 +566,7 @@ class Thing: timestamp_handle.write(new_last_time.__str__()) except Exception as exception: print("Failed to write timestamp file - {}".format(exception)) - os.rename(self.download_dir, "{}_failed".format(self.download_dir)) + fail_dir(self.download_dir) return State.FAILED self._needs_download = False logging.debug("Download of {} finished".format(self.title)) -- 2.20.1