Remove unicode characters from filenames - fixes #6
#!/usr/bin/env python3
"""
Thingiverse bulk downloader
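
Example invocations (illustrative; the thing ID and usernames below are
placeholders - see the argparse setup in main() for the full option set):

    thingy_grabber.py thing 4733281
    thingy_grabber.py user some_user
    thingy_grabber.py collection some_user some-collection
    thingy_grabber.py -q batch batch_file.txt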
4 """
5
6 import re
7 import sys
8 import os
9 import argparse
10 import unicodedata
11 import requests
12 import logging
13 import multiprocessing
14 import enum
15 import datetime
16 from shutil import copyfile
17 from bs4 import BeautifulSoup
18 from dataclasses import dataclass
19 import selenium
20 from selenium import webdriver
21 from selenium.webdriver.common.by import By
22 from selenium.webdriver.support.ui import WebDriverWait
23 from selenium.webdriver.support import expected_conditions as EC
24 from selenium.webdriver.firefox.options import Options
25
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12; if it ever changes it would break the
# pagination handling below.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
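
# These regexes pull values out of the JSON blob embedded in grouping pages;
# e.g. a fragment like '"total":42,"last_page":4,' (illustrative values)
# would yield 42 and 4 via TOTAL_REGEX and LAST_PAGE_REGEX.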

DOWNLOADER_COUNT = 1
RETRY_COUNT = 3

VERSION = "0.8.2"


# BROWSER = webdriver.PhantomJS('./phantomjs')
options = Options()
options.add_argument("--headless")
BROWSER = webdriver.Firefox(options=options)

BROWSER.set_window_size(1980, 1080)


@dataclass
class FileLink:
    name: str
    last_update: datetime.datetime
    link: str
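
# Illustrative FileLink instance (placeholder values, not real Thingiverse data):
#   FileLink("some_part.stl", datetime.datetime(2019, 6, 11),
#            "https://www.thingiverse.com/download:123")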


class State(enum.Enum):
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()


def strip_ws(value):
    """ Collapse runs of whitespace (and hyphens) into single hyphens """
    return str(NO_WHITESPACE_REGEX.sub('-', value))


def strip_invalid_chars(value):
    """
    Normalizes the string and strips out any characters that will not
    encode to ASCII (e.g. accented or other unicode characters).
    """
    return unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()


def slugify(value):
    """
    Normalizes the string, strips invalid and non-ASCII characters,
    and converts runs of whitespace to hyphens.
    """
    value = strip_invalid_chars(value)
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = strip_ws(value)
    return value

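# A worked example of the slugify() pipeline (illustrative input):
#   "Überhang Test (v2)" -> strip_invalid_chars -> "Uberhang Test (v2)"
#   -> drop punctuation -> "Uberhang Test v2" -> strip_ws -> "Uberhang-Test-v2"
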
class PageChecker(object):
    def __init__(self):
        self.log = []
        self.title = None
        self.file_count = None
        self.files = None
        self.images = None
        self.license = None

    def __call__(self, _):
        try:
            self.log.append("call")
            if self.title is None:
                # First find the name.
                name = EC._find_element(
                    BROWSER, (By.CSS_SELECTOR, "[class^=ThingPage__modelName]"))
                if name is None:
                    return False
                self.title = name.text

            if self.file_count is None:
                # OK. Do we know how many files we have to download?
                metrics = EC._find_elements(
                    BROWSER, (By.CSS_SELECTOR, "[class^=MetricButton]"))
                self.log.append("got some metrics: {}".format(len(metrics)))
                cur_count = int([x.text.split("\n")[0]
                                 for x in metrics
                                 if x.text.endswith("\nThing Files")][0])
                self.log.append(cur_count)
                if cur_count == 0:
                    return False
                self.file_count = cur_count

            self.log.append("looking for {} files".format(self.file_count))
            file_rows = EC._find_elements(
                BROWSER, (By.CSS_SELECTOR, "[class^=ThingFile__fileRow]"))
            self.log.append("found {} files".format(len(file_rows)))
            if len(file_rows) < self.file_count:
                return False

            self.log.append("Looking for images")
            # By this point we _should_ have loaded all the images.
            self.images = EC._find_elements(
                BROWSER, (By.CSS_SELECTOR, "[class^=thumb]"))
            self.license = EC._find_element(
                BROWSER, (By.CSS_SELECTOR, "[class^=License__licenseText]")).text
            self.log.append("found {} images".format(len(self.images)))
            self.files = file_rows
            return True
        except Exception:
            # Returning False just tells the WebDriverWait to keep polling.
            return False


class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """

    def __init__(self, thing_queue, download_directory):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory

    def run(self):
        """ Actual download loop. """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                logging.info("Shutting down downloader")
                self.thing_queue.task_done()
                break
            logging.info("Handling id {}".format(thing_id))
            Thing(thing_id).download(self.download_directory)
            self.thing_queue.task_done()
        return


class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self, quick):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13 item) grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            rc = Thing(thing).download(self.download_dir)
            if self.quick and rc == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return


class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(
            directory, "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION


class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION


class Thing:
    """ An individual design on Thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        # Create the checker up front so the timeout handler below can always
        # read its log.
        pc = PageChecker()
        try:
            BROWSER.get(url)
            wait = WebDriverWait(BROWSER, 60)
            wait.until(pc)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        except selenium.common.exceptions.TimeoutException:
            logging.error(pc.log)
            logging.error("Timeout trying to parse thing {}".format(self.thing_id))
            return

        self.title = pc.title
        self._file_links = []
        for link in pc.files:
            logging.debug("Parsing link: {}".format(link.text))
            link_link = link.find_element_by_xpath(".//a").get_attribute("href")
            if link_link.endswith("/zip"):
                # Bulk download link - skip it.
                continue
            try:
                link_title, link_details, _ = link.text.split("\n")
            except ValueError:
                # If it is a filetype that doesn't generate a picture,
                # then we get an extra field at the start.
                _, link_title, link_details, _ = link.text.split("\n")

            # link_details will be something like
            # '461 kb | Updated 06-11-2019 | 373 Downloads'
            # and the date needs converting from M-D-Y to Y-M-D.
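            # Worked example of the conversion below (illustrative row):
            #   "06-11-2019" -> [6, 11, 2019] -> datetime.datetime(2019, 6, 11)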
            link_date = [int(x) for x in
                         link_details.split("|")[1].split()[-1].split("-")]
            try:
                self._file_links.append(FileLink(
                    strip_invalid_chars(link_title),
                    datetime.datetime(link_date[2], link_date[0], link_date[1]),
                    link_link))
            except ValueError:
                logging.error("Unparsable date in link details: {}".format(link_date))

        self._image_links = [x.find_element_by_xpath(".//img").get_attribute("src")
                             for x in pc.images]
        self._license = pc.license
        self.pc = pc

        self.old_download_dir = os.path.join(base_dir, slugify(self.title))
        self.download_dir = os.path.join(
            base_dir, "{} - {}".format(self.thing_id, slugify(self.title)))

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            logging.info("Looking for old dir at {}".format(self.old_download_dir))
            if os.path.exists(self.old_download_dir):
                logging.warning(
                    "Found previous style download directory. Moving it from {} to {}".format(
                        self.old_download_dir, self.download_dir))
                os.rename(self.old_download_dir, self.download_dir)
            else:
                # Not yet downloaded
                self._parsed = True
                return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were written.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                # The .split(' ')[0] drops the time-of-day that old-style
                # timestamps carried.
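                # Illustrative timestamp contents:
                #   "2019-06-11 00:00:00" -> "2019-06-11" -> [2019, 6, 11]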
                last_bits = [int(x) for x in
                             timestamp_handle.readlines()[0].split(' ')[0].split("-")]
                logging.debug("Parsed timestamp bits: {}".format(last_bits))
                try:
                    self.last_time = datetime.datetime(
                        last_bits[0], last_bits[1], last_bits[2])
                except ValueError:
                    # This one appears to be M-D-Y.
                    self.last_time = datetime.datetime(
                        last_bits[2], last_bits[0], last_bits[1])

            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._needs_download = True
            self._parsed = True
            return

        # OK, so we have a timestamp; let's see if there is anything new to get.
        for file_link in self._file_links:
            if file_link.last_update > self.last_time:
                logging.info(
                    "Found new/updated file {} - {}".format(
                        file_link.name, file_link.last_update))
                self._needs_download = True
                self._parsed = True
                return

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing.
            Returns State.OK if the files were fetched, State.ALREADY_DOWNLOADED
            if nothing new was needed, and State.FAILED on error.
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            print("{} - {} already downloaded - skipping.".format(
                self.thing_id, self.title))
            return State.ALREADY_DOWNLOADED

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # Edge case: old style dir without timestamp.
                logging.warning(
                    "Old style download dir found at {}".format(self.download_dir))
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                prev_dir = "{}_{}".format(
                    self.download_dir, slugify(str(self.last_time)))
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            new_last_time = new_file_links[0].last_update

            for file_link in new_file_links:
                new_last_time = max(new_last_time, file_link.last_update)
            logging.debug("New timestamp will be {}".format(new_last_time))
        else:
            new_last_time = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    new_last_time = max(new_last_time, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w') as fl_handle:
            for fl in self._file_links:
                base_link = fl.link
                try:
                    fl.link = requests.get(
                        fl.link, allow_redirects=False).headers['location']
                except Exception:
                    # Sometimes Thingiverse just gives us the direct link
                    # the first time. Not sure why.
                    pass

                fl_handle.write("{},{},{}, {}\n".format(
                    fl.link, fl.name, fl.last_update, base_link))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link.name)
            new_file = os.path.join(self.download_dir, file_link.name)
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(
                        file_link.name))
                new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = os.path.join(self.download_dir, file_link.name)
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = requests.get(file_link.link)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(
                file_link.name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED

        # People like images. But this doesn't work yet.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.basename(imagelink)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(imagelink)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED

518 """
519 # instructions are good too.
520 logging.info("Downloading readme")
521 try:
522 readme_txt = soup.find('meta', property='og:description')[
523 'content']
524 with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
525 readme_handle.write("{}\n".format(readme_txt))
526 except (TypeError, KeyError) as exception:
527 logging.warning("No readme? {}".format(exception))
528 except IOError as exception:
529 logging.warning("Failed to write readme! {}".format(exception))
530
531 """
        # Best get some licenses
        logging.info("Downloading license")
        try:
            if self._license:
                with open(os.path.join(self.download_dir, 'license.txt'), 'w') as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(str(new_last_time))
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
        return State.OK


def do_batch(batch_file, download_dir, quick):
    """ Read a file line by line, parsing each as a set of calls to this script. """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")


def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space separated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory)
                   for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection,
                       args.directory, args.quick).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick)

    # Stop the downloader processes
    for downloader in downloaders:
        thing_queue.put(None)


if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()