Thingiverse bulk downloader
13 import multiprocessing
16 from shutil
import copyfile
17 from bs4
import BeautifulSoup
18 from dataclasses
import dataclass
20 from selenium
import webdriver
21 from selenium
.webdriver
.common
.by
import By
22 from selenium
.webdriver
.support
.ui
import WebDriverWait
23 from selenium
.webdriver
.support
import expected_conditions
as EC
24 from selenium
.webdriver
.firefox
.options
import Options
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

# Thingiverse embeds its pagination state as JSON inside the returned page;
# these regexes pull the individual fields back out of that blob.
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
# Collapses runs of whitespace/hyphens into a single '-' (see strip_ws).
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
#BROWSER = webdriver.PhantomJS('./phantomjs')

# Run Firefox headless so the scrape works without a display.
# NOTE(review): module-level side effect - importing this file launches a
# browser; 1980 looks like a typo for 1920 but is kept as-is.
options = Options()
options.add_argument("--headless")
BROWSER = webdriver.Firefox(options=options)

BROWSER.set_window_size(1980, 1080)
59 link
: datetime
.datetime
class State(enum.Enum):
    """ Outcome of an attempt to download a single thing.

    OK/FAILED reconstructed from usage elsewhere in the file;
    ALREADY_DOWNLOADED is returned by Thing.download for the --quick
    early-stop check in Grouping.download.
    """
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()
def fail_dir(dir_name):
    """ When a download has failed, move it sideways.

    Renames dir_name to "<dir_name>_failed", appending an incrementing
    suffix ("_failed_0", "_failed_1", ...) until an unused name is found,
    so earlier failed attempts are never clobbered.
    """
    target_dir = "{}_failed".format(dir_name)
    inc = 0
    while os.path.exists(target_dir):
        target_dir = "{}_failed_{}".format(dir_name, inc)
        inc += 1
    os.rename(dir_name, target_dir)
def truncate_name(file_name):
    """ Ensure the filename is not too long for, well windows basically.

    Returns the absolute path unchanged when it fits in MAX_PATH_LENGTH;
    otherwise shortens the basename (keeping the extension) and appends an
    incrementing "_N" suffix until an unused name is found.

    Fixes two defects in the original: `to_cut` was computed but never
    applied to the path, and the collision loop recomputed the identical
    candidate without incrementing `inc` (an infinite loop on collision).
    """
    path = os.path.abspath(file_name)
    if len(path) <= MAX_PATH_LENGTH:
        return path
    base, extension = os.path.splitext(path)
    # Leave room for the extension plus a "_N" dedup suffix (3 chars).
    base = base[:MAX_PATH_LENGTH - (len(extension) + 3)]
    inc = 0
    new_path = "{}_{}{}".format(base, inc, extension)
    while os.path.exists(new_path):
        inc += 1
        new_path = "{}_{}{}".format(base, inc, extension)
    return new_path
96 """ Remove whitespace from a string """
97 return str(NO_WHITESPACE_REGEX
.sub('-', value
))
def slugify(value):
    """
    Normalise string, removes invalid for filename characters
    and converts string to lowercase.
    """
    value = unicodedata.normalize('NFKC', value).lower().strip()
    # Strip characters Windows forbids in filenames.
    value = re.sub(r'[\\/<>:\?\*\|"]', '', value)
    # Windows also dislikes trailing dots.
    value = re.sub(r'\.*$', '', value)
    return value
class PageChecker(object):
    """ Selenium wait-callback that scrapes a thing page as it loads.

    Passed to WebDriverWait.until(); returns truthy once the title, the
    expected file count, all file rows, the images and the license text
    have appeared on the page. State is accumulated on the instance for
    the caller (Thing._parse) to read afterwards.

    NOTE(review): uses selenium's private EC._find_element(s) helpers -
    confirm they still exist in the pinned selenium version.
    """

    def __init__(self):
        self.log = []           # trace of what the checker saw, for error reports
        self.title = None
        self.file_count = None
        self.files = None
        self.images = None
        self.license = None

    def __call__(self, _):
        try:
            self.log.append("call")
            if self.title is None:
                # first find the name
                name = EC._find_element(
                    BROWSER, (By.CSS_SELECTOR, "[class^=ThingPage__modelName]"))
                if name is None:
                    return False
                self.title = name.text

            if self.file_count is None:
                # OK. Do we know how many files we have to download?
                metrics = EC._find_elements(
                    BROWSER, (By.CSS_SELECTOR, "[class^=MetricButton]"))
                self.log.append("got some metrics: {}".format(len(metrics)))
                cur_count = int([x.text.split("\n")[0]
                                 for x in metrics if x.text.endswith("\nThing Files")][0])
                self.log.append(cur_count)
                if cur_count == 0:
                    return False
                self.file_count = cur_count

            self.log.append("looking for {} files".format(self.file_count))
            fileRows = EC._find_elements(
                BROWSER, (By.CSS_SELECTOR, "[class^=ThingFile__fileRow]"))
            self.log.append("found {} files".format(len(fileRows)))
            if len(fileRows) < self.file_count:
                # Not all file rows have rendered yet - keep waiting.
                return False

            self.log.append("Looking for images")
            # By this point _should_ have loaded all the images
            self.images = EC._find_elements(
                BROWSER, (By.CSS_SELECTOR, "[class^=thumb]"))
            self.license = EC._find_element(
                BROWSER, (By.CSS_SELECTOR, "[class^=License__licenseText]")).text
            self.log.append("found {} images".format(len(self.images)))
            self.files = fileRows
            return True
        except Exception:
            # Any scrape hiccup just means "not ready yet" to the waiter.
            return False
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.

    Pulls thing ids off a JoinableQueue and downloads each into
    download_directory; a None on the queue is the shutdown sentinel.
    """

    def __init__(self, thing_queue, download_directory):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory

    def run(self):
        """ actual download loop.
        """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                break
            logging.info("Handling id {}".format(thing_id))
            Thing(thing_id).download(self.download_directory)
            self.thing_queue.task_done()
189 """ Holds details of a group of things for download
190 This is effectively (although not actually) an abstract class
191 - use Collection or Designs instead.
194 def __init__(self
, quick
):
200 # Should we stop downloading when we hit a known datestamp?
202 # These should be set by child classes.
204 self
.download_dir
= None
205 self
.collection_url
= None
207 def _get_small_grouping(self
, req
):
208 """ Handle small groupings """
209 soup
= BeautifulSoup(req
.text
, features
='lxml')
210 links
= soup
.find_all('a', {'class': 'card-img-holder'})
211 self
.things
= [x
['href'].split(':')[1] for x
in links
]
212 self
.total
= len(self
.things
)
217 """ retrieve the things of the grouping. """
219 # We've already done it.
222 # Check for initialisation:
224 logging
.error("No URL set - object not initialised properly?")
225 raise ValueError("No URL set - object not initialised properly?")
227 # Get the internal details of the grouping.
228 logging
.debug("Querying {}".format(self
.url
))
229 c_req
= requests
.get(self
.url
)
230 total
= TOTAL_REGEX
.search(c_req
.text
)
232 # This is a small (<13) items grouping. Pull the list from this req.
233 return self
._get
_small
_grouping
(c_req
)
234 self
.total
= total
.groups()[0]
235 self
.req_id
= ID_REGEX
.search(c_req
.text
).groups()[0]
236 self
.last_page
= int(LAST_PAGE_REGEX
.search(c_req
.text
).groups()[0])
237 self
.per_page
= PER_PAGE_REGEX
.search(c_req
.text
).groups()[0]
239 'base_url': self
.url
,
244 for current_page
in range(1, self
.last_page
+ 1):
245 parameters
['page'] = current_page
246 req
= requests
.post(self
.collection_url
, parameters
)
247 soup
= BeautifulSoup(req
.text
, features
='lxml')
248 links
= soup
.find_all('a', {'class': 'card-img-holder'})
249 self
.things
+= [x
['href'].split(':')[1] for x
in links
]
254 """ Downloads all the files in a collection """
258 if not self
.download_dir
:
260 "No download_dir set - invalidly initialised object?")
262 base_dir
= os
.getcwd()
264 os
.mkdir(self
.download_dir
)
265 except FileExistsError
:
266 logging
.info("Target directory {} already exists. Assuming a resume."
267 .format(self
.download_dir
))
268 logging
.info("Downloading {} thing(s).".format(self
.total
))
269 for idx
, thing
in enumerate(self
.things
):
270 logging
.info("Downloading thing {} - {}".format(idx
, thing
))
271 RC
= Thing(thing
).download(self
.download_dir
)
272 if self
.quick
and RC
==State
.ALREADY_DOWNLOADED
:
273 logging
.info("Caught up, stopping.")
class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.name = name
        # Collection names keep their (hyphenated) whitespace in the URL.
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION
304 """ An individual design on thingiverse. """
def __init__(self, thing_id):
    """ Record the thing id; all real work happens lazily in _parse().

    Attribute names for the three missing source lines (_parsed, text,
    title) are reconstructed from their uses later in the class - TODO
    confirm against the original file.
    """
    self.thing_id = thing_id
    self.last_time = None
    self._parsed = False
    self._needs_download = True
    self.text = None
    self.title = None
    self.download_dir = None
315 def _parse(self
, base_dir
):
316 """ Work out what, if anything needs to be done. """
320 url
= "{}/thing:{}/files".format(URL_BASE
, self
.thing_id
)
323 wait
= WebDriverWait(BROWSER
, 60)
326 except requests
.exceptions
.ConnectionError
as error
:
327 logging
.error("Unable to connect for thing {}: {}".format(
328 self
.thing_id
, error
))
330 except selenium
.common
.exceptions
.TimeoutException
:
331 logging
.error(pc
.log
)
332 logging
.error("Timeout trying to parse thing {}".format(self
.thing_id
))
335 self
.title
= pc
.title
338 logging
.error("No files found for thing {} - probably thingiverse being broken, try again later".format(self
.thing_id
))
339 for link
in pc
.files
:
340 logging
.debug("Parsing link: {}".format(link
.text
))
341 link_link
= link
.find_element_by_xpath(".//a").get_attribute("href")
342 if link_link
.endswith("/zip"):
346 link_title
, link_details
, _
= link
.text
.split("\n")
348 # If it is a filetype that doesn't generate a picture, then we get an extra field at the start.
349 _
, link_title
, link_details
, _
= link
.text
.split("\n")
351 #link_details will be something like '461 kb | Updated 06-11-2019 | 373 Downloads'
352 #need to convert from M D Y to Y M D
353 link_date
= [int(x
) for x
in link_details
.split("|")[1].split()[-1].split("-")]
355 self
._file
_links
.append(FileLink(link_title
, datetime
.datetime(link_date
[2], link_date
[0], link_date
[1]), link_link
))
357 logging
.error(link_date
)
359 self
._image
_links
=[x
.find_element_by_xpath(".//img").get_attribute("src") for x
in pc
.images
]
360 self
._license
= pc
.license
364 self
.old_download_dir
= os
.path
.join(base_dir
, slugify(self
.title
))
365 self
.download_dir
= os
.path
.join(base_dir
, "{} - {}".format(self
.thing_id
, slugify(self
.title
)))
367 logging
.debug("Parsing {} ({})".format(self
.thing_id
, self
.title
))
369 if not os
.path
.exists(self
.download_dir
):
370 logging
.info("Looking for old dir at {}".format(self
.old_download_dir
))
371 if os
.path
.exists(self
.old_download_dir
):
372 logging
.warning("Found previous style download directory. Moving it from {} to {}".format(self
.old_download_dir
, self
.download_dir
))
373 os
.rename(self
.old_download_dir
, self
.download_dir
)
379 timestamp_file
= os
.path
.join(self
.download_dir
, 'timestamp.txt')
380 if not os
.path
.exists(timestamp_file
):
381 # Old download from before
383 "Old-style download directory found. Assuming update required.")
388 with
open(timestamp_file
, 'r') as timestamp_handle
:
389 # add the .split(' ')[0] to remove the timestamp from the old style timestamps
390 last_bits
= [int(x
) for x
in timestamp_handle
.readlines()[0].split(' ')[0].split("-")]
391 logging
.warning(last_bits
)
392 if last_bits
[0] == 0:
394 if last_bits
[1] == 0:
396 if last_bits
[2] == 0:
399 self
.last_time
= datetime
.datetime(last_bits
[0], last_bits
[1], last_bits
[2])
401 # This one appears to be M D Y
402 self
.last_time
= datetime
.datetime(last_bits
[2], last_bits
[0], last_bits
[1])
404 logging
.info("last downloaded version: {}".format(self
.last_time
))
405 except FileNotFoundError
:
406 # Not run on this thing before.
408 "Old-style download directory found. Assuming update required.")
409 self
.last_time
= None
410 self
._needs
_download
= True
414 # OK, so we have a timestamp, lets see if there is anything new to get
415 for file_link
in self
._file
_links
:
416 if file_link
.last_update
> self
.last_time
:
418 "Found new/updated file {} - {}".format(file_link
.name
, file_link
.last_update
))
419 self
._needs
_download
= True
423 # Got here, so nope, no new files.
424 self
._needs
_download
= False
427 def download(self
, base_dir
):
428 """ Download all files for a given thing.
429 Returns True iff the thing is now downloaded (not iff it downloads the thing!)
432 self
._parse
(base_dir
)
436 "Unable to parse {} - aborting download".format(self
.thing_id
))
439 if not self
._needs
_download
:
440 print("{} - {} already downloaded - skipping.".format(self
.thing_id
, self
.title
))
441 return State
.ALREADY_DOWNLOADED
443 if not self
._file
_links
:
444 print("{} - {} appears to have no files. Thingiverse acting up again?".format(self
.thing_id
, self
.title
))
447 # Have we already downloaded some things?
448 timestamp_file
= os
.path
.join(self
.download_dir
, 'timestamp.txt')
450 if os
.path
.exists(self
.download_dir
):
451 if not os
.path
.exists(timestamp_file
):
452 # edge case: old style dir w/out timestamp.
453 logging
.warning("Old style download dir found at {}".format(self
.title
))
455 target_dir
= "{}_old".format(self
.download_dir
)
456 while os
.path
.exists(target_dir
):
457 prev_count
= prev_count
+ 1
458 target_dir
= "{}_old_{}".format(self
.download_dir
, prev_count
)
459 os
.rename(self
.download_dir
, target_dir
)
461 prev_dir
= "{}_{}".format(self
.download_dir
, slugify(self
.last_time
.__str
__()))
462 os
.rename(self
.download_dir
, prev_dir
)
464 # Get the list of files to download
470 if not self
.last_time
:
471 # If we don't have anything to copy from, then it is all new.
472 logging
.debug("No last time, downloading all files")
473 new_file_links
= self
._file
_links
474 new_last_time
= new_file_links
[0].last_update
476 for file_link
in new_file_links
:
477 new_last_time
= max(new_last_time
, file_link
.last_update
)
478 logging
.debug("New timestamp will be {}".format(new_last_time
))
480 new_last_time
= self
.last_time
481 for file_link
in self
._file
_links
:
482 if file_link
.last_update
> self
.last_time
:
483 new_file_links
.append(file_link
)
484 new_last_time
= max(new_last_time
, file_link
.last_update
)
486 old_file_links
.append(file_link
)
488 logging
.debug("new timestamp {}".format(new_last_time
))
490 # OK. Time to get to work.
491 logging
.debug("Generating download_dir")
492 os
.mkdir(self
.download_dir
)
493 filelist_file
= os
.path
.join(self
.download_dir
, "filelist.txt")
494 with
open(filelist_file
, 'w', encoding
="utf-8") as fl_handle
:
495 for fl
in self
._file
_links
:
498 fl
.link
=requests
.get(fl
.link
, allow_redirects
=False).headers
['location']
500 # Sometimes Thingiverse just gives us the direct link the first time. Not sure why.
503 fl_handle
.write("{},{},{}, {}\n".format(fl
.link
, fl
.name
, fl
.last_update
, base_link
))
506 # First grab the cached files (if any)
507 logging
.info("Copying {} unchanged files.".format(len(old_file_links
)))
508 for file_link
in old_file_links
:
509 old_file
= os
.path
.join(prev_dir
, file_link
.name
)
510 new_file
= truncate_name(os
.path
.join(self
.download_dir
, file_link
.name
))
512 logging
.debug("Copying {} to {}".format(old_file
, new_file
))
513 copyfile(old_file
, new_file
)
514 except FileNotFoundError
:
516 "Unable to find {} in old archive, redownloading".format(file_link
["title"]))
517 new_file_links
.append(file_link
)
519 # Now download the new ones
520 logging
.info("Downloading {} new files of {}".format(
521 len(new_file_links
), len(self
._file
_links
)))
523 for file_link
in new_file_links
:
524 file_name
= truncate_name(os
.path
.join(self
.download_dir
, file_link
.name
))
525 logging
.debug("Downloading {} from {} to {}".format(
526 file_link
.name
, file_link
.link
, file_name
))
527 data_req
= requests
.get(file_link
.link
)
528 with
open(file_name
, 'wb') as handle
:
529 handle
.write(data_req
.content
)
530 except Exception as exception
:
531 logging
.error("Failed to download {} - {}".format(file_link
.name
, exception
))
532 fail_dir(self
.download_dir
)
536 # People like images. But this doesn't work yet.
537 image_dir
= os
.path
.join(self
.download_dir
, 'images')
538 logging
.info("Downloading {} images.".format(len(self
._image
_links
)))
541 for imagelink
in self
._image
_links
:
542 filename
= os
.path
.basename(imagelink
)
543 if filename
.endswith('stl'):
544 filename
= "{}.png".format(filename
)
545 image_req
= requests
.get(imagelink
)
546 with
open(truncate_name(os
.path
.join(image_dir
, filename
)), 'wb') as handle
:
547 handle
.write(image_req
.content
)
548 except Exception as exception
:
549 print("Failed to download {} - {}".format(filename
, exception
))
550 fail_dir(self
.download_dir
)
554 # instructions are good too.
555 logging.info("Downloading readme")
557 readme_txt = soup.find('meta', property='og:description')[
559 with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
560 readme_handle.write("{}\n".format(readme_txt))
561 except (TypeError, KeyError) as exception:
562 logging.warning("No readme? {}".format(exception))
563 except IOError as exception:
564 logging.warning("Failed to write readme! {}".format(exception))
567 # Best get some licenses
568 logging
.info("Downloading license")
571 with
open(truncate_name(os
.path
.join(self
.download_dir
, 'license.txt')), 'w', encoding
="utf-8") as license_handle
:
572 license_handle
.write("{}\n".format(self
._license
))
573 except IOError as exception
:
574 logging
.warning("Failed to write license! {}".format(exception
))
577 # Now write the timestamp
578 with
open(timestamp_file
, 'w', encoding
="utf-8") as timestamp_handle
:
579 timestamp_handle
.write(new_last_time
.__str
__())
580 except Exception as exception
:
581 print("Failed to write timestamp file - {}".format(exception
))
582 fail_dir(self
.download_dir
)
584 self
._needs
_download
= False
585 logging
.debug("Download of {} finished".format(self
.title
))
def do_batch(batch_file, download_dir, quick):
    """ Read a file in line by line, parsing each as a set of calls to this script."""
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip blank lines.
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick).download()
                continue
            if command_arr[0] == "user":
                # Bug fix: this branch previously logged "batch collection
                # instruction" (copy-paste from the branch above).
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
619 """ Entry point for script being run as a command. """
620 parser
= argparse
.ArgumentParser()
621 parser
.add_argument("-l", "--log-level", choices
=[
622 'debug', 'info', 'warning'], default
='info', help="level of logging desired")
623 parser
.add_argument("-d", "--directory",
624 help="Target directory to download into")
625 parser
.add_argument("-f", "--log-file",
626 help="Place to log debug information to")
627 parser
.add_argument("-q", "--quick", action
="store_true",
628 help="Assume date ordering on posts")
630 subparsers
= parser
.add_subparsers(
631 help="Type of thing to download", dest
="subcommand")
632 collection_parser
= subparsers
.add_parser(
633 'collection', help="Download one or more entire collection(s)")
634 collection_parser
.add_argument(
635 "owner", help="The owner of the collection(s) to get")
636 collection_parser
.add_argument(
637 "collections", nargs
="+", help="Space seperated list of the name(s) of collection to get")
638 thing_parser
= subparsers
.add_parser(
639 'thing', help="Download a single thing.")
640 thing_parser
.add_argument(
641 "things", nargs
="*", help="Space seperated list of thing ID(s) to download")
642 user_parser
= subparsers
.add_parser(
643 "user", help="Download all things by one or more users")
644 user_parser
.add_argument(
645 "users", nargs
="+", help="A space seperated list of the user(s) to get the designs of")
646 batch_parser
= subparsers
.add_parser(
647 "batch", help="Perform multiple actions written in a text file")
648 batch_parser
.add_argument(
649 "batch_file", help="The name of the file to read.")
650 subparsers
.add_parser("version", help="Show the current version")
652 args
= parser
.parse_args()
653 if not args
.subcommand
:
656 if not args
.directory
:
657 args
.directory
= os
.getcwd()
659 logger
= logging
.getLogger()
660 formatter
= logging
.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
661 logger
.setLevel(logging
.DEBUG
)
662 console_handler
= logging
.StreamHandler()
663 console_handler
.setLevel(args
.log_level
.upper())
665 logger
.addHandler(console_handler
)
667 file_handler
= logging
.FileHandler(args
.log_file
)
668 file_handler
.setLevel(logging
.DEBUG
)
669 file_handler
.setFormatter(formatter
)
670 logger
.addHandler(file_handler
)
674 thing_queue
= multiprocessing
.JoinableQueue()
675 logging
.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT
))
676 downloaders
= [Downloader(thing_queue
, args
.directory
) for _
in range(DOWNLOADER_COUNT
)]
677 for downloader
in downloaders
:
681 if args
.subcommand
.startswith("collection"):
682 for collection
in args
.collections
:
683 Collection(args
.owner
, collection
, args
.directory
, args
.quick
).download()
684 if args
.subcommand
== "thing":
685 for thing
in args
.things
:
686 thing_queue
.put(thing
)
687 if args
.subcommand
== "user":
688 for user
in args
.users
:
689 Designs(user
, args
.directory
, args
.quick
).download()
690 if args
.subcommand
== "version":
691 print("thingy_grabber.py version {}".format(VERSION
))
692 if args
.subcommand
== "batch":
693 do_batch(args
.batch_file
, args
.directory
, args
.quick
)
695 # Stop the downloader processes
696 for downloader
in downloaders
:
697 thing_queue
.put(None)
# Ensure the browser is shut down cleanly however the script exits.
atexit.register(BROWSER.quit)


if __name__ == "__main__":
    # freeze_support() keeps multiprocessing working in frozen (py2exe/
    # PyInstaller) builds on Windows.
    multiprocessing.freeze_support()
    main()