"""
Thingiverse bulk downloader
"""
import argparse
import atexit
import datetime
import enum
import logging
import multiprocessing
import os
import re
import sys
import unicodedata

import requests
import selenium.common.exceptions
from shutil import copyfile
from bs4 import BeautifulSoup
from dataclasses import dataclass
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes it would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
# BROWSER = webdriver.PhantomJS('./phantomjs')
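# A single headless Firefox instance is shared by all parsing; the file list,
# images and license are rendered by JavaScript, so they only appear once the
# page's scripts have run (which is what PageChecker waits for below).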
options = Options()
options.add_argument("--headless")
BROWSER = webdriver.Firefox(options=options)
BROWSER.set_window_size(1980, 1080)
@dataclass
class FileLink:
    name: str
    last_update: datetime.datetime
    link: str


class State(enum.Enum):
    ALREADY_DOWNLOADED = enum.auto()


def fail_dir(dir_name):
    """ When a download has failed, move it sideways. """
    target_dir = "{}_failed".format(dir_name)
    inc = 0
    while os.path.exists(target_dir):
        target_dir = "{}_failed_{}".format(dir_name, inc)
        inc += 1
    os.rename(dir_name, target_dir)


def truncate_name(file_name):
    """ Ensure the filename is not too long for, well, Windows basically. """
    path = os.path.abspath(file_name)
    if len(path) <= MAX_PATH_LENGTH:
        return path
    to_cut = len(path) - (MAX_PATH_LENGTH + 3)
    base, extension = os.path.splitext(path)
    # Trim the base name so the result fits under the path length limit.
    base = base[:len(base) - to_cut]
    inc = 0
    new_path = "{}_{}{}".format(base, inc, extension)
    while os.path.exists(new_path):
        inc += 1
        new_path = "{}_{}{}".format(base, inc, extension)
    return new_path
95 """ Remove whitespace from a string """
96 return str(NO_WHITESPACE_REGEX
.sub('-', value
))


def slugify(value):
    """
    Normalise the string, remove characters that are invalid in filenames,
    and convert the string to lowercase.
    """
    value = unicodedata.normalize('NFKC', value).lower().strip()
    value = re.sub(r'[\\/<>:\?\*\|"]', '', value)
    value = re.sub(r'\.*$', '', value)
    return value


class PageChecker(object):
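    """ Callable intended to be used as a custom WebDriverWait condition:
        it is polled until the thing page has rendered its title, file rows,
        images and license text.
    """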
    def __init__(self):
        self.title = None
        self.file_count = None
        self.log = []

    def __call__(self, _):
        self.log.append("call")
        if self.title is None:
            # First find the name.
            name = EC._find_element(
                BROWSER, (By.CSS_SELECTOR, "[class^=ThingPage__modelName]"))
            self.title = name.text
        if self.file_count is None:
            # OK. Do we know how many files we have to download?
            metrics = EC._find_elements(
                BROWSER, (By.CSS_SELECTOR, "[class^=MetricButton]"))
            self.log.append("got some metrics: {}".format(len(metrics)))
            cur_count = int([x.text.split("\n")[0]
                             for x in metrics
                             if x.text.endswith("\nThing Files")][0])
            self.log.append(cur_count)
            self.file_count = cur_count
        self.log.append("looking for {} files".format(self.file_count))
        fileRows = EC._find_elements(
            BROWSER, (By.CSS_SELECTOR, "[class^=ThingFile__fileRow]"))
        self.log.append("found {} files".format(len(fileRows)))
        if len(fileRows) < self.file_count:
            # Not every file row has rendered yet, so the page is not ready.
            return False
        self.log.append("Looking for images")
        # By this point we _should_ have loaded all the images.
        self.images = EC._find_elements(
            BROWSER, (By.CSS_SELECTOR, "[class^=thumb]"))
        self.license = EC._find_element(
            BROWSER, (By.CSS_SELECTOR, "[class^=License__licenseText]")).text
        self.log.append("found {} images".format(len(self.images)))
        self.files = fileRows
        return True


class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """
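    # Each worker pulls thing ids off thing_queue and downloads them;
    # a None on the queue (put there by main() on shutdown) tells it to exit.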

    def __init__(self, thing_queue, download_directory):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory
170 """ actual download loop.
173 thing_id
= self
.thing_queue
.get()
175 logging
.info("Shutting download queue")
176 self
.thing_queue
.task_done()
178 logging
.info("Handling id {}".format(thing_id
))
179 Thing(thing_id
).download(self
.download_directory
)
180 self
.thing_queue
.task_done()
188 """ Holds details of a group of things for download
189 This is effectively (although not actually) an abstract class
190 - use Collection or Designs instead.
193 def __init__(self
, quick
):
199 # Should we stop downloading when we hit a known datestamp?
201 # These should be set by child classes.
203 self
.download_dir
= None
204 self
.collection_url
= None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)
216 """ retrieve the things of the grouping. """
218 # We've already done it.
221 # Check for initialisation:
223 logging
.error("No URL set - object not initialised properly?")
224 raise ValueError("No URL set - object not initialised properly?")
        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
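        # Walk every page of the listing via the AJAX endpoint and collect
        # the thing ids from the card links on each page.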
        parameters = {
            'base_url': self.url,
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]
253 """ Downloads all the files in a collection """
        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")
        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} - {}".format(idx, thing))
            RC = Thing(thing).download(self.download_dir)
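            # Quick mode assumes the things come back newest first, so hitting
            # one that is already downloaded means we have caught up.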
            if self.quick and RC == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return


class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION


class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION
303 """ An individual design on thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._needs_download = True
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        BROWSER.get(url)
        wait = WebDriverWait(BROWSER, 60)
        pc = PageChecker()
        try:
            wait.until(pc)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        except selenium.common.exceptions.TimeoutException:
            logging.error(pc.log)
            logging.error("Timeout trying to parse thing {}".format(self.thing_id))
            return

        self.title = pc.title
        self._file_links = []
        for link in pc.files:
            logging.debug("Parsing link: {}".format(link.text))
            link_link = link.find_element_by_xpath(".//a").get_attribute("href")
            if link_link.endswith("/zip"):
                # Bulk zip link - skip it.
                continue
            try:
                link_title, link_details, _ = link.text.split("\n")
            except ValueError:
                # If it is a filetype that doesn't generate a picture,
                # then we get an extra field at the start.
                _, link_title, link_details, _ = link.text.split("\n")
            # link_details will be something like '461 kb | Updated 06-11-2019 | 373 Downloads'
            # Need to convert from M D Y to Y M D.
            link_date = [int(x) for x in link_details.split("|")[1].split()[-1].split("-")]
            try:
                self._file_links.append(FileLink(
                    link_title,
                    datetime.datetime(link_date[2], link_date[0], link_date[1]),
                    link_link))
            except ValueError:
                logging.error(link_date)

        self._image_links = [x.find_element_by_xpath(".//img").get_attribute("src")
                             for x in pc.images]
        self._license = pc.license
        self.old_download_dir = os.path.join(base_dir, slugify(self.title))
        self.download_dir = os.path.join(
            base_dir, "{} - {}".format(self.thing_id, slugify(self.title)))

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))
        if not os.path.exists(self.download_dir):
            logging.info("Looking for old dir at {}".format(self.old_download_dir))
            if os.path.exists(self.old_download_dir):
                logging.warning(
                    "Found previous style download directory. Moving it from {} to {}".format(
                        self.old_download_dir, self.download_dir))
                os.rename(self.old_download_dir, self.download_dir)
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._needs_download = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                # The .split(' ')[0] removes the time of day from old-style timestamps.
                last_bits = [int(x) for x in timestamp_handle.readlines()[0].split(' ')[0].split("-")]
                logging.warning(last_bits)
                try:
                    self.last_time = datetime.datetime(
                        last_bits[0], last_bits[1], last_bits[2])
                except ValueError:
                    # This one appears to be M D Y
                    self.last_time = datetime.datetime(
                        last_bits[2], last_bits[0], last_bits[1])
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._needs_download = True
            return

        # OK, so we have a timestamp; let's see if there is anything new to get.
        for file_link in self._file_links:
            if file_link.last_update > self.last_time:
                logging.info(
                    "Found new/updated file {} - {}".format(
                        file_link.name, file_link.last_update))
                self._needs_download = True
                return

        # Got here, so nope, no new files.
        self._needs_download = False

    def download(self, base_dir):
        """ Download all files for a given thing.
            Returns True iff the thing is now downloaded (not iff it downloads the thing!).
        """
        self._parse(base_dir)
        if not hasattr(self, "title"):
            # _parse bailed out before finding anything, so give up on this thing.
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return None
        if not self._needs_download:
            print("{} - {} already downloaded - skipping.".format(self.thing_id, self.title))
            return State.ALREADY_DOWNLOADED

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
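        # If a previous download exists, move it out of the way first: old-style
        # directories (no timestamp file) become "<dir>_old", otherwise the
        # directory is renamed with its timestamp so unchanged files can be
        # copied back from it below.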
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # Edge case: old style dir w/out timestamp.
                logging.warning("Old style download dir found at {}".format(self.title))
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                prev_dir = "{}_{}".format(self.download_dir, slugify(self.last_time.__str__()))
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        new_file_links = []
        old_file_links = []

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            new_last_time = new_file_links[0].last_update

            for file_link in new_file_links:
                new_last_time = max(new_last_time, file_link.last_update)
            logging.debug("New timestamp will be {}".format(new_last_time))
        else:
            new_last_time = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    new_last_time = max(new_last_time, file_link.last_update)
                else:
                    old_file_links.append(file_link)
        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                base_link = fl.link
                try:
                    # Resolve the redirect to get the direct download link.
                    fl.link = requests.get(
                        fl.link, allow_redirects=False).headers['location']
                except KeyError:
                    # Sometimes Thingiverse just gives us the direct link the first time.
                    # Not sure why.
                    pass
                fl_handle.write("{},{},{}, {}\n".format(fl.link, fl.name, fl.last_update, base_link))

        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link.name)
            new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(file_link.name))
                new_file_links.append(file_link)

        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = requests.get(file_link.link)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            fail_dir(self.download_dir)

        # People like images. But this doesn't work yet.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.basename(imagelink)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(imagelink)
                with open(truncate_name(os.path.join(image_dir, filename)), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(filename, exception))
            fail_dir(self.download_dir)

        # Instructions are good too.
        logging.info("Downloading readme")
        try:
            readme_txt = soup.find('meta', property='og:description')['content']
            with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        # Best get some licenses
        logging.info("Downloading license")
        try:
            with open(truncate_name(os.path.join(self.download_dir, 'license.txt')),
                      'w', encoding="utf-8") as license_handle:
                license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        # Now write the timestamp
        try:
            with open(timestamp_file, 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(new_last_time.__str__())
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))


def do_batch(batch_file, download_dir, quick):
    """ Read a file in, line by line, parsing each line as a set of calls to this script. """
    with open(batch_file) as handle:
        for line in handle:
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
606 """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")
    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space separated list of the name(s) of the collection(s) to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")
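    # Example invocations (placeholders in angle brackets):
    #   thingy_grabber.py -d downloads thing <thing_id>
    #   thingy_grabber.py -q collection <owner> <collection_name>
    #   thingy_grabber.py user <username>
    #   thingy_grabber.py batch <batch_file>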

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())
    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick)

    # Stop the downloader processes
    for downloader in downloaders:
        thing_queue.put(None)

    atexit.register(BROWSER.quit)


if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()