#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""
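# Example invocations (subcommands are defined in main() below; IDs and
# names are illustrative):
#   thingy_grabber.py thing 12345
#   thingy_grabber.py -d ~/downloads collection <owner> <collection-name>
#   thingy_grabber.py --quick user <username>
#   thingy_grabber.py batch batch.txt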
import argparse
import datetime
import enum
import logging
import multiprocessing
import os
import re
import sys
import unicodedata
from dataclasses import dataclass
from shutil import copyfile

import requests
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes it would break the
# paging logic in Grouping.get() below.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# Restored for runnability; the original values were not preserved in this
# copy, so both of these are assumptions.
DOWNLOADER_COUNT = 1
VERSION = "unknown"
# BROWSER = webdriver.PhantomJS('./phantomjs')
options = Options()
options.add_argument("--headless")
BROWSER = webdriver.Firefox(options=options)

BROWSER.set_window_size(1980, 1080)
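# Assumes Firefox plus geckodriver on the PATH; the fixed window size is
# presumably there to force a desktop page layout so the CSS selectors used
# by PageChecker below actually match.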
@dataclass
class FileLink:
    name: str
    last_update: datetime.datetime
    link: str
class State(enum.Enum):
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()
65 """ Remove whitespace from a string """
66 return str(NO_WHITESPACE_REGEX
.sub('-', value
))
def strip_invalid_chars(value):
    """
    Normalizes a string to ASCII, dropping any characters that cannot be
    represented.
    """
    return unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
def slugify(value):
    """
    Normalizes a string: strips invalid and special characters and converts
    runs of whitespace to hyphens.
    """
    value = strip_invalid_chars(value)
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = strip_ws(value)
    return value
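# For example, slugify("Café Door Hinge!") returns "Cafe-Door-Hinge": the
# NFKD/ASCII pass drops the accent, the "!" is stripped, and the spaces
# collapse to hyphens.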
class PageChecker(object):
    """ Scrapes a thing page as it loads, recording what has rendered so far. """

    def __init__(self):
        self.log = []
        self.title = None
        self.file_count = None
        self.files = None
        self.images = None
        self.license = None

    def __call__(self, _):
        try:
            self.log.append("call")
            if self.title is None:
                # first find the name
                name = EC._find_element(
                    BROWSER, (By.CSS_SELECTOR, "[class^=ThingPage__modelName]"))
                if name is None:
                    return False
                self.title = name.text

            if self.file_count is None:
                # OK. Do we know how many files we have to download?
                metrics = EC._find_elements(
                    BROWSER, (By.CSS_SELECTOR, "[class^=MetricButton]"))
                self.log.append("got some metrics: {}".format(len(metrics)))
                cur_count = int([x.text.split("\n")[0] for x in metrics
                                 if x.text.endswith("\nThing Files")][0])
                self.log.append(cur_count)
                if cur_count == 0:
                    return False
                self.file_count = cur_count

            self.log.append("looking for {} files".format(self.file_count))
            fileRows = EC._find_elements(
                BROWSER, (By.CSS_SELECTOR, "[class^=ThingFile__fileRow]"))
            self.log.append("found {} files".format(len(fileRows)))
            if len(fileRows) < self.file_count:
                return False

            self.log.append("Looking for images")
            # By this point we _should_ have loaded all the images
            self.images = EC._find_elements(
                BROWSER, (By.CSS_SELECTOR, "[class^=thumb]"))
            self.license = EC._find_element(
                BROWSER, (By.CSS_SELECTOR, "[class^=License__licenseText]")).text
            self.log.append("found {} images".format(len(self.images)))
            self.files = fileRows
            return True
        except Exception:
            # Any scraping hiccup just means "not ready yet".
            return False
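# PageChecker instances are used as the predicate of a Selenium WebDriverWait
# (see Thing._parse below): wait.until(pc) invokes pc(driver) repeatedly, so
# each call re-scrapes whatever has rendered so far and returns False until
# the title, file rows, images and license have all appeared.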
class Downloader(multiprocessing.Process):
    """
    Worker process that downloads the things pulled from a shared queue.
    """

    def __init__(self, thing_queue, download_directory):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory
148 """ actual download loop.
151 thing_id
= self
.thing_queue
.get()
153 logging
.info("Shutting download queue")
154 self
.thing_queue
.task_done()
156 logging
.info("Handling id {}".format(thing_id
))
157 Thing(thing_id
).download(self
.download_directory
)
158 self
.thing_queue
.task_done()
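# A minimal sketch of the queue protocol used by main() below; the directory
# and ID are illustrative:
#
#   queue = multiprocessing.JoinableQueue()
#   worker = Downloader(queue, "/tmp/things")
#   worker.start()
#   queue.put("12345")  # hypothetical thing ID
#   queue.put(None)     # sentinel: tells the worker to exit its loop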
166 """ Holds details of a group of things for download
167 This is effectively (although not actually) an abstract class
168 - use Collection or Designs instead.
171 def __init__(self
, quick
):
177 # Should we stop downloading when we hit a known datestamp?
179 # These should be set by child classes.
181 self
.download_dir
= None
182 self
.collection_url
= None
    def _get_small_grouping(self, req):
        """ Handle small groupings """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things
194 """ retrieve the things of the grouping. """
196 # We've already done it.
199 # Check for initialisation:
201 logging
.error("No URL set - object not initialised properly?")
202 raise ValueError("No URL set - object not initialised properly?")
204 # Get the internal details of the grouping.
205 logging
.debug("Querying {}".format(self
.url
))
206 c_req
= requests
.get(self
.url
)
207 total
= TOTAL_REGEX
.search(c_req
.text
)
209 # This is a small (<13) items grouping. Pull the list from this req.
210 return self
._get
_small
_grouping
(c_req
)
211 self
.total
= total
.groups()[0]
212 self
.req_id
= ID_REGEX
.search(c_req
.text
).groups()[0]
213 self
.last_page
= int(LAST_PAGE_REGEX
.search(c_req
.text
).groups()[0])
214 self
.per_page
= PER_PAGE_REGEX
.search(c_req
.text
).groups()[0]
216 'base_url': self
.url
,
221 for current_page
in range(1, self
.last_page
+ 1):
222 parameters
['page'] = current_page
223 req
= requests
.post(self
.collection_url
, parameters
)
224 soup
= BeautifulSoup(req
.text
, features
='lxml')
225 links
= soup
.find_all('a', {'class': 'card-img-holder'})
226 self
.things
+= [x
['href'].split(':')[1] for x
in links
]
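    # Each card is an <a class="card-img-holder"> with a relative href like
    # "/thing:12345", so x['href'].split(':')[1] yields the bare thing ID
    # (the ID here is illustrative).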
231 """ Downloads all the files in a collection """
235 if not self
.download_dir
:
237 "No download_dir set - invalidly initialised object?")
239 base_dir
= os
.getcwd()
241 os
.mkdir(self
.download_dir
)
242 except FileExistsError
:
243 logging
.info("Target directory {} already exists. Assuming a resume."
244 .format(self
.download_dir
))
245 logging
.info("Downloading {} thing(s).".format(self
.total
))
246 for idx
, thing
in enumerate(self
.things
):
247 logging
.info("Downloading thing {} - {}".format(idx
, thing
))
248 RC
= Thing(thing
).download(self
.download_dir
)
249 if self
.quick
and RC
==State
.ALREADY_DOWNLOADED
:
250 logging
.info("Caught up, stopping.")
class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(
            directory, "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION
class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION
281 """ An individual design on thingiverse. """
283 def __init__(self
, thing_id
):
284 self
.thing_id
= thing_id
285 self
.last_time
= None
287 self
._needs
_download
= True
290 self
.download_dir
= None
    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            BROWSER.get(url)
            wait = WebDriverWait(BROWSER, 60)
            pc = PageChecker()
            wait.until(pc)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        except selenium.common.exceptions.TimeoutException:
            logging.error(pc.log)
            logging.error("Timeout trying to parse thing {}".format(self.thing_id))
            return
        self.title = pc.title
        self._file_links = []
        for link in pc.files:
            logging.debug("Parsing link: {}".format(link.text))
            link_link = link.find_element_by_xpath(".//a").get_attribute("href")
            if link_link.endswith("/zip"):
                # Skip the bulk "download all files" zip link.
                continue
            try:
                link_title, link_details, _ = link.text.split("\n")
            except ValueError:
                # If it is a filetype that doesn't generate a picture,
                # then we get an extra field at the start.
                _, link_title, link_details, _ = link.text.split("\n")

            # link_details will be something like
            # '461 kb | Updated 06-11-2019 | 373 Downloads'
            # and we need to convert from M D Y to Y M D.
            link_date = [int(x) for x in
                         link_details.split("|")[1].split()[-1].split("-")]
            try:
                self._file_links.append(
                    FileLink(strip_invalid_chars(link_title),
                             datetime.datetime(link_date[2], link_date[0], link_date[1]),
                             link_link))
            except ValueError:
                logging.error(link_date)
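            # Worked example: '461 kb | Updated 06-11-2019 | 373 Downloads'
            # gives link_date == [6, 11, 2019] (M, D, Y), which becomes
            # datetime.datetime(2019, 6, 11) in the FileLink above.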
        self._image_links = [x.find_element_by_xpath(".//img").get_attribute("src")
                             for x in pc.images]
        self._license = pc.license

        self.old_download_dir = os.path.join(base_dir, slugify(self.title))
        self.download_dir = os.path.join(
            base_dir, "{} - {}".format(self.thing_id, slugify(self.title)))

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            logging.info("Looking for old dir at {}".format(self.old_download_dir))
            if os.path.exists(self.old_download_dir):
                logging.warning("Found previous style download directory. Moving it from {} to {}".format(
                    self.old_download_dir, self.download_dir))
                os.rename(self.old_download_dir, self.download_dir)
            else:
                # Not yet downloaded at all.
                self._parsed = True
                return
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were written.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                # the .split(' ')[0] removes the time part from old-style timestamps
                last_bits = [int(x) for x in
                             timestamp_handle.readlines()[0].split(' ')[0].split("-")]
                logging.warning(last_bits)
                try:
                    self.last_time = datetime.datetime(last_bits[0], last_bits[1], last_bits[2])
                except ValueError:
                    # This one appears to be M D Y
                    self.last_time = datetime.datetime(last_bits[2], last_bits[0], last_bits[1])

            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "No timestamp file found. Assuming update required.")
            self.last_time = None
            self._needs_download = True
            self._parsed = True
            return

        # OK, so we have a timestamp; let's see if there is anything new to get.
        for file_link in self._file_links:
            if file_link.last_update > self.last_time:
                logging.info(
                    "Found new/updated file {} - {}".format(file_link.name, file_link.last_update))
                self._needs_download = True
                self._parsed = True
                return

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True
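    # timestamp.txt stores str(datetime), e.g. "2019-06-11 00:00:00"; _parse
    # reads back only the "2019-06-11" date prefix, so update checks are
    # day-granular.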
    def download(self, base_dir):
        """ Download all files for a given thing.
            Returns a State value; ALREADY_DOWNLOADED means the thing was
            already up to date (not that this call downloaded anything!).
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            print("{} - {} already downloaded - skipping.".format(self.thing_id, self.title))
            return State.ALREADY_DOWNLOADED
        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir without timestamp.
                logging.warning("Old style download dir found at {}".format(self.download_dir))
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                prev_dir = "{}_{}".format(self.download_dir, slugify(self.last_time.__str__()))
                os.rename(self.download_dir, prev_dir)
        # Get the list of files to download
        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            new_last_time = new_file_links[0].last_update

            for file_link in new_file_links:
                new_last_time = max(new_last_time, file_link.last_update)
            logging.debug("New timestamp will be {}".format(new_last_time))
        else:
            new_last_time = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    new_last_time = max(new_last_time, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(new_last_time))
        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w') as fl_handle:
            for fl in self._file_links:
                base_link = fl.link
                try:
                    fl.link = requests.get(fl.link, allow_redirects=False).headers['location']
                except KeyError:
                    # Sometimes Thingiverse just gives us the direct link the
                    # first time. Not sure why.
                    pass
                fl_handle.write("{},{},{}, {}\n".format(fl.link, fl.name, fl.last_update, base_link))
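        # Each filelist.txt line is "<resolved link>,<name>,<timestamp>, <original link>",
        # e.g. (illustrative):
        #   https://cdn.example.com/asset.stl,asset.stl,2019-06-11 00:00:00, https://www.thingiverse.com/download:123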
        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link.name)
            new_file = os.path.join(self.download_dir, file_link.name)
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(file_link.name))
                new_file_links.append(file_link)
        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = os.path.join(self.download_dir, file_link.name)
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = requests.get(file_link.link)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED
        # People like images. But this doesn't work yet.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.basename(imagelink)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(imagelink)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED
        # Instructions are good too, but this block is disabled: it still
        # references `soup` from the old requests/BeautifulSoup parser, which
        # no longer exists in this selenium-based version.
        # logging.info("Downloading readme")
        # try:
        #     readme_txt = soup.find('meta', property='og:description')[
        #         'content']
        #     with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
        #         readme_handle.write("{}\n".format(readme_txt))
        # except (TypeError, KeyError) as exception:
        #     logging.warning("No readme? {}".format(exception))
        # except IOError as exception:
        #     logging.warning("Failed to write readme! {}".format(exception))
        # Best get some licenses
        logging.info("Downloading license")
        try:
            with open(os.path.join(self.download_dir, 'license.txt'), 'w') as license_handle:
                license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))
        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time.__str__())
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
        return State.OK
def do_batch(batch_file, download_dir, quick):
    """ Read a file in line by line, parsing each as a set of calls to this script. """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
584 """ Entry point for script being run as a command. """
585 parser
= argparse
.ArgumentParser()
586 parser
.add_argument("-l", "--log-level", choices
=[
587 'debug', 'info', 'warning'], default
='info', help="level of logging desired")
588 parser
.add_argument("-d", "--directory",
589 help="Target directory to download into")
590 parser
.add_argument("-f", "--log-file",
591 help="Place to log debug information to")
592 parser
.add_argument("-q", "--quick", action
="store_true",
593 help="Assume date ordering on posts")
595 subparsers
= parser
.add_subparsers(
596 help="Type of thing to download", dest
="subcommand")
597 collection_parser
= subparsers
.add_parser(
598 'collection', help="Download one or more entire collection(s)")
599 collection_parser
.add_argument(
600 "owner", help="The owner of the collection(s) to get")
601 collection_parser
.add_argument(
602 "collections", nargs
="+", help="Space seperated list of the name(s) of collection to get")
603 thing_parser
= subparsers
.add_parser(
604 'thing', help="Download a single thing.")
605 thing_parser
.add_argument(
606 "things", nargs
="*", help="Space seperated list of thing ID(s) to download")
607 user_parser
= subparsers
.add_parser(
608 "user", help="Download all things by one or more users")
609 user_parser
.add_argument(
610 "users", nargs
="+", help="A space seperated list of the user(s) to get the designs of")
611 batch_parser
= subparsers
.add_parser(
612 "batch", help="Perform multiple actions written in a text file")
613 batch_parser
.add_argument(
614 "batch_file", help="The name of the file to read.")
615 subparsers
.add_parser("version", help="Show the current version")
617 args
= parser
.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()
    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())

    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    # Start the downloader processes.
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()
    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick)
    # Stop the downloader processes
    for downloader in downloaders:
        thing_queue.put(None)
if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()