""" Thingiverse bulk downloader """
import re
import os
import sys
import argparse
import unicodedata
import requests
import logging
import multiprocessing
import enum
import datetime

from shutil import copyfile
from bs4 import BeautifulSoup
from dataclasses import dataclass

import selenium.common.exceptions
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12; if it ever changes, it will break the
# pagination logic below.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# Referenced later but not shown in this listing; both values are assumptions.
VERSION = "0.0.0"  # assumed placeholder version string
DOWNLOADER_COUNT = 1  # assumed: a single download worker
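# Illustrative only: grouping pages embed a JSON blob along the lines of
#   {"id":1234,"total":57,"last_page":5,"per_page":12,...}
# (field values hypothetical), from which the regexes above pull the
# pagination details, e.g.
#   TOTAL_REGEX.search(page_text).groups()[0]  ->  '57'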
# BROWSER = webdriver.PhantomJS('./phantomjs')
options = Options()
options.add_argument("--headless")
BROWSER = webdriver.Firefox(options=options)

BROWSER.set_window_size(1980, 1080)
@dataclass
class FileLink:
    """ A single downloadable file attached to a thing. """
    name: str
    last_update: datetime.datetime
    link: str
class State(enum.Enum):
    """ Outcome of a download attempt. """
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()
65 """ Remove whitespace from a string """
66 return str(NO_WHITESPACE_REGEX
.sub('-', value
))
def slugify(value):
    """
    Normalizes a string: strips accents and non-alphanumeric characters,
    and converts whitespace runs to hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value
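# For example (illustrative inputs):
#   slugify("Benchy (v2)!")     ->  "Benchy-v2"
#   strip_ws("my cool thing")   ->  "my-cool-thing"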
class PageChecker(object):
    """ Predicate object for WebDriverWait: reports whether a thing's files
        page has rendered enough to be scraped. """

    def __init__(self):
        self.log = []
        self.title = None
        self.file_count = None
        self.files = None
        self.images = None
        self.license = None

    def __call__(self, _):
        try:
            self.log.append("call")
            if self.title is None:
                name = EC._find_element(
                    BROWSER, (By.CSS_SELECTOR, "[class^=ThingPage__modelName]"))
                if name is None:
                    return False
                self.title = name.text
            if self.file_count is None:
                # OK. Do we know how many files we have to download?
                metrics = EC._find_elements(
                    BROWSER, (By.CSS_SELECTOR, "[class^=MetricButton]"))
                self.log.append("got some metrics: {}".format(len(metrics)))
                cur_count = int([x.text.split("\n")[0] for x in metrics
                                 if x.text.endswith("\nThing Files")][0])
                self.log.append(cur_count)
                if cur_count == 0:
                    return False
                self.file_count = cur_count

            self.log.append("looking for {} files".format(self.file_count))
            fileRows = EC._find_elements(
                BROWSER, (By.CSS_SELECTOR, "[class^=ThingFile__fileRow]"))
            self.log.append("found {} files".format(len(fileRows)))
            if len(fileRows) < self.file_count:
                return False

            self.log.append("Looking for images")
            # By this point we _should_ have loaded all the images.
            self.images = EC._find_elements(
                BROWSER, (By.CSS_SELECTOR, "[class^=thumb]"))
            self.license = EC._find_element(
                BROWSER, (By.CSS_SELECTOR, "[class^=License__licenseText]")).text
            self.log.append("found {} images".format(len(self.images)))
            self.files = fileRows
            return True
        except Exception:
            # Page not ready yet; tell WebDriverWait to keep polling.
            return False
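# Usage sketch (mirrors Thing._parse below): WebDriverWait invokes a
# PageChecker instance repeatedly (every 0.5 s by default) until it returns
# a truthy value or the timeout expires:
#   pc = PageChecker()
#   WebDriverWait(BROWSER, 60).until(pc)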
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """

    def __init__(self, thing_queue, download_directory):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory

    def run(self):
        """ The actual download loop. """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                # Sentinel value: no more work.
                logging.info("Shutting down download queue")
                self.thing_queue.task_done()
                break
            logging.info("Handling id {}".format(thing_id))
            Thing(thing_id).download(self.download_directory)
            self.thing_queue.task_done()
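# Minimal wiring sketch for Downloader (mirrors main() below; the thing id
# is hypothetical):
#   queue = multiprocessing.JoinableQueue()
#   worker = Downloader(queue, os.getcwd())
#   worker.start()
#   queue.put("4733281")  # a thing id to fetch
#   queue.put(None)       # sentinel: tells the worker to exit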
159 """ Holds details of a group of things for download
160 This is effectively (although not actually) an abstract class
161 - use Collection or Designs instead.
164 def __init__(self
, quick
):
170 # Should we stop downloading when we hit a known datestamp?
172 # These should be set by child classes.
174 self
.download_dir
= None
175 self
.collection_url
= None
177 def _get_small_grouping(self
, req
):
178 """ Handle small groupings """
179 soup
= BeautifulSoup(req
.text
, features
='lxml')
180 links
= soup
.find_all('a', {'class': 'card-img-holder'})
181 self
.things
= [x
['href'].split(':')[1] for x
in links
]
182 self
.total
= len(self
.things
)
187 """ retrieve the things of the grouping. """
189 # We've already done it.
192 # Check for initialisation:
194 logging
.error("No URL set - object not initialised properly?")
195 raise ValueError("No URL set - object not initialised properly?")
197 # Get the internal details of the grouping.
198 logging
.debug("Querying {}".format(self
.url
))
199 c_req
= requests
.get(self
.url
)
200 total
= TOTAL_REGEX
.search(c_req
.text
)
202 # This is a small (<13) items grouping. Pull the list from this req.
203 return self
._get
_small
_grouping
(c_req
)
204 self
.total
= total
.groups()[0]
205 self
.req_id
= ID_REGEX
.search(c_req
.text
).groups()[0]
206 self
.last_page
= int(LAST_PAGE_REGEX
.search(c_req
.text
).groups()[0])
207 self
.per_page
= PER_PAGE_REGEX
.search(c_req
.text
).groups()[0]
209 'base_url': self
.url
,
214 for current_page
in range(1, self
.last_page
+ 1):
215 parameters
['page'] = current_page
216 req
= requests
.post(self
.collection_url
, parameters
)
217 soup
= BeautifulSoup(req
.text
, features
='lxml')
218 links
= soup
.find_all('a', {'class': 'card-img-holder'})
219 self
.things
+= [x
['href'].split(':')[1] for x
in links
]
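    # Worked example of the pagination above (numbers assumed): a grouping
    # reporting total=57 at 12 per page has last_page=5, so get() POSTs the
    # AJAX endpoint five times with page=1..5 and accumulates the
    # "thing:<id>" hrefs from each page of cards.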
224 """ Downloads all the files in a collection """
228 if not self
.download_dir
:
230 "No download_dir set - invalidly initialised object?")
232 base_dir
= os
.getcwd()
234 os
.mkdir(self
.download_dir
)
235 except FileExistsError
:
236 logging
.info("Target directory {} already exists. Assuming a resume."
237 .format(self
.download_dir
))
238 logging
.info("Downloading {} thing(s).".format(self
.total
))
239 for idx
, thing
in enumerate(self
.things
):
240 logging
.info("Downloading thing {} - {}".format(idx
, thing
))
241 RC
= Thing(thing
).download(self
.download_dir
)
242 if self
.quick
and RC
==State
.ALREADY_DOWNLOADED
:
243 logging
.info("Caught up, stopping.")
class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(
            directory, "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION
class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION
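# Example wiring (user and collection names hypothetical):
#   Collection("some_user", "cool things", "/tmp/things", quick=False).download()
#   Designs("some_user", "/tmp/things", quick=True).download()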
274 """ An individual design on thingiverse. """
276 def __init__(self
, thing_id
):
277 self
.thing_id
= thing_id
278 self
.last_time
= None
280 self
._needs
_download
= True
283 self
.download_dir
= None
    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            BROWSER.get(url)
            wait = WebDriverWait(BROWSER, 60)
            pc = PageChecker()
            wait.until(pc)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        except selenium.common.exceptions.TimeoutException:
            logging.error(pc.log)
            logging.error("Timeout trying to parse thing {}".format(self.thing_id))
            return
        self.title = pc.title
        self._file_links = []
        for link in pc.files:
            logging.debug("Parsing link: {}".format(link.text))
            link_link = link.find_element_by_xpath(".//a").get_attribute("href")
            if link_link.endswith("/zip"):
                # Skip the bulk "download all files" zip link.
                continue
            try:
                link_title, link_details, _ = link.text.split("\n")
            except ValueError:
                # If it is a filetype that doesn't generate a picture,
                # then we get an extra field at the start.
                _, link_title, link_details, _ = link.text.split("\n")

            # link_details will be something like
            # '461 kb | Updated 06-11-2019 | 373 Downloads'.
            # We need to convert from M D Y to Y M D.
            link_date = [int(x) for x in
                         link_details.split("|")[1].split()[-1].split("-")]
            logging.error(link_details)
            try:
                self._file_links.append(
                    FileLink(link_title,
                             datetime.datetime(link_date[2], link_date[0], link_date[1]),
                             link_link))
            except ValueError:
                logging.error(link_date)
        self._image_links = [x.find_element_by_xpath(".//img").get_attribute("src")
                             for x in pc.images]
        self._license = pc.license
        self.old_download_dir = os.path.join(base_dir, slugify(self.title))
        self.download_dir = os.path.join(
            base_dir, "{} - {}".format(self.thing_id, slugify(self.title)))

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            logging.info("Looking for old dir at {}".format(self.old_download_dir))
            if os.path.exists(self.old_download_dir):
                logging.warning(
                    "Found previous style download directory. Moving it from {} to {}".format(
                        self.old_download_dir, self.download_dir))
                os.rename(self.old_download_dir, self.download_dir)
            else:
                # Not previously downloaded at all.
                self._parsed = True
                return
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were recorded.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._needs_download = True
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                # The .split(' ')[0] removes the time part from old-style timestamps.
                last_bits = [int(x) for x in
                             timestamp_handle.readlines()[0].split(' ')[0].split("-")]
                logging.warning(last_bits)
                if last_bits[0] > 31:
                    # A leading year means a new-style Y M D stamp.
                    self.last_time = datetime.datetime(last_bits[0], last_bits[1], last_bits[2])
                else:
                    # This one appears to be M D Y.
                    self.last_time = datetime.datetime(last_bits[2], last_bits[0], last_bits[1])
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._needs_download = True
            self._parsed = True
            return
        # OK, so we have a timestamp; let's see if there is anything new to get.
        for file_link in self._file_links:
            if file_link.last_update > self.last_time:
                logging.info(
                    "Found new/updated file {} - {}".format(
                        file_link.name, file_link.last_update))
                self._needs_download = True
                self._parsed = True
                return

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True
    def download(self, base_dir):
        """ Download all files for a given thing.
            Returns a State value; ALREADY_DOWNLOADED means the thing was
            already current, so success does not imply anything was fetched.
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            print("{} - {} already downloaded - skipping.".format(
                self.thing_id, self.title))
            return State.ALREADY_DOWNLOADED
        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # Edge case: old-style dir without a timestamp.
                logging.warning("Old style download dir found at {}".format(self.title))
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                prev_dir = "{}_{}".format(self.download_dir,
                                          slugify(self.last_time.__str__()))
                os.rename(self.download_dir, prev_dir)
        # Get the list of files to download
        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            new_last_time = new_file_links[0].last_update

            for file_link in new_file_links:
                new_last_time = max(new_last_time, file_link.last_update)
            logging.debug("New timestamp will be {}".format(new_last_time))
        else:
            new_last_time = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    new_last_time = max(new_last_time, file_link.last_update)
                else:
                    old_file_links.append(file_link)

        logging.debug("new timestamp {}".format(new_last_time))
        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        with open(filelist_file, 'w') as fl_handle:
            for fl in self._file_links:
                base_link = fl.link
                try:
                    # Resolve the redirect to the file's final location.
                    fl.link = requests.get(
                        fl.link, allow_redirects=False).headers['location']
                except KeyError:
                    # Sometimes Thingiverse just gives us the direct link
                    # the first time. Not sure why.
                    pass
                fl_handle.write("{},{},{}, {}\n".format(
                    fl.link, fl.name, fl.last_update, base_link))
        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link.name)
            new_file = os.path.join(self.download_dir, file_link.name)
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(
                        file_link.name))
                new_file_links.append(file_link)
        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = os.path.join(self.download_dir, file_link.name)
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = requests.get(file_link.link)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(
                file_link.name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED
        # People like images. But this doesn't work yet.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.basename(imagelink)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(imagelink)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED
        # Instructions are good too.
        logging.info("Downloading readme")
        try:
            # Assumption: the thing's page is still loaded in BROWSER, so we
            # can parse its description meta tag out of the page source.
            soup = BeautifulSoup(BROWSER.page_source, features='lxml')
            readme_txt = soup.find('meta', property='og:description')['content']
            with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))
        # Best get some licenses
        logging.info("Downloading license")
        try:
            if self._license:
                with open(os.path.join(self.download_dir, 'license.txt'), 'w') as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))
        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time.__str__())
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
        return State.OK
def do_batch(batch_file, download_dir, quick):
    """ Read a file in line by line, parsing each as a set of calls to this script."""
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
578 """ Entry point for script being run as a command. """
579 parser
= argparse
.ArgumentParser()
580 parser
.add_argument("-l", "--log-level", choices
=[
581 'debug', 'info', 'warning'], default
='info', help="level of logging desired")
582 parser
.add_argument("-d", "--directory",
583 help="Target directory to download into")
584 parser
.add_argument("-f", "--log-file",
585 help="Place to log debug information to")
586 parser
.add_argument("-q", "--quick", action
="store_true",
587 help="Assume date ordering on posts")
589 subparsers
= parser
.add_subparsers(
590 help="Type of thing to download", dest
="subcommand")
591 collection_parser
= subparsers
.add_parser(
592 'collection', help="Download one or more entire collection(s)")
593 collection_parser
.add_argument(
594 "owner", help="The owner of the collection(s) to get")
595 collection_parser
.add_argument(
596 "collections", nargs
="+", help="Space seperated list of the name(s) of collection to get")
597 thing_parser
= subparsers
.add_parser(
598 'thing', help="Download a single thing.")
599 thing_parser
.add_argument(
600 "things", nargs
="*", help="Space seperated list of thing ID(s) to download")
601 user_parser
= subparsers
.add_parser(
602 "user", help="Download all things by one or more users")
603 user_parser
.add_argument(
604 "users", nargs
="+", help="A space seperated list of the user(s) to get the designs of")
605 batch_parser
= subparsers
.add_parser(
606 "batch", help="Perform multiple actions written in a text file")
607 batch_parser
.add_argument(
608 "batch_file", help="The name of the file to read.")
609 subparsers
.add_parser("version", help="Show the current version")
611 args
= parser
.parse_args()
612 if not args
.subcommand
:
615 if not args
.directory
:
616 args
.directory
= os
.getcwd()
618 logger
= logging
.getLogger()
619 formatter
= logging
.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
620 logger
.setLevel(logging
.DEBUG
)
621 console_handler
= logging
.StreamHandler()
622 console_handler
.setLevel(args
.log_level
.upper())
624 logger
.addHandler(console_handler
)
626 file_handler
= logging
.FileHandler(args
.log_file
)
627 file_handler
.setLevel(logging
.DEBUG
)
628 file_handler
.setFormatter(formatter
)
629 logger
.addHandler(file_handler
)
    # Start the downloader processes.
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory)
                   for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick)

    # Stop the downloader processes.
    for downloader in downloaders:
        thing_queue.put(None)


if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()
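# Example invocations (usernames, collection names and ids hypothetical):
#   python thingy_grabber.py -d ./downloads thing 4733281
#   python thingy_grabber.py -q user some_user
#   python thingy_grabber.py collection some_user "cool things"
#   python thingy_grabber.py batch batch.txt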