3 Thingiverse bulk downloader
13 import multiprocessing
15 from shutil
import copyfile
16 from bs4
import BeautifulSoup
17 from dataclasses
import dataclass
19 from selenium
import webdriver
20 from selenium
.webdriver
.common
.by
import By
21 from selenium
.webdriver
.support
.ui
import WebDriverWait
22 from selenium
.webdriver
.support
import expected_conditions
as EC
23 from selenium
.webdriver
.firefox
.options
import Options
# Thingiverse endpoints used throughout: URL_BASE for thing/file pages,
# and the two AJAX listing endpoints for collections and user designs.
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

# Regexes that scrape pagination metadata out of the JSON-ish payload
# embedded in the listing pages (see Grouping.get()).
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
# Collapses any run of whitespace and/or hyphens into a single hyphen;
# used by strip_ws() and slugify().
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
#BROWSER = webdriver.PhantomJS('./phantomjs')
# Module-level browser shared by PageChecker and Thing._parse().
# NOTE(review): `options` is constructed on lines not visible in this chunk —
# presumably a headless firefox Options() instance; confirm in the full file.
BROWSER = webdriver.Firefox(options=options)

BROWSER.set_window_size(1980, 1080)
class State(enum.Enum):
    # Result of Thing.download(): the thing was skipped because the local
    # copy is already up to date (used by Grouping.download() for --quick).
    # NOTE(review): additional members (success/failure states referenced
    # elsewhere as State.OK / State.FAILED) appear to be defined on lines
    # not visible in this chunk.
    ALREADY_DOWNLOADED = enum.auto()
63 """ Remove whitespace from a string """
64 return str(NO_WHITESPACE_REGEX
.sub('-', value
))
def slugify(value):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.
    """
    # NOTE(review): despite the docstring, the visible code never lowercases.
    # Fold to ASCII, dropping anything NFKD cannot decompose.
    value = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    # Strip everything that is not a word character, whitespace or hyphen.
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    # Collapse whitespace/hyphen runs into single hyphens.
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value
class PageChecker(object):
    """Callable predicate for WebDriverWait.

    As the thing page renders it records the model title and, once the
    expected number of file rows is present, the file row elements.

    NOTE(review): the try/except framing and the early/final return lines
    are missing from this chunk and are reconstructed conservatively —
    confirm against the full source.
    """

    def __init__(self):
        # Debug trail of what the checker saw on each poll.
        self.log = []
        # Model title, filled in on first successful poll.
        self.title = None
        # Number of files the page claims to have ("Thing Files" metric).
        self.file_count = None
        # The located file row elements, once all are present.
        self.files = None

    def __call__(self, _):
        try:
            self.log.append("call")
            if self.title is None:
                name = EC._find_element(
                    BROWSER, (By.CSS_SELECTOR, "[class^=ThingPage__modelName]"))
                if name is None:
                    return False
                self.title = name.text
            if self.file_count is None:
                # OK. Do we know how many files we have to download?
                metrics = EC._find_elements(
                    BROWSER, (By.CSS_SELECTOR, "[class^=MetricButton]"))
                self.log.append("got some metrics: {}".format(len(metrics)))
                cur_count = int([x.text.split("\n")[0]
                                 for x in metrics
                                 if x.text.endswith("\nThing Files")][0])
                self.log.append(cur_count)
                self.file_count = cur_count
            self.log.append("looking for {} files".format(self.file_count))
            file_rows = EC._find_elements(
                BROWSER, (By.CSS_SELECTOR, "[class^=ThingFile__fileRow]"))
            self.log.append("found {} files".format(len(file_rows)))
            if len(file_rows) >= self.file_count:
                self.files = file_rows
                return True
            return False
        except Exception:
            # Keep WebDriverWait polling on any transient lookup failure.
            return False
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """

    def __init__(self, thing_queue, download_directory):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory

    def run(self):
        """ actual download loop.
        """
        # NOTE(review): the loop framing (while/sentinel test) is missing
        # from this chunk and reconstructed — confirm against full source.
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                # Sentinel: shut this worker down.
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                break
            logging.info("Handling id {}".format(thing_id))
            Thing(thing_id).download(self.download_directory)
            self.thing_queue.task_done()
149 """ Holds details of a group of things for download
150 This is effectively (although not actually) an abstract class
151 - use Collection or Designs instead.
154 def __init__(self
, quick
):
160 # Should we stop downloading when we hit a known datestamp?
162 # These should be set by child classes.
164 self
.download_dir
= None
165 self
.collection_url
= None
167 def _get_small_grouping(self
, req
):
168 """ Handle small groupings """
169 soup
= BeautifulSoup(req
.text
, features
='lxml')
170 links
= soup
.find_all('a', {'class': 'card-img-holder'})
171 self
.things
= [x
['href'].split(':')[1] for x
in links
]
172 self
.total
= len(self
.things
)
177 """ retrieve the things of the grouping. """
179 # We've already done it.
182 # Check for initialisation:
184 logging
.error("No URL set - object not initialised properly?")
185 raise ValueError("No URL set - object not initialised properly?")
187 # Get the internal details of the grouping.
188 logging
.debug("Querying {}".format(self
.url
))
189 c_req
= requests
.get(self
.url
)
190 total
= TOTAL_REGEX
.search(c_req
.text
)
192 # This is a small (<13) items grouping. Pull the list from this req.
193 return self
._get
_small
_grouping
(c_req
)
194 self
.total
= total
.groups()[0]
195 self
.req_id
= ID_REGEX
.search(c_req
.text
).groups()[0]
196 self
.last_page
= int(LAST_PAGE_REGEX
.search(c_req
.text
).groups()[0])
197 self
.per_page
= PER_PAGE_REGEX
.search(c_req
.text
).groups()[0]
199 'base_url': self
.url
,
204 for current_page
in range(1, self
.last_page
+ 1):
205 parameters
['page'] = current_page
206 req
= requests
.post(self
.collection_url
, parameters
)
207 soup
= BeautifulSoup(req
.text
, features
='lxml')
208 links
= soup
.find_all('a', {'class': 'card-img-holder'})
209 self
.things
+= [x
['href'].split(':')[1] for x
in links
]
214 """ Downloads all the files in a collection """
218 if not self
.download_dir
:
220 "No download_dir set - invalidly initialised object?")
222 base_dir
= os
.getcwd()
224 os
.mkdir(self
.download_dir
)
225 except FileExistsError
:
226 logging
.info("Target directory {} already exists. Assuming a resume."
227 .format(self
.download_dir
))
228 logging
.info("Downloading {} thing(s).".format(self
.total
))
229 for idx
, thing
in enumerate(self
.things
):
230 logging
.info("Downloading thing {}".format(idx
))
231 RC
= Thing(thing
).download(self
.download_dir
)
232 if self
.quick
and RC
==State
.ALREADY_DOWNLOADED
:
233 logging
.info("Caught up, stopping.")
class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.name = name
        # Human-facing collection page; the AJAX endpoint does the listing.
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(
            directory, "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory, quick):
        Grouping.__init__(self, quick)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION
264 """ An individual design on thingiverse. """
266 def __init__(self
, thing_id
):
267 self
.thing_id
= thing_id
268 self
.last_time
= None
270 self
._needs
_download
= True
273 self
.download_dir
= None
275 def _parse(self
, base_dir
):
276 """ Work out what, if anything needs to be done. """
280 url
= "{}/thing:{}/files".format(URL_BASE
, self
.thing_id
)
283 wait
= WebDriverWait(BROWSER
, 20)
286 except requests
.exceptions
.ConnectionError
as error
:
287 logging
.error("Unable to connect for thing {}: {}".format(
288 self
.thing_id
, error
))
291 self
.title
= pc
.title
293 for link
in pc
.files
:
294 link_title
, link_details
, _
= link
.text
.split("\n")
295 #link_details we be something like '461 kb | Updated 06-11-2019 | 373 Downloads'
296 link_date
= link_details
.split("|")[1][10:-1]
297 link_link
= link
.find_element_by_xpath(".//a").get_attribute("href")
298 self
._file
_links
.append(FileLink(link_title
, link_date
, link_link
))
301 self
.old_download_dir
= os
.path
.join(base_dir
, self
.title
)
302 self
.download_dir
= os
.path
.join(base_dir
, "{} - {}".format(self
.thing_id
, self
.title
))
304 logging
.debug("Parsing {} ({})".format(self
.thing_id
, self
.title
))
306 if not os
.path
.exists(self
.download_dir
):
307 if os
.path
.exists(self
.old_download_dir
):
308 logging
.info("Found previous style download directory. Moving it")
309 copyfile(self
.old_download_dir
, self
.download_dir
)
315 timestamp_file
= os
.path
.join(self
.download_dir
, 'timestamp.txt')
316 if not os
.path
.exists(timestamp_file
):
317 # Old download from before
319 "Old-style download directory found. Assuming update required.")
324 with
open(timestamp_file
, 'r') as timestamp_handle
:
325 # add the .split(' ')[0] to remove the timestamp from the old style timestamps
326 self
.last_time
= timestamp_handle
.readlines()[0].split(' ')[0]
327 logging
.info("last downloaded version: {}".format(self
.last_time
))
328 except FileNotFoundError
:
329 # Not run on this thing before.
331 "Old-style download directory found. Assuming update required.")
332 self
.last_time
= None
333 self
._needs
_download
= True
337 # OK, so we have a timestamp, lets see if there is anything new to get
338 for file_link
in self
._file
_links
:
339 if file_link
.last_update
> self
.last_time
:
341 "Found new/updated file {}".format(file_link
["title"]))
342 self
._needs
_download
= True
346 # Got here, so nope, no new files.
347 self
._needs
_download
= False
350 def download(self
, base_dir
):
351 """ Download all files for a given thing.
352 Returns True iff the thing is now downloaded (not iff it downloads the thing!)
355 self
._parse
(base_dir
)
359 "Unable to parse {} - aborting download".format(self
.thing_id
))
362 if not self
._needs
_download
:
363 print("{} - {} already downloaded - skipping.".format(self
.thing_id
, self
.title
))
364 return State
.ALREADY_DOWNLOADED
366 # Have we already downloaded some things?
367 timestamp_file
= os
.path
.join(self
.download_dir
, 'timestamp.txt')
369 if os
.path
.exists(self
.download_dir
):
370 if not os
.path
.exists(timestamp_file
):
371 # edge case: old style dir w/out timestamp.
373 "Old style download dir found for {}".format(self
.title
))
375 target_dir
= "{}_old".format(self
.download_dir
)
376 while os
.path
.exists(target_dir
):
377 prev_count
= prev_count
+ 1
378 target_dir
= "{}_old_{}".format(self
.download_dir
, prev_count
)
379 os
.rename(self
.download_dir
, target_dir
)
381 prev_dir
= "{}_{}".format(self
.download_dir
, slugify(self
.last_time
))
382 os
.rename(self
.download_dir
, prev_dir
)
384 # Get the list of files to download
390 if not self
.last_time
:
391 # If we don't have anything to copy from, then it is all new.
392 logging
.debug("No last time, downloading all files")
393 new_file_links
= self
._file
_links
394 new_last_time
= new_file_links
[0].last_update
396 for file_link
in new_file_links
:
397 new_last_time
= max(new_last_time
, file_link
.last_update
)
398 logging
.debug("New timestamp will be {}".format(new_last_time
))
400 new_last_time
= self
.last_time
401 for file_link
in self
._file
_links
:
402 if file_link
.last_update
> self
.last_time
:
403 new_file_links
.append(file_link
)
404 new_last_time
= max(new_last_time
, file_link
.last_update
)
406 old_file_links
.append(file_link
)
408 logging
.debug("new timestamp {}".format(new_last_time
))
410 # OK. Time to get to work.
411 logging
.debug("Generating download_dir")
412 os
.mkdir(self
.download_dir
)
413 filelist_file
= os
.path
.join(self
.download_dir
, "filelist.txt")
414 with
open(filelist_file
, 'w') as fl_handle
:
415 for fl
in self
._file
_links
:
418 fl
.link
=requests
.get(fl
.link
, allow_redirects
=False).headers
['location']
420 logging
.warn("Unable to get actual target for {}".format(base_link
))
422 fl_handle
.write("{},{},{}\n".format(fl
.link
, fl
.name
, fl
.last_update
, base_link
))
425 # First grab the cached files (if any)
426 logging
.info("Copying {} unchanged files.".format(len(old_file_links
)))
427 for file_link
in old_file_links
:
428 old_file
= os
.path
.join(prev_dir
, file_link
.name
)
429 new_file
= os
.path
.join(self
.download_dir
, file_link
.name
)
431 logging
.debug("Copying {} to {}".format(old_file
, new_file
))
432 copyfile(old_file
, new_file
)
433 except FileNotFoundError
:
435 "Unable to find {} in old archive, redownloading".format(file_link
["title"]))
436 new_file_links
.append(file_link
)
438 # Now download the new ones
439 logging
.info("Downloading {} new files of {}".format(
440 len(new_file_links
), len(self
._file
_links
)))
442 for file_link
in new_file_links
:
443 file_name
= os
.path
.join(self
.download_dir
, file_link
.name
)
444 logging
.debug("Downloading {} from {} to {}".format(
445 file_link
.name
, file_link
.link
, file_name
))
446 data_req
= requests
.get(file_link
.link
)
447 with
open(file_name
, 'wb') as handle
:
448 handle
.write(data_req
.content
)
449 except Exception as exception
:
450 logging
.error("Failed to download {} - {}".format(file_link
.name
, exception
))
451 os
.rename(self
.download_dir
, "{}_failed".format(self
.download_dir
))
456 # People like images. But this doesn't work yet.
457 image_dir = os.path.join(self.download_dir, 'images')
458 imagelinks = soup.find_all('span', {'class': 'gallery-slider'})[0] \
459 .find_all('div', {'class': 'gallery-photo'})
460 logging.info("Downloading {} images.".format(len(imagelinks)))
463 for imagelink in imagelinks:
464 url = next(filter(None, [imagelink[x] for x in ['data-full',
467 'data-thumb']]), None)
470 "Unable to find any urls for {}".format(imagelink))
473 filename = os.path.basename(url)
474 if filename.endswith('stl'):
475 filename = "{}.png".format(filename)
476 image_req = requests.get(url)
477 with open(os.path.join(image_dir, filename), 'wb') as handle:
478 handle.write(image_req.content)
479 except Exception as exception:
480 print("Failed to download {} - {}".format(filename, exception))
481 os.rename(self.download_dir, "{}_failed".format(self.download_dir))
484 # instructions are good too.
485 logging.info("Downloading readme")
487 readme_txt = soup.find('meta', property='og:description')[
489 with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
490 readme_handle.write("{}\n".format(readme_txt))
491 except (TypeError, KeyError) as exception:
492 logging.warning("No readme? {}".format(exception))
493 except IOError as exception:
494 logging.warning("Failed to write readme! {}".format(exception))
496 # Best get some licenses
497 logging.info("Downloading license")
499 license_txt = soup.find('div', {'class': 'license-text'}).text
501 with open(os.path.join(self.download_dir, 'license.txt'), 'w') as license_handle:
502 license_handle.write("{}\n".format(license_txt))
503 except AttributeError as exception:
504 logging.warning("No license? {}".format(exception))
505 except IOError as exception:
506 logging.warning("Failed to write license! {}".format(exception))
509 # Now write the timestamp
510 with
open(timestamp_file
, 'w') as timestamp_handle
:
511 timestamp_handle
.write(new_last_time
)
512 except Exception as exception
:
513 print("Failed to write timestamp file - {}".format(exception
))
514 os
.rename(self
.download_dir
, "{}_failed".format(self
.download_dir
))
516 self
._needs
_download
= False
517 logging
.debug("Download of {} finished".format(self
.title
))
def do_batch(batch_file, download_dir, quick):
    """ Read a file in line by line, parsing each as a set of calls to this script."""
    with open(batch_file) as handle:
        for line in handle:
            # NOTE(review): the per-line framing (strip / empty-line skip /
            # continue placement) is reconstructed; those lines are missing
            # from this chunk — confirm against the full source.
            line = line.strip()
            if not line:
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick).download()
                continue
            if command_arr[0] == "user":
                # Fixed copy-paste defect: this branch previously logged
                # "Handling batch collection instruction" for user lines.
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
551 """ Entry point for script being run as a command. """
552 parser
= argparse
.ArgumentParser()
553 parser
.add_argument("-l", "--log-level", choices
=[
554 'debug', 'info', 'warning'], default
='info', help="level of logging desired")
555 parser
.add_argument("-d", "--directory",
556 help="Target directory to download into")
557 parser
.add_argument("-f", "--log-file",
558 help="Place to log debug information to")
559 parser
.add_argument("-q", "--quick", action
="store_true",
560 help="Assume date ordering on posts")
562 subparsers
= parser
.add_subparsers(
563 help="Type of thing to download", dest
="subcommand")
564 collection_parser
= subparsers
.add_parser(
565 'collection', help="Download one or more entire collection(s)")
566 collection_parser
.add_argument(
567 "owner", help="The owner of the collection(s) to get")
568 collection_parser
.add_argument(
569 "collections", nargs
="+", help="Space seperated list of the name(s) of collection to get")
570 thing_parser
= subparsers
.add_parser(
571 'thing', help="Download a single thing.")
572 thing_parser
.add_argument(
573 "things", nargs
="*", help="Space seperated list of thing ID(s) to download")
574 user_parser
= subparsers
.add_parser(
575 "user", help="Download all things by one or more users")
576 user_parser
.add_argument(
577 "users", nargs
="+", help="A space seperated list of the user(s) to get the designs of")
578 batch_parser
= subparsers
.add_parser(
579 "batch", help="Perform multiple actions written in a text file")
580 batch_parser
.add_argument(
581 "batch_file", help="The name of the file to read.")
582 subparsers
.add_parser("version", help="Show the current version")
584 args
= parser
.parse_args()
585 if not args
.subcommand
:
588 if not args
.directory
:
589 args
.directory
= os
.getcwd()
591 logger
= logging
.getLogger()
592 formatter
= logging
.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
593 logger
.setLevel(logging
.DEBUG
)
594 console_handler
= logging
.StreamHandler()
595 console_handler
.setLevel(args
.log_level
.upper())
597 logger
.addHandler(console_handler
)
599 file_handler
= logging
.FileHandler(args
.log_file
)
600 file_handler
.setLevel(logging
.DEBUG
)
601 file_handler
.setFormatter(formatter
)
602 logger
.addHandler(file_handler
)
606 thing_queue
= multiprocessing
.JoinableQueue()
607 logging
.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT
))
608 downloaders
= [Downloader(thing_queue
, args
.directory
) for _
in range(DOWNLOADER_COUNT
)]
609 for downloader
in downloaders
:
613 if args
.subcommand
.startswith("collection"):
614 for collection
in args
.collections
:
615 Collection(args
.owner
, collection
, args
.directory
, args
.quick
).download()
616 if args
.subcommand
== "thing":
617 for thing
in args
.things
:
618 thing_queue
.put(thing
)
619 if args
.subcommand
== "user":
620 for user
in args
.users
:
621 Designs(user
, args
.directory
, args
.quick
).download()
622 if args
.subcommand
== "version":
623 print("thingy_grabber.py version {}".format(VERSION
))
624 if args
.subcommand
== "batch":
625 do_batch(args
.batch_file
, args
.directory
, args
.quick
)
627 # Stop the downloader processes
628 for downloader
in downloaders
:
629 thing_queue
.put(None)
631 if __name__
== "__main__":
632 multiprocessing
.freeze_support()