765b564807b303390259148e29b705870a6a0213
"""Thingiverse bulk downloader."""
import argparse
import enum
import logging
import multiprocessing
import os
import re
import sys
import unicodedata
from shutil import copyfile

import requests
from bs4 import BeautifulSoup
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

# Regexes that pull pagination metadata out of the JSON embedded in the page.
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# NOTE(review): these two are referenced by main() but their definitions were
# not visible in this chunk — placeholder values, confirm against history.
DOWNLOADER_COUNT = multiprocessing.cpu_count()
VERSION = "unknown"


class State(enum.Enum):
    """Outcome of a Thing download attempt."""
    # NOTE(review): only ALREADY_DOWNLOADED was visible in this chunk; OK and
    # FAILED are needed as success/failure results — confirm member names.
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()


def strip_ws(value):
    """ Remove whitespace from a string """
    return str(NO_WHITESPACE_REGEX.sub('-', value))


def slugify(value):
    """
    Normalise a string to a filesystem-safe slug: strip accents, drop
    characters other than word chars / whitespace / hyphens, and collapse
    whitespace and hyphen runs to single hyphens.

    Note: does NOT lowercase, despite the classic slugify recipe.
    """
    value = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.
    """

    def __init__(self, thing_queue, download_directory):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        # Queue of thing ids to fetch; a None entry is the shutdown sentinel.
        self.thing_queue = thing_queue
        self.download_directory = download_directory

    def run(self):
        """ actual download loop.
        """
        while True:
            thing_id = self.thing_queue.get()
            # NOTE(review): sentinel branch reconstructed — main() puts None
            # on the queue to stop each worker.
            if thing_id is None:
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                break
            logging.info("Handling id {}".format(thing_id))
            Thing(thing_id).download(self.download_directory)
            self.thing_queue.task_done()
class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self, quick):
        # NOTE(review): initialiser list partially reconstructed — the fields
        # below are all read elsewhere in this class; confirm defaults.
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # Should we stop downloading when we hit a known datestamp?
        self.quick = quick
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)
        # Return the list so get()'s small-grouping path returns things,
        # not None.
        return self.things

    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            # NOTE(review): remaining keys reconstructed from the ajax
            # endpoint's pagination loop below — confirm.
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]
        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {}".format(idx))
            RC = Thing(thing).download(self.download_dir)
            # In quick mode an already-downloaded thing means we have caught
            # up with the previous run: stop early.
            if self.quick and RC == State.ALREADY_DOWNLOADED:
                logging.info("Caught up, stopping.")
                return
class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory, quick):
        Grouping.__init__(self, quick)
        # NOTE(review): these two assignments were not visible but are
        # required by the format strings below — confirm.
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory, quick):
        Grouping.__init__(self, quick)
        # NOTE(review): this assignment was not visible but is required by
        # the format strings below — confirm.
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION
class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        # NOTE(review): the _parsed/text/title initialisers were not visible
        # in this chunk but are read by _parse()/download() — confirm.
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            req = requests.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')

        try:
            self.title = slugify(soup.find_all('h1')[0].text.strip())
        except IndexError:
            logging.warning(
                "No title found for thing {}".format(self.thing_id))
            self.title = self.thing_id

        if req.status_code == 404:
            logging.warning(
                "404 for thing {} - DMCA or invalid number?".format(self.thing_id))
            return

        if req.status_code > 299:
            logging.warning(
                "bad status code {} for thing {} - try again later?".format(req.status_code, self.thing_id))
            return

        # Pre-0.x layout used just the title; current layout is "id - title".
        self.old_download_dir = os.path.join(base_dir, self.title)
        self.download_dir = os.path.join(
            base_dir, "{} - {}".format(self.thing_id, self.title))

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            if os.path.exists(self.old_download_dir):
                logging.info("Found previous style download directory. Moving it")
                # BUG FIX: copyfile() cannot copy a directory (raises
                # IsADirectoryError); rename actually moves it, matching the
                # log message.
                os.rename(self.old_download_dir, self.download_dir)
            else:
                # Not downloaded yet - everything is new.
                self._parsed = True
                return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were recorded.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp, lets see if there is anything new to get
        file_links = soup.find_all('a', {'class': 'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            logging.debug("Checking {} (updated {})".format(
                file_link["title"], timestamp))
            if timestamp > self.last_time:
                logging.info(
                    "Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing.

        Returns a State value: ALREADY_DOWNLOADED when nothing needed doing,
        FAILED on error, OK on success.
        """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED

        if not self._needs_download:
            print("{} - {} already downloaded - skipping.".format(self.thing_id, self.title))
            return State.ALREADY_DOWNLOADED

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                logging.warning(
                    "Old style download dir found for {}".format(self.title))
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                prev_dir = "{}_{}".format(self.download_dir, slugify(self.last_time))
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            # BUG FIX: replaced a bare except: that dropped into an
            # interactive code.interact() debugger; a single scan handles the
            # empty-list case gracefully too.
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Found file {} from {}".format(
                    file_link["title"], timestamp))
                if new_last_time is None or timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Checking {} (updated {})".format(
                    file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"])
                 for x in new_file_links]
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(file_links)))
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                logging.debug("Downloading {} from {} to {}".format(
                    name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED

        # People like images.
        image_dir = os.path.join(self.download_dir, 'images')
        imagelinks = soup.find_all('span', {'class': 'gallery-slider'})[0] \
                         .find_all('div', {'class': 'gallery-photo'})
        logging.info("Downloading {} images.".format(len(imagelinks)))
        try:
            os.mkdir(image_dir)
            for imagelink in imagelinks:
                # NOTE(review): only 'data-full' and 'data-thumb' were visible
                # in this chunk; intermediate size attributes may also belong
                # in this preference list — confirm.
                url = next(filter(None, [imagelink[x] for x in ['data-full',
                                                                'data-thumb']]), None)
                if not url:
                    logging.warning(
                        "Unable to find any urls for {}".format(imagelink))
                    continue

                filename = os.path.basename(url)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(url)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED

        # instructions are good too.
        logging.info("Downloading readme")
        try:
            # NOTE(review): attribute name reconstructed — og:description
            # meta tags carry the text in 'content'; confirm.
            readme_txt = soup.find('meta', property='og:description')[
                'content']
            with open(os.path.join(self.download_dir, 'readme.txt'), 'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        # Best get some licenses
        logging.info("Downloading license")
        try:
            license_txt = soup.find('div', {'class': 'license-text'}).text
            with open(os.path.join(self.download_dir, 'license.txt'), 'w') as license_handle:
                license_handle.write("{}\n".format(license_txt))
        except AttributeError as exception:
            logging.warning("No license? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
        return State.OK
def do_batch(batch_file, download_dir, quick):
    """ Read a file in line by line, parsing each as a set of calls to this script."""
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip blank lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick).download()
                continue
            if command_arr[0] == "user":
                # BUG FIX: previously logged "collection" for user
                # instructions (copy-paste error).
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")

    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space seperated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space seperated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space seperated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        # NOTE(review): original no-subcommand path not visible — print usage
        # and exit; confirm.
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    # Root logger at DEBUG; the console handler filters to the requested
    # level while an optional file handler captures everything.
    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())
    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Start the downloader worker processes.
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory)
                   for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection,
                       args.directory, args.quick).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick)

    # Stop the downloader processes
    for downloader in downloaders:
        thing_queue.put(None)


if __name__ == "__main__":
    main()