9c84e4106289705cbfdb403d1e238a55435da26c
3 Thingiverse bulk downloader
13 from shutil
import copyfile
14 from bs4
import BeautifulSoup
# Endpoints on thingiverse used by the downloader.
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

# Regexes that pull pagination metadata out of the JSON-ish ajax responses.
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
# Collapses runs of whitespace and/or hyphens into a single hyphen.
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
def strip_ws(value):
    """ Remove whitespace from a string """
    # Same pattern as NO_WHITESPACE_REGEX: runs of whitespace/hyphens -> '-'.
    return str(re.sub(r'[-\s]+', '-', value))
def slugify(value):
    """
    Normalizes a string for use as a file/directory name: NFKD-normalizes
    unicode, drops non-ASCII characters, removes anything that is not a
    word character / whitespace / hyphen, and collapses runs of
    whitespace and hyphens into single hyphens.

    NOTE(review): the old docstring claimed this also lowercases the
    value - it does not, and directory names built from it preserve case.
    """
    value = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    # Same pattern as NO_WHITESPACE_REGEX.
    value = str(re.sub(r'[-\s]+', '-', value))
    return value
class Grouping:
    """ Holds details of a group of things for download
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self):
        # Thing IDs discovered for this grouping, and how many there are.
        self.things = []
        self.total = 0
        # Pagination metadata scraped from the first ajax response.
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        # Small groupings list everything on one page: pull the thing IDs
        # straight out of the card links ("…/thing:12345").
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)
        return self.things

    def get(self):
        """ retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        # NOTE(review): parameter names reconstructed from the pagination
        # metadata above - confirm against the live ajax endpoint.
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id,
        }
        # Walk every page of the ajax listing, accumulating thing IDs.
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {}".format(idx))
            Thing(thing).download(self.download_dir)
class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory):
        Grouping.__init__(self)
        self.user = user
        self.name = name
        # Public page for the collection; whitespace in the name becomes '-'.
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        # Local target: "<user>-<collection>" under the chosen directory.
        self.download_dir = os.path.join(
            directory, "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION
class Designs(Grouping):
    """ Holds details of all of a users' designs. """

    def __init__(self, user, directory):
        Grouping.__init__(self)
        self.user = user
        # Public designs page for the user.
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        # Local target: "<user> designs" under the chosen directory.
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION
class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        # ISO timestamp of the newest file we already hold (from timestamp.txt).
        self.last_time = None
        # Set True by _parse() once state has been successfully worked out.
        self._parsed = False
        self._needs_download = True
        # Raw HTML of the thing's files page, filled in by _parse().
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        try:
            req = requests.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return

        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')

        try:
            self.title = slugify(soup.find_all('h1')[0].text.strip())
        except IndexError:
            logging.warning("No title found for thing {}".format(self.thing_id))
            # Fall back to the numeric ID as the directory name.
            self.title = self.thing_id

        if req.status_code == 404:
            logging.warning(
                "404 for thing {} - DMCA or invalid number?".format(self.thing_id))
            return

        if req.status_code > 299:
            logging.warning(
                "bad status code {} for thing {} - try again later?".format(
                    req.status_code, self.thing_id))
            return

        # Old layout used just the title; new layout is "<id> - <title>".
        self.old_download_dir = os.path.join(base_dir, self.title)
        # BUGFIX: was '" - ".format(...)' - a format string with no
        # placeholders, so every thing mapped to the same " - " directory.
        self.download_dir = os.path.join(
            base_dir, "{} - {}".format(self.thing_id, self.title))

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            if os.path.exists(self.old_download_dir):
                logging.info("Found previous style download directory. Moving it")
                # BUGFIX: copyfile() cannot copy a directory (raises
                # IsADirectoryError); rename moves it as the log says.
                os.rename(self.old_download_dir, self.download_dir)
            else:
                # Never downloaded before - nothing more to check.
                self._parsed = True
                return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were recorded.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp, lets see if there is anything new to get
        file_links = soup.find_all('a', {'class': 'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            logging.debug("Checking {} (updated {})".format(
                file_link["title"], timestamp))
            # ISO-8601 timestamps compare correctly as strings.
            if timestamp > self.last_time:
                logging.info(
                    "Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return

        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return

        if not self._needs_download:
            print("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                logging.warning(
                    "Old style download dir found for {}".format(self.title))
                # BUGFIX: prev_count was used before assignment; find the
                # first free "<dir>_old[_N]" name.
                prev_count = 0
                target_dir = "{}_old".format(self.download_dir)
                while os.path.exists(target_dir):
                    prev_count = prev_count + 1
                    target_dir = "{}_old_{}".format(self.download_dir, prev_count)
                os.rename(self.download_dir, target_dir)
            else:
                # Move the old version aside so unchanged files can be copied.
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            # BUGFIX: replaced a bare 'except:' that dropped into an
            # interactive code.interact() debugger session.
            if not file_links:
                logging.warning(
                    "No files found for thing {}".format(self.thing_id))
            else:
                new_last_time = file_links[0].find_all('time')[0]['datetime']
                for file_link in file_links:
                    timestamp = file_link.find_all('time')[0]['datetime']
                    logging.debug("Found file {} from {}".format(
                        file_link["title"], timestamp))
                    if timestamp > new_last_time:
                        new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Checking {} (updated {})".format(
                    file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(
                        file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"])
                 for x in new_file_links]
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(file_links)))
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                logging.debug("Downloading {} from {} to {}".format(
                    name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(name, exception))
            # Mark the partial directory as failed so a rerun starts fresh.
            os.rename(self.download_dir,
                      "{}_failed".format(self.download_dir))
            return

        # Grab the images for the thing as well.
        image_dir = os.path.join(self.download_dir, 'images')
        imagelinks = soup.find_all('span', {'class': 'gallery-slider'})[0] \
                         .find_all('div', {'class': 'gallery-photo'})
        logging.info("Downloading {} images.".format(len(imagelinks)))
        try:
            os.mkdir(image_dir)
            for imagelink in imagelinks:
                # Prefer the largest available rendition.
                # NOTE(review): intermediate size keys reconstructed - confirm.
                url = next(filter(None, [imagelink[x] for x in ['data-full',
                                                                'data-large',
                                                                'data-med',
                                                                'data-thumb']]), None)
                if not url:
                    logging.warning(
                        "Unable to find any urls for {}".format(imagelink))
                    continue

                filename = os.path.basename(url)
                if filename.endswith('stl'):
                    # STL previews are actually PNG renders.
                    filename = "{}.png".format(filename)
                image_req = requests.get(url)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir,
                      "{}_failed".format(self.download_dir))
            return

        # instructions are good too.
        logging.info("Downloading readme")
        try:
            readme_txt = soup.find('meta', property='og:description')['content']
            with open(os.path.join(self.download_dir, 'readme.txt'),
                      'w') as readme_handle:
                readme_handle.write("{}\n".format(readme_txt))
        except (TypeError, KeyError) as exception:
            logging.warning("No readme? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))

        # Best get some licenses
        logging.info("Downloading license")
        try:
            license_txt = soup.find('div', {'class': 'license-text'}).text
            with open(os.path.join(self.download_dir, 'license.txt'),
                      'w') as license_handle:
                license_handle.write("{}\n".format(license_txt))
        except AttributeError as exception:
            logging.warning("No license? {}".format(exception))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))

        try:
            # Now write the timestamp
            # Guard: new_last_time can be None when no files were listed.
            if new_last_time:
                with open(timestamp_file, 'w') as timestamp_handle:
                    timestamp_handle.write(new_last_time)
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir,
                      "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
def do_batch(batch_file, download_dir):
    """ Read a file in line by line, parsing each as a set of calls to this script."""
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir).download()
                continue
            if command_arr[0] == "user":
                # BUGFIX: log message previously said "collection" here.
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
def main():
    """ Entry point for script being run as a command. """
    # Build the CLI: global options plus one subcommand per download mode.
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space seperated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space seperated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space seperated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        # No subcommand given: show usage and bail out.
        parser.print_usage()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    # Root logger captures everything; handlers filter per destination.
    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())
    logger.addHandler(console_handler)
    if args.log_file:
        # Full debug output always goes to the file when one is given.
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory).download()
    if args.subcommand == "thing":
        for thing in args.things:
            Thing(thing).download(args.directory)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory)
# Script entry point guard: run main() only when executed directly.
if __name__ == "__main__":
    main()