Thingiverse bulk downloader
import re
import sys
import os
import argparse
import unicodedata
import logging

import requests

from shutil import copyfile
from bs4 import BeautifulSoup
# Thingiverse endpoints used for scraping.
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

# Regexes that pull grouping metadata out of the JSON embedded in the page.
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
# Collapses each run of whitespace and/or hyphens into a single hyphen.
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
def strip_ws(value):
    """ Remove whitespace from a string.

    Each run of whitespace/hyphens is collapsed into a single hyphen;
    used to build collection URLs from user-supplied names.
    """
    # NOTE(review): the def line is missing from this chunk; signature
    # reconstructed from the call site strip_ws(self.name) and the body.
    return str(NO_WHITESPACE_REGEX.sub('-', value))
def slugify(value):
    """
    Normalizes a string for use as a filesystem name: strips accents and
    non-alphanumeric characters, and converts whitespace runs to hyphens.
    (Note: does NOT lowercase - case is preserved.)
    """
    # Fold accented characters to their closest ASCII equivalent.
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    # Drop anything that is not a word character, whitespace or hyphen.
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    # Collapse whitespace/hyphen runs into single hyphens.
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    # NOTE(review): return reconstructed - callers use the result directly.
    return value
45 """ Holds details of a group of things for download
46 This is effectively (although not actually) an abstract class
47 - use Collection or Designs instead.
55 # These should be set by child classes.
57 self
.download_dir
= None
58 self
.collection_url
= None
60 def _get_small_grouping(self
, req
):
61 """ Handle small groupings """
62 soup
= BeautifulSoup(req
.text
, features
='lxml')
63 links
= soup
.find_all('a', {'class':'card-img-holder'})
64 self
.things
= [x
['href'].split(':')[1] for x
in links
]
65 self
.total
= len(self
.things
)
70 """ retrieve the things of the grouping. """
72 # We've already done it.
75 # Check for initialisation:
77 logging
.error("No URL set - object not initialised properly?")
78 raise ValueError("No URL set - object not initialised properly?")
80 # Get the internal details of the grouping.
81 logging
.debug("Querying {}".format(self
.url
))
82 c_req
= requests
.get(self
.url
)
83 total
= TOTAL_REGEX
.search(c_req
.text
)
85 # This is a small (<13) items grouping. Pull the list from this req.
86 return self
._get
_small
_grouping
(c_req
)
87 self
.total
= total
.groups()[0]
88 self
.req_id
= ID_REGEX
.search(c_req
.text
).groups()[0]
89 self
.last_page
= int(LAST_PAGE_REGEX
.search(c_req
.text
).groups()[0])
90 self
.per_page
= PER_PAGE_REGEX
.search(c_req
.text
).groups()[0]
97 for current_page
in range(1, self
.last_page
+ 1):
98 parameters
['page'] = current_page
99 req
= requests
.post(self
.collection_url
, parameters
)
100 soup
= BeautifulSoup(req
.text
, features
='lxml')
101 links
= soup
.find_all('a', {'class':'card-img-holder'})
102 self
.things
+= [x
['href'].split(':')[1] for x
in links
]
107 """ Downloads all the files in a collection """
111 if not self
.download_dir
:
112 raise ValueError("No download_dir set - invalidly initialised object?")
114 base_dir
= os
.getcwd()
116 os
.mkdir(self
.download_dir
)
117 except FileExistsError
:
118 logging
.info("Target directory {} already exists. Assuming a resume."
119 .format(self
.download_dir
))
120 logging
.info("Downloading {} thing(s).".format(self
.total
))
121 for idx
,thing
in enumerate(self
.things
):
122 logging
.info("Downloading thing {}".format(idx
))
123 Thing(thing
).download(self
.download_dir
)
class Collection(Grouping):
    """ Holds details of a collection. """
    def __init__(self, user, name, directory):
        """ Set up the URLs and download directory for one collection.

        :param user: owner of the collection on Thingiverse
        :param name: display name of the collection
        :param directory: parent directory to download into
        """
        Grouping.__init__(self)
        # NOTE(review): these two assignments are reconstructed - self.user
        # and self.name are read below but their assignment lines are missing
        # from this chunk.
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(
            directory, "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION
class Designs(Grouping):
    """ Holds details of all of a users' designs. """
    def __init__(self, user, directory):
        """ Set up the URLs and download directory for a user's designs.

        :param user: Thingiverse username
        :param directory: parent directory to download into
        """
        Grouping.__init__(self)
        # NOTE(review): reconstructed - self.user is read below but its
        # assignment line is missing from this chunk.
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION
147 """ An individual design on thingiverse. """
148 def __init__(self
, thing_id
):
149 self
.thing_id
= thing_id
150 self
.last_time
= None
152 self
._needs
_download
= True
155 self
.download_dir
= None
157 def _parse(self
, base_dir
):
158 """ Work out what, if anything needs to be done. """
162 url
= "{}/thing:{}/files".format(URL_BASE
, self
.thing_id
)
163 req
= requests
.get(url
)
165 soup
= BeautifulSoup(self
.text
, features
='lxml')
167 #code.interact(local=dict(globals(), **locals()))
168 self
.title
= slugify(soup
.find_all('h1')[0].text
.strip())
169 self
.download_dir
= os
.path
.join(base_dir
, self
.title
)
171 logging
.debug("Parsing {} ({})".format(self
.thing_id
, self
.title
))
173 if not os
.path
.exists(self
.download_dir
):
178 timestamp_file
= os
.path
.join(self
.download_dir
, 'timestamp.txt')
179 if not os
.path
.exists(timestamp_file
):
180 # Old download from before
181 logging
.warning("Old-style download directory found. Assuming update required.")
186 with
open(timestamp_file
, 'r') as timestamp_handle
:
187 self
.last_time
= timestamp_handle
.readlines()[0]
188 logging
.info("last downloaded version: {}".format(self
.last_time
))
189 except FileNotFoundError
:
190 # Not run on this thing before.
191 logging
.info("Old-style download directory found. Assuming update required.")
192 self
.last_time
= None
196 # OK, so we have a timestamp, lets see if there is anything new to get
197 file_links
= soup
.find_all('a', {'class':'file-download'})
198 for file_link
in file_links
:
199 timestamp
= file_link
.find_all('time')[0]['datetime']
200 logging
.debug("Checking {} (updated {})".format(file_link
["title"], timestamp
))
201 if timestamp
> self
.last_time
:
202 logging
.info("Found new/updated file {}".format(file_link
["title"]))
203 self
._needs
_download
= True
206 # Got here, so nope, no new files.
207 self
._needs
_download
= False
210 def download(self
, base_dir
):
211 """ Download all files for a given thing. """
213 self
._parse
(base_dir
)
215 if not self
._needs
_download
:
216 print("{} already downloaded - skipping.".format(self
.title
))
219 # Have we already downloaded some things?
220 timestamp_file
= os
.path
.join(self
.download_dir
, 'timestamp.txt')
222 if os
.path
.exists(self
.download_dir
):
223 if not os
.path
.exists(timestamp_file
):
224 # edge case: old style dir w/out timestamp.
225 logging
.warning("Old style download dir found for {}".format(self
.title
))
226 os
.rename(self
.download_dir
, "{}_old".format(self
.download_dir
))
228 prev_dir
= "{}_{}".format(self
.download_dir
, self
.last_time
)
229 os
.rename(self
.download_dir
, prev_dir
)
231 # Get the list of files to download
232 soup
= BeautifulSoup(self
.text
, features
='lxml')
233 file_links
= soup
.find_all('a', {'class':'file-download'})
239 if not self
.last_time
:
240 # If we don't have anything to copy from, then it is all new.
241 new_file_links
= file_links
242 new_last_time
= file_links
[0].find_all('time')[0]['datetime']
243 for file_link
in file_links
:
244 timestamp
= file_link
.find_all('time')[0]['datetime']
245 logging
.debug("Found file {} from {}".format(file_link
["title"], timestamp
))
246 if timestamp
> new_last_time
:
247 new_last_time
= timestamp
249 for file_link
in file_links
:
250 timestamp
= file_link
.find_all('time')[0]['datetime']
251 logging
.debug("Checking {} (updated {})".format(file_link
["title"], timestamp
))
252 if timestamp
> self
.last_time
:
253 new_file_links
.append(file_link
)
255 old_file_links
.append(file_link
)
256 if not new_last_time
or timestamp
> new_last_time
:
257 new_last_time
= timestamp
259 logging
.debug("new timestamp {}".format(new_last_time
))
261 # OK. Time to get to work.
262 logging
.debug("Generating download_dir")
263 os
.mkdir(self
.download_dir
)
264 # First grab the cached files (if any)
265 logging
.info("Copying {} unchanged files.".format(len(old_file_links
)))
266 for file_link
in old_file_links
:
267 old_file
= os
.path
.join(prev_dir
, file_link
["title"])
268 new_file
= os
.path
.join(self
.download_dir
, file_link
["title"])
270 logging
.debug("Copying {} to {}".format(old_file
, new_file
))
271 copyfile(old_file
, new_file
)
272 except FileNotFoundError
:
273 logging
.warning("Unable to find {} in old archive, redownloading".format(file_link
["title"]))
274 new_file_links
.append(file_link
)
276 # Now download the new ones
277 files
= [("{}{}".format(URL_BASE
, x
['href']), x
["title"]) for x
in new_file_links
]
278 logging
.info("Downloading {} new files of {}".format(len(new_file_links
), len(file_links
)))
280 for url
, name
in files
:
281 file_name
= os
.path
.join(self
.download_dir
, name
)
282 logging
.debug("Downloading {} from {} to {}".format(name
, url
, file_name
))
283 data_req
= requests
.get(url
)
284 with
open(file_name
, 'wb') as handle
:
285 handle
.write(data_req
.content
)
286 except Exception as exception
:
287 logging
.error("Failed to download {} - {}".format(name
, exception
))
288 os
.rename(self
.download_dir
, "{}_failed".format(self
.download_dir
))
292 image_dir
= os
.path
.join(self
.download_dir
, 'images')
293 imagelinks
= soup
.find_all('span', {'class':'gallery-slider'})[0] \
294 .find_all('div', {'class':'gallery-photo'})
295 logging
.info("Downloading {} images.".format(len(imagelinks
)))
298 for imagelink
in imagelinks
:
299 url
= imagelink
['data-full']
300 filename
= os
.path
.basename(url
)
301 if filename
.endswith('stl'):
302 filename
= "{}.png".format(filename
)
303 image_req
= requests
.get(url
)
304 with
open(os
.path
.join(image_dir
, filename
), 'wb') as handle
:
305 handle
.write(image_req
.content
)
306 except Exception as exception
:
307 print("Failed to download {} - {}".format(filename
, exception
))
308 os
.rename(self
.download_dir
, "{}_failed".format(self
.download_dir
))
315 # Now write the timestamp
316 with
open(timestamp_file
, 'w') as timestamp_handle
:
317 timestamp_handle
.write(new_last_time
)
318 except Exception as exception
:
319 print("Failed to write timestamp file - {}".format(exception
))
320 os
.rename(self
.download_dir
, "{}_failed".format(self
.download_dir
))
322 self
._needs
_download
= False
323 logging
.debug("Download of {} finished".format(self
.title
))
def do_batch(batch_file, download_dir):
    """ Read a file in line by line, parsing each as a set of calls to this script.

    :param batch_file: path of the instruction file; each line is
        "thing <id>", "collection <owner> <name>" or "user <name>"
    :param download_dir: directory passed through to the downloaders
    """
    # NOTE(review): the line loop and per-branch `continue`s are missing from
    # this chunk; reconstructed as an if/elif chain with the same behaviour.
    with open(batch_file) as handle:
        for line in handle:
            command_arr = line.split()
            # Skip blank lines rather than crashing on command_arr[0].
            if not command_arr:
                continue
            logging.info("Handling instruction {}".format(line))
            if command_arr[0] == "thing":
                logging.debug("Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
            elif command_arr[0] == "collection":
                logging.debug("Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2], download_dir).download()
            elif command_arr[0] == "user":
                logging.debug("Handling batch collection instruction: {}".format(line))
                Designs(command_arr[1], download_dir).download()
            else:
                logging.warning("Unable to parse current instruction. Skipping.")
def main():
    """ Entry point for script being run as a command. """
    # NOTE(review): the def line and the no-subcommand usage/exit lines are
    # missing from this chunk and are reconstructed.
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=['debug','info','warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory", help="Target directory to download into")
    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
    collection_parser.add_argument("owner", help="The owner of the collection to get")
    collection_parser.add_argument("collection", help="The name of the collection to get")
    thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")
    user_parser = subparsers.add_parser("user", help="Download all things by a user")
    user_parser.add_argument("user", help="The user to get the designs of")
    batch_parser = subparsers.add_parser("batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument("batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        # No subcommand given - nothing to do.
        parser.print_usage()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()
    logging.basicConfig(level=getattr(logging, args.log_level.upper()))

    # startswith() keeps the original behaviour of also matching
    # subcommand names that merely begin with "collection".
    if args.subcommand.startswith("collection"):
        Collection(args.owner, args.collection, args.directory).download()
    if args.subcommand == "thing":
        Thing(args.thing).download(args.directory)
    if args.subcommand == "user":
        Designs(args.user, args.directory).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory)
if __name__ == "__main__":
    # NOTE(review): the call is missing from this chunk; reconstructed as the
    # conventional entry-point invocation.
    main()