"""Thingiverse bulk downloader."""
import argparse
import os
import re
import sys
import unicodedata
from shutil import copyfile

import requests
from bs4 import BeautifulSoup
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"

# Regexes that pull paging metadata out of the JSON-ish blob embedded in the
# grouping page response.
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
# Collapses runs of hyphens/whitespace into one token (used by strip_ws/slugify).
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# Toggled by the -v/--verbose command line flag in main().
VERBOSE = False
def strip_ws(value):
    """ Remove whitespace from a string """
    # Runs of whitespace and/or hyphens become a single hyphen.
    return str(NO_WHITESPACE_REGEX.sub('-', value))
def slugify(value):
    """
    Normalizes string, removes non-alpha characters,
    and converts spaces to hyphens.

    Strips accents via NFKD -> ASCII, drops anything that is not a word
    character, whitespace or hyphen, then collapses whitespace/hyphen runs
    to single hyphens.

    NOTE(review): no lowercasing is performed, despite what older comments
    claimed - case is preserved.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value
class Grouping:
    """ Holds details of a group of things.
    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.
    """
    def __init__(self):
        # Filled in lazily by get().
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These two should be set by child classes.
        self.url = None
        self.download_dir = None
57 def _get_small_grouping(self
, req
):
58 """ Handle small groupings """
59 soup
= BeautifulSoup(req
.text
, features
='lxml')
60 links
= soup
.find_all('a', {'class':'card-img-holder'})
61 self
.things
= [x
['href'].split(':')[1] for x
in links
]
66 """ retrieve the things of the grouping. """
68 # We've already done it.
71 # Check for initialisation:
73 print("No URL set - object not initialised properly?")
74 raise ValueError("No URL set - object not initialised properly?")
76 # Get the internal details of the grouping.
78 print("Querying {}".format(self
.url
))
79 c_req
= requests
.get(self
.url
)
80 total
= TOTAL_REGEX
.search(c_req
.text
)
82 # This is a small (<13) items grouping. Pull the list from this req.
83 return self
._get
_small
_grouping
(c_req
)
84 self
.total
= total
.groups()[0]
85 self
.req_id
= ID_REGEX
.search(c_req
.text
).groups()[0]
86 self
.last_page
= int(LAST_PAGE_REGEX
.search(c_req
.text
).groups()[0])
87 self
.per_page
= PER_PAGE_REGEX
.search(c_req
.text
).groups()[0]
94 for current_page
in range(1, self
.last_page
+ 1):
95 parameters
['page'] = current_page
96 req
= requests
.post(URL_COLLECTION
, parameters
)
97 soup
= BeautifulSoup(req
.text
, features
='lxml')
98 links
= soup
.find_all('a', {'class':'card-img-holder'})
99 self
.things
+= [x
['href'].split(':')[1] for x
in links
]
104 """ Downloads all the files in a collection """
108 if not self
.download_dir
:
109 raise ValueError("No download_dir set - invalidly initialised object?")
111 base_dir
= os
.getcwd()
113 os
.mkdir(self
.download_dir
)
114 except FileExistsError
:
115 print("Target directory {} already exists. Assuming a resume."
116 .format(self
.download_dir
))
117 for thing
in self
.things
:
118 Thing(thing
).download(self
.download_dir
)
class Collection(Grouping):
    """ Holds details of a collection. """
    def __init__(self, user, name):
        Grouping.__init__(self)
        self.user = user
        self.name = name
        # The URL keeps the name's case/punctuation (strip_ws only), while the
        # local directory name is fully slugified.
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(os.getcwd(),
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
class Designs(Grouping):
    """ Holds details of all of a users' designs. """
    def __init__(self, user):
        Grouping.__init__(self)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(os.getcwd(), "{} designs".format(slugify(self.user)))
class Thing:
    """ An individual design on thingiverse. """
    def __init__(self, thing_id):
        self.thing_id = thing_id
        # ISO timestamp string of the previous download, read from
        # timestamp.txt by _parse(); None means never downloaded.
        self.last_time = None
        # NOTE(review): restored from a mangled paste - _parsed guards
        # against re-fetching the page in _parse()/download().
        self._parsed = False
        self._needs_download = True
        # Raw HTML of the thing's files page, cached by _parse().
        self.text = None
        self.title = None
        self.download_dir = None
150 def _parse(self
, base_dir
):
151 """ Work out what, if anything needs to be done. """
155 url
= "{}/thing:{}/files".format(URL_BASE
, self
.thing_id
)
156 req
= requests
.get(url
)
158 soup
= BeautifulSoup(self
.text
, features
='lxml')
160 self
.title
= slugify(soup
.find_all('h1')[0].text
.strip())
161 self
.download_dir
= os
.path
.join(base_dir
, self
.title
)
163 if not os
.path
.exists(self
.download_dir
):
168 timestamp_file
= os
.path
.join(self
.download_dir
, 'timestamp.txt')
169 if not os
.path
.exists(timestamp_file
):
170 # Old download from before
172 print("Old-style download directory found. Assuming update required.")
177 with
open(timestamp_file
, 'r') as timestamp_handle
:
178 self
.last_time
= timestamp_handle
.readlines()[0]
180 print("last downloaded version: {}".format(self
.last_time
))
181 except FileNotFoundError
:
182 # Not run on this thing before.
184 print("Old-style download directory found. Assuming update required.")
185 self
.last_time
= None
189 # OK, so we have a timestamp, lets see if there is anything new to get
190 file_links
= soup
.find_all('a', {'class':'file-download'})
191 for file_link
in file_links
:
192 timestamp
= file_link
.find_all('time')[0]['datetime']
194 print("Checking {} (updated {})".format(file_link
["title"], timestamp
))
195 if timestamp
> self
.last_time
:
196 print("Found new/updated file {}".format(file_link
["title"]))
197 self
._needs
_download
= True
200 # Got here, so nope, no new files.
201 print("Found no new files for {}".format(self
.title
))
202 self
._needs
_download
= False
205 def download(self
, base_dir
):
206 """ Download all files for a given thing. """
208 self
._parse
(base_dir
)
210 if not self
._needs
_download
:
212 print("{} already downloaded - skipping.".format(self
.title
))
215 # Have we already downloaded some things?
216 timestamp_file
= os
.path
.join(self
.download_dir
, 'timestamp.txt')
218 if os
.path
.exists(self
.download_dir
):
219 if not os
.path
.exists(timestamp_file
):
220 # edge case: old style dir w/out timestamp.
221 print("Old style download dir found for {}".format(self
.title
))
222 os
.rename(self
.download_dir
, "{}_old".format(self
.download_dir
))
224 prev_dir
= "{}_{}".format(self
.download_dir
, self
.last_time
)
225 os
.rename(self
.download_dir
, prev_dir
)
227 # Get the list of files to download
228 soup
= BeautifulSoup(self
.text
, features
='lxml')
229 file_links
= soup
.find_all('a', {'class':'file-download'})
235 if not self
.last_time
:
236 # If we don't have anything to copy from, then it is all new.
237 new_file_links
= file_links
238 new_last_time
= file_links
[0].find_all('time')[0]['datetime']
239 for file_link
in file_links
:
240 timestamp
= file_link
.find_all('time')[0]['datetime']
242 print("Found file {} from {}".format(file_link
["title"], timestamp
))
243 if timestamp
> new_last_time
:
244 new_last_time
= timestamp
246 for file_link
in file_links
:
247 timestamp
= file_link
.find_all('time')[0]['datetime']
249 print("Checking {} (updated {})".format(file_link
["title"], timestamp
))
250 if timestamp
> self
.last_time
:
251 new_file_links
.append(file_link
)
253 old_file_links
.append(file_link
)
254 if not new_last_time
or timestamp
> new_last_time
:
255 new_last_time
= timestamp
258 print("new timestamp {}".format(new_last_time
))
260 # OK. Time to get to work.
261 os
.mkdir(self
.download_dir
)
262 # First grab the cached files (if any)
263 for file_link
in old_file_links
:
264 old_file
= os
.path
.join(prev_dir
, file_link
["title"])
265 new_file
= os
.path
.join(self
.download_dir
, file_link
["title"])
268 print("Copying {} to {}".format(old_file
, new_file
))
269 copyfile(old_file
, new_file
)
270 except FileNotFoundError
:
271 print("Unable to find {} in old archive, redownloading".format(file_link
["title"]))
272 new_file_links
.append(file_link
)
274 # Now download the new ones
275 files
= [("{}{}".format(URL_BASE
, x
['href']), x
["title"]) for x
in new_file_links
]
277 for url
, name
in files
:
278 file_name
= os
.path
.join(self
.download_dir
, name
)
280 print("Downloading {} from {} to {}".format(name
, url
, file_name
))
281 data_req
= requests
.get(url
)
282 with
open(file_name
, 'wb') as handle
:
283 handle
.write(data_req
.content
)
284 except Exception as exception
:
285 print("Failed to download {} - {}".format(name
, exception
))
286 os
.rename(self
.download_dir
, "{}_failed".format(self
.download_dir
))
290 # Now write the timestamp
291 with
open(timestamp_file
, 'w') as timestamp_handle
:
292 timestamp_handle
.write(new_last_time
)
293 except Exception as exception
:
294 print("Failed to write timestamp file - {}".format(exception
))
295 os
.rename(self
.download_dir
, "{}_failed".format(self
.download_dir
))
297 self
._needs
_download
= False
299 print("Download of {} finished".format(self
.title
))
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
    collection_parser.add_argument("owner", help="The owner of the collection to get")
    collection_parser.add_argument("collection", help="The name of the collection to get")
    thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")
    user_parser = subparsers.add_parser("user", help="Download all things by a user")
    user_parser.add_argument("user", help="The user to get the designs of")

    args = parser.parse_args()
    if not args.subcommand:
        # No subcommand given: show usage and bail out.
        parser.print_help()
        sys.exit(1)

    global VERBOSE
    VERBOSE = args.verbose
    # startswith() so "collection" and "collections" both work.
    if args.subcommand.startswith("collection"):
        collection = Collection(args.owner, args.collection)
        print(collection.get())
        collection.download()
    if args.subcommand == "thing":
        Thing(args.thing).download(os.getcwd())
    if args.subcommand == "user":
        designs = Designs(args.user)
        print(designs.get())
        designs.download()
if __name__ == "__main__":
    main()