"""
Thingiverse bulk downloader
"""
import re
import sys
import os
import argparse
import unicodedata

import requests
from shutil import copyfile
from bs4 import BeautifulSoup
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes it would screw up the rest.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# Set from the -v/--verbose flag in main().
VERBOSE = False
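
# Illustrative only: the regexes above scrape values out of a JSON blob embedded in the
# listing page, which looks roughly like {"id":1234567,"total":42,"last_page":4,"per_page":12,...}.
# The field names come from the regexes; the example values here are made up.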
29 """ Remove whitespace from a string """
30 return str(NO_WHITESPACE_REGEX
.sub('-', value
))

def slugify(value):
    """
    Normalizes a string: removes non-alphanumeric characters
    and converts whitespace to hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    #value = str(re.sub(r'[-\s]+', '-', value))
    return value
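
# For example (illustrative input, not taken from the site):
#   slugify("Benchy Boat (v2)!") returns "Benchy-Boat-v2"
# Punctuation is dropped and whitespace becomes hyphens.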
44 """ Holds details of a group of things.
45 This is effectively (although not actually) an abstract class
46 - use Collection or Designs instead.
54 # These should be set by child classes.
56 self
.download_dir
= None
57 self
.collection_url
= None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]
        return self.things
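
    # Each matched link's href appears to be of the form "/thing:<numeric id>", so
    # splitting on ':' and taking the second part yields the thing id. This is inferred
    # from the parsing code above rather than documented by Thingiverse.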
68 """ retrieve the things of the grouping. """
70 # We've already done it.
73 # Check for initialisation:
75 print("No URL set - object not initialised properly?")
76 raise ValueError("No URL set - object not initialised properly?")
78 # Get the internal details of the grouping.
80 print("Querying {}".format(self
.url
))
81 c_req
= requests
.get(self
.url
)
82 total
= TOTAL_REGEX
.search(c_req
.text
)
84 # This is a small (<13) items grouping. Pull the list from this req.
85 return self
._get
_small
_grouping
(c_req
)
86 self
.total
= total
.groups()[0]
87 self
.req_id
= ID_REGEX
.search(c_req
.text
).groups()[0]
88 self
.last_page
= int(LAST_PAGE_REGEX
.search(c_req
.text
).groups()[0])
89 self
.per_page
= PER_PAGE_REGEX
.search(c_req
.text
).groups()[0]
96 for current_page
in range(1, self
.last_page
+ 1):
97 parameters
['page'] = current_page
98 req
= requests
.post(self
.collection_url
, parameters
)
99 soup
= BeautifulSoup(req
.text
, features
='lxml')
100 links
= soup
.find_all('a', {'class':'card-img-holder'})
101 self
.things
+= [x
['href'].split(':')[1] for x
in links
]
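
    # The listing endpoint is paged (apparently 12 items per page, per PER_PAGE_REGEX),
    # which is why get() walks every page from 1 to last_page and accumulates the ids.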
106 """ Downloads all the files in a collection """
110 if not self
.download_dir
:
111 raise ValueError("No download_dir set - invalidly initialised object?")
113 base_dir
= os
.getcwd()
115 os
.mkdir(self
.download_dir
)
116 except FileExistsError
:
117 print("Target directory {} already exists. Assuming a resume."
118 .format(self
.download_dir
))
120 print("Downloading {} things.".format(self
.total
))
121 for thing
in self
.things
:
122 Thing(thing
).download(self
.download_dir
)

class Collection(Grouping):
    """ Holds details of a collection. """
    def __init__(self, user, name):
        Grouping.__init__(self)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(os.getcwd(),
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION

class Designs(Grouping):
    """ Holds details of all of a user's designs. """
    def __init__(self, user):
        Grouping.__init__(self)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(os.getcwd(), "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION
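
# Programmatic use is a thin wrapper around these classes; for example (names are
# illustrative and network access is required):
#   Collection("some_user", "Favourite Prints").download()
#   Designs("some_user").download()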
146 """ An individual design on thingiverse. """
147 def __init__(self
, thing_id
):
148 self
.thing_id
= thing_id
149 self
.last_time
= None
151 self
._needs
_download
= True
154 self
.download_dir
= None

    def _parse(self, base_dir):
        """ Work out what, if anything needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        req = requests.get(url)
        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')

        self.title = slugify(soup.find_all('h1')[0].text.strip())
        self.download_dir = os.path.join(base_dir, self.title)

        if not os.path.exists(self.download_dir):
            # Never downloaded before, so everything needs fetching.
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were recorded.
            if VERBOSE:
                print("Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            if VERBOSE:
                print("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            if VERBOSE:
                print("Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp, let's see if there is anything new to get.
        file_links = soup.find_all('a', {'class': 'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            if VERBOSE:
                print("Checking {} (updated {})".format(file_link["title"], timestamp))
            if timestamp > self.last_time:
                print("Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return

        # Got here, so nope, no new files.
        print("Found no new files for {}".format(self.title))
        self._needs_download = False
        self._parsed = True
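
    # Note: timestamps are compared as strings. That is only reliable if the page's
    # <time datetime="..."> values are in a lexically sortable format such as ISO-8601,
    # which Thingiverse appears to use; this is an assumption, not a documented guarantee.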

    def download(self, base_dir):
        """ Download all files for a given thing. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._needs_download:
            if VERBOSE:
                print("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                print("Old style download dir found for {}".format(self.title))
                os.rename(self.download_dir, "{}_old".format(self.download_dir))
            else:
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            new_last_time = file_links[0].find_all('time')[0]['datetime']
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Found file {} from {}".format(file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Checking {} (updated {})".format(file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        if VERBOSE:
            print("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        os.mkdir(self.download_dir)

        # First grab the cached files (if any)
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                if VERBOSE:
                    print("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                print("Unable to find {} in old archive, redownloading".format(file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                if VERBOSE:
                    print("Downloading {} from {} to {}".format(name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # Now write the timestamp
        try:
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        self._needs_download = False
        if VERBOSE:
            print("Download of {} finished".format(self.title))
308 """ Entry point for script being run as a command. """
309 parser
= argparse
.ArgumentParser()
310 parser
.add_argument("-v", "--verbose", help="Be more verbose", action
="store_true")
311 subparsers
= parser
.add_subparsers(help="Type of thing to download", dest
="subcommand")
312 collection_parser
= subparsers
.add_parser('collection', help="Download an entire collection")
313 collection_parser
.add_argument("owner", help="The owner of the collection to get")
314 collection_parser
.add_argument("collection", help="The name of the collection to get")
315 thing_parser
= subparsers
.add_parser('thing', help="Download a single thing.")
316 thing_parser
.add_argument("thing", help="Thing ID to download")
317 user_parser
= subparsers
.add_parser("user", help="Download all things by a user")
318 user_parser
.add_argument("user", help="The user to get the designs of")
320 args
= parser
.parse_args()
321 if not args
.subcommand
:
325 VERBOSE
= args
.verbose
326 if args
.subcommand
.startswith("collection"):
327 collection
= Collection(args
.owner
, args
.collection
)
328 print(collection
.get())
329 collection
.download()
330 if args
.subcommand
== "thing":
331 Thing(args
.thing
).download(os
.getcwd())
332 if args
.subcommand
== "user":
333 designs
= Designs(args
.user
)
339 if __name__
== "__main__":