587d47ef9aae203b56d56de9d8f4bea9f717eeaf
3 Thingiverse bulk downloader
11 from bs4
import BeautifulSoup
# Root of the Thingiverse site; all thing/collection URLs are built from this.
URL_BASE = "https://www.thingiverse.com"
# AJAX endpoint used to page through the things in a large collection.
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"

# Regexes that scrape values out of the JSON blob embedded in a collection page.
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
# Collapses any run of hyphens/whitespace into a single hyphen (see slugify).
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
def strip_ws(value):
    """ Remove whitespace from a string.

    Runs of whitespace (and hyphens) are replaced by a single hyphen,
    matching how Thingiverse forms collection URLs.
    """
    # NOTE(review): the original `def` line was lost in extraction; the name
    # and single-argument signature are taken from the call site in
    # Collection.get_collection() — confirm against upstream.
    return str(NO_WHITESPACE_REGEX.sub('-', value))
def slugify(value):
    """
    Normalizes a string for use as a file/directory name: converts to
    ASCII, removes non-word characters, and converts runs of whitespace
    and hyphens to single hyphens.

    Note: despite resembling Django's slugify, this does NOT lowercase the
    value (the previous docstring claimed it did); the original casing is
    preserved and existing download directories depend on that.
    """
    # Decompose accented characters (NFKD) and drop anything outside ASCII.
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    # Strip everything except word characters, whitespace and hyphens.
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    # Collapse whitespace/hyphen runs to a single hyphen (same pattern as
    # the module-level NO_WHITESPACE_REGEX).
    value = str(re.sub(r'[-\s]+', '-', value))
    # NOTE(review): the `def` line and trailing `return` were lost in
    # extraction; reconstructed from the call sites, which use the result.
    return value
41 """ Holds details of a collection. """
42 def __init__(self
, user
, name
):
51 def _get_small_collection(self
, req
):
52 """ Handle small collections """
53 soup
= BeautifulSoup(req
.text
, features
='lxml')
54 links
= soup
.find_all('a', {'class':'card-img-holder'})
55 self
.things
= [x
['href'].split(':')[1] for x
in links
]
59 def get_collection(self
):
60 """ retrieve the things of the collection. """
62 # We've already done it.
65 # Get the internal details of the collection.
66 c_url
= "{}/{}/collections/{}".format(URL_BASE
, self
.user
, strip_ws(self
.name
))
68 print("Querying {}".format(c_url
))
69 c_req
= requests
.get(c_url
)
70 total
= TOTAL_REGEX
.search(c_req
.text
)
72 # This is a small (<13) items collection. Pull the list from this req.
73 return self
._get
_small
_collection
(c_req
)
74 self
.total
= total
.groups()[0]
75 self
.req_id
= ID_REGEX
.search(c_req
.text
).groups()[0]
76 self
.last_page
= int(LAST_PAGE_REGEX
.search(c_req
.text
).groups()[0])
77 self
.per_page
= PER_PAGE_REGEX
.search(c_req
.text
).groups()[0]
79 'base_url':"{}/collections/{}".format(self
.user
, self
.name
),
84 for current_page
in range(1, self
.last_page
+ 1):
85 parameters
['page'] = current_page
86 req
= requests
.post(URL_COLLECTION
, parameters
)
87 soup
= BeautifulSoup(req
.text
, features
='lxml')
88 links
= soup
.find_all('a', {'class':'card-img-holder'})
89 self
.things
+= [x
['href'].split(':')[1] for x
in links
]
94 """ Downloads all the files in a collection """
97 base_dir
= os
.getcwd()
98 new_dir
= "{}-{}".format(slugify(self
.user
), slugify(self
.name
))
99 target_dir
= os
.path
.join(base_dir
, new_dir
)
102 except FileExistsError
:
103 print("Target directory {} already exists. Assuming a resume.".format(new_dir
))
105 for thing
in self
.things
:
106 download_thing(thing
)
def download_thing(thing):
    """ Downloads all the files for a given thing.

    Creates (or resumes) a directory named after the thing's title under
    the current working directory, downloads any files newer than the
    recorded timestamp, and updates timestamp.txt. On download failure the
    directory is renamed with a `_failed` suffix.

    :param thing: the numeric thing id (as a string).
    """
    file_url = "{}/thing:{}/files".format(URL_BASE, thing)
    file_req = requests.get(file_url)
    file_soup = BeautifulSoup(file_req.text, features='lxml')

    title = slugify(file_soup.find_all('h1')[0].text.strip())
    base_dir = os.getcwd()
    try:
        os.mkdir(title)
    except FileExistsError:
        # Resuming a previous run; the timestamp check below handles it.
        pass

    print("Downloading {} ({})".format(thing, title))
    os.chdir(title)

    # Last successfully downloaded version, if we've run on this thing before.
    last_time = None
    try:
        with open('timestamp.txt', 'r') as fh:
            last_time = fh.readlines()[0]
        # NOTE(review): these prints were probably gated on a VERBOSE flag
        # in the original (lines lost in extraction) — confirm upstream.
        print("last downloaded version: {}".format(last_time))
    except FileNotFoundError:
        # Not run on this thing before.
        print('Directory for thing already exists, checking for update.')

    file_links = file_soup.find_all('a', {'class': 'file-download'})
    new_last_time = last_time
    new_file_links = []
    for file_link in file_links:
        timestamp = file_link.find_all('time')[0]['datetime']
        print("Checking {} (updated {})".format(file_link["title"], timestamp))
        # Keep only files newer than the last recorded download.
        if not last_time or timestamp > last_time:
            new_file_links.append(file_link)
        # Track the newest timestamp seen across all files.
        if not new_last_time or timestamp > new_last_time:
            new_last_time = timestamp

    if last_time and new_last_time <= last_time:
        print("Thing already downloaded. Skipping.")
        # BUG FIX: previously fell through and re-wrote timestamp.txt even
        # though there is nothing to fetch; restore cwd and stop here.
        os.chdir(base_dir)
        return

    files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]

    name = None  # BUG FIX: the except handler below printed `name`, which
    # was unbound if the failure happened before the first loop iteration.
    try:
        for url, name in files:
            print("Downloading {} from {}".format(name, url))
            data_req = requests.get(url)
            with open(name, 'wb') as handle:
                handle.write(data_req.content)
        # now write timestamp
        if new_last_time:
            # BUG FIX: guard — fh.write(None) raised TypeError when the
            # thing had no timestamped files at all.
            with open('timestamp.txt', 'w') as fh:
                fh.write(new_last_time)
    except Exception as exception:
        print("Failed to download {} - {}".format(name, exception))
        # Leave the partial directory marked as failed so a rerun retries it.
        os.chdir(base_dir)
        os.rename(title, "{}_failed".format(title))
        return

    os.chdir(base_dir)
def main():
    """ Entry point for script being run as a command.

    Parses the owner/collection arguments, fetches the collection's thing
    list and downloads every thing in it.
    """
    # NOTE(review): the `def` line and the `global` statement were lost in
    # extraction; VERBOSE is assigned here and read as a module-level flag,
    # so the global declaration is required — confirm against upstream.
    global VERBOSE

    parser = argparse.ArgumentParser()
    parser.add_argument("owner", help="The owner of the collection to get")
    parser.add_argument("collection", help="The name of the collection to get")
    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    args = parser.parse_args()

    VERBOSE = args.verbose

    collection = Collection(args.owner, args.collection)
    print(collection.get_collection())
    collection.download()
187 if __name__
== "__main__":