#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
from shutil import copyfile

import requests
from bs4 import BeautifulSoup

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes it will break the paging logic below.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
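# The first four regexes scrape values out of the JSON blob embedded in a
# grouping page. An illustrative fragment they match against (not a verbatim
# capture from Thingiverse):
#   ..."id":1234567,"total":42,"last_page":4,"per_page":12,...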

VERSION = "0.4.0"

VERBOSE = False

def strip_ws(value):
    """ Collapse runs of whitespace (and hyphens) into single hyphens. """
    return str(NO_WHITESPACE_REGEX.sub('-', value))

def slugify(value):
    """
    Normalise a string: strip accents, drop non-alphanumeric characters,
    and convert runs of whitespace to hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value

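# Illustrative slugify behaviour (hypothetical input, worked through the rules above):
#   slugify("Björk's Model #1")  ->  "Bjorks-Model-1"
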
class Grouping:
    """ Holds details of a group of things for download.
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """
    def __init__(self):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle groupings small enough to fit on a single page. """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        # hrefs take the form ".../thing:1234567"; the text after the colon is the thing id.
        self.things = [x['href'].split(':')[1] for x in links]

        return self.things

    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        if VERBOSE:
            print("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13 items) grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            # Use the page size the server reported rather than assuming 12.
            'per_page': self.per_page,
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Download all the files in a collection. """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError("No download_dir set - object not initialised properly?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            print("Target directory {} already exists. Assuming a resume."
                  .format(self.download_dir))
        if VERBOSE:
            print("Downloading {} things.".format(self.total))
        for thing in self.things:
            Thing(thing).download(self.download_dir)

class Collection(Grouping):
    """ Holds details of a collection. """
    def __init__(self, user, name, directory):
        Grouping.__init__(self)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(
            directory, "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION

class Designs(Grouping):
    """ Holds details of all of a user's designs. """
    def __init__(self, user, directory):
        Grouping.__init__(self)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION

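# Example (hypothetical) programmatic use of the grouping classes:
#   designs = Designs("someuser", os.getcwd())
#   print(designs.get())   # list of thing ids
#   designs.download()     # fetches everything into "./someuser designs"
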
class Thing:
    """ An individual design on thingiverse. """
    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        req = requests.get(url)
        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')

        self.title = slugify(soup.find_all('h1')[0].text.strip())
        self.download_dir = os.path.join(base_dir, self.title)

        if not os.path.exists(self.download_dir):
            # Not yet downloaded.
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Pre-timestamp download directory from an older version.
            if VERBOSE:
                print("Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                # strip() guards against any stray trailing newline in the file.
                self.last_time = timestamp_handle.readlines()[0].strip()
            if VERBOSE:
                print("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # The timestamp file vanished between the check above and the open.
            if VERBOSE:
                print("Missing timestamp file. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

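        # The datetime attributes appear to be ISO-8601 strings, so the plain
        # string comparisons below order them chronologically.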
        # OK, so we have a timestamp; let's see if there is anything new to get.
        file_links = soup.find_all('a', {'class': 'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            if VERBOSE:
                print("Checking {} (updated {})".format(file_link["title"], timestamp))
            if timestamp > self.last_time:
                print("Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        print("Found no new files for {}".format(self.title))
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._needs_download:
            if VERBOSE:
                print("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # Edge case: old-style directory without a timestamp.
                print("Old style download dir found for {}".format(self.title))
                os.rename(self.download_dir, "{}_old".format(self.download_dir))
            else:
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download.
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})
        if not file_links:
            # Edge case: a thing with no downloadable files would otherwise
            # crash on file_links[0] below.
            print("No files found for {} - skipping.".format(self.title))
            return

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            new_last_time = file_links[0].find_all('time')[0]['datetime']
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Found file {} from {}".format(file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Checking {} (updated {})".format(file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        if VERBOSE:
            print("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        os.mkdir(self.download_dir)
        # First grab the cached files (if any).
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                if VERBOSE:
                    print("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                print("Unable to find {} in old archive, redownloading".format(file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones.
        files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                if VERBOSE:
                    print("Downloading {} from {} to {}".format(name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # Now write the timestamp.
        try:
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        if VERBOSE:
            print("Download of {} finished".format(self.title))

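# Example (hypothetical) single-thing use:
#   Thing("1234567").download(os.getcwd())
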
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    parser.add_argument("-d", "--directory", help="Target directory to download into")
    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
    collection_parser.add_argument("owner", help="The owner of the collection to get")
    collection_parser.add_argument("collection", help="The name of the collection to get")
    thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")
    user_parser = subparsers.add_parser("user", help="Download all things by a user")
    user_parser.add_argument("user", help="The user to get the designs of")
    subparsers.add_parser("version", help="Show the current version")

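    # Example invocations (hypothetical user/collection/thing values):
    #   thingy_grabber.py -v -d ~/things collection someuser "favourite prints"
    #   thingy_grabber.py thing 1234567
    #   thingy_grabber.py user someuser
    #   thingy_grabber.py version
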
    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    global VERBOSE
    VERBOSE = args.verbose
    if args.subcommand == "collection":
        collection = Collection(args.owner, args.collection, args.directory)
        print(collection.get())
        collection.download()
    elif args.subcommand == "thing":
        Thing(args.thing).download(args.directory)
    elif args.subcommand == "user":
        designs = Designs(args.user, args.directory)
        print(designs.get())
        designs.download()
    elif args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))

if __name__ == "__main__":
    main()