#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
from shutil import copyfile
from bs4 import BeautifulSoup

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12; if it ever changes, the pagination
# logic in Grouping.get() will break.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

VERSION = "0.5.1"


def strip_ws(value):
    """ Replace runs of whitespace (or hyphens) with a single hyphen. """
    return str(NO_WHITESPACE_REGEX.sub('-', value))


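# Illustrative examples: strip_ws("my stuff") -> "my-stuff", while
# slugify below also drops punctuation: slugify("My Stuff!") -> "My-Stuff"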
def slugify(value):
    """
    Normalizes a string: removes non-alphanumeric characters and
    converts whitespace to hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value


class Grouping:
    """ Holds details of a group of things for download.
        This is effectively (although not actually) an abstract class
        - use Collection or Designs instead.
    """

    def __init__(self):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings. """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
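        # Each link's href appears to have the form "/thing:<id>" (matching the
        # URL built in Thing._parse), so splitting on ':' yields the numeric id.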
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small grouping (fewer than 13 items); pull the list
            # directly from this request.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
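        # The collection endpoint is paged: POST the grouping's id and a page
        # number to fetch each page in turn (per_page appears fixed at 12).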
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection. """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - object not initialised properly?")

        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {} (id {})".format(idx, thing))
            Thing(thing).download(self.download_dir)



class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory):
        Grouping.__init__(self)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(
            directory, "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION


class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory):
        Grouping.__init__(self)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION


class Thing:
    """ An individual design on Thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        req = requests.get(url)
        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')
        self.title = slugify(soup.find_all('h1')[0].text.strip())
        self.download_dir = os.path.join(base_dir, self.title)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            # Not yet downloaded
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were recorded.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp; let's see if there is anything new to get.
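        # The datetimes are ISO-8601-style strings, so (assuming a consistent
        # format on the page) plain string comparison orders them chronologically.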
        file_links = soup.find_all('a', {'class': 'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            logging.debug("Checking {} (updated {})".format(
                file_link["title"], timestamp))
            if timestamp > self.last_time:
                logging.info(
                    "Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._needs_download:
            print("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # Edge case: old-style dir without a timestamp file.
                logging.warning(
                    "Old style download dir found for {}".format(self.title))
                os.rename(self.download_dir,
                          "{}_old".format(self.download_dir))
            else:
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download.
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

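        # new_last_time tracks the newest datetime seen across all the files;
        # it is written out as timestamp.txt below so the next run can diff
        # against it.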
        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            new_last_time = file_links[0].find_all('time')[0]['datetime']
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Found file {} from {}".format(
                    file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Checking {} (updated {})".format(
                    file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        # First grab the cached files (if any).
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(
                        file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones.
        files = [("{}{}".format(URL_BASE, x['href']), x["title"])
                 for x in new_file_links]
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(file_links)))
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                logging.debug("Downloading {} from {} to {}".format(
                    name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # People like images.
        image_dir = os.path.join(self.download_dir, 'images')
        imagelinks = soup.find_all('span', {'class': 'gallery-slider'})[0] \
                         .find_all('div', {'class': 'gallery-photo'})
        logging.info("Downloading {} images.".format(len(imagelinks)))
        filename = None  # Defined up front so the except clause can report it.
        try:
            os.mkdir(image_dir)
            for imagelink in imagelinks:
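                # Prefer the largest rendition that is present, falling back
                # through progressively smaller ones.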
                url = next(filter(None, [imagelink.get(x) for x in ['data-full',
                                                                    'data-large',
                                                                    'data-medium',
                                                                    'data-thumb']]), None)
                if not url:
                    logging.warning(
                        "Unable to find any urls for {}".format(imagelink))
                    continue

                filename = os.path.basename(url)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(url)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error(
                "Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        try:
            # Now write the timestamp.
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            logging.error(
                "Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))


def do_batch(batch_file, download_dir):
    """ Read a batch file line by line, running each line as an instruction. """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip blank lines.
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")


def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=['debug', 'info', 'warning'],
                        default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+",
        help="Space-separated list of the name(s) of the collection(s) to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download one or more things.")
    thing_parser.add_argument(
        "things", nargs="*",
        help="Space-separated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+",
        help="Space-separated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()
    logging.basicConfig(level=getattr(logging, args.log_level.upper()))

    if args.subcommand == "collection":
        for collection in args.collections:
            Collection(args.owner, collection, args.directory).download()
    if args.subcommand == "thing":
        for thing in args.things:
            Thing(thing).download(args.directory)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory)


if __name__ == "__main__":
    main()