#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
from shutil import copyfile
from bs4 import BeautifulSoup

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
USER_COLLECTION = URL_BASE + "/ajax/user/designs"

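# These regexes pull values out of the JSON blob embedded in a grouping
# page. The fragment they match looks roughly like this (values
# illustrative):
#   "id":1234,"total":27,"last_page":3,"per_page":12,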
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12; if it ever changes, the paging logic
# below will break.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

VERSION = "0.5.1"

def strip_ws(value):
    """ Remove whitespace from a string """
    return str(NO_WHITESPACE_REGEX.sub('-', value))

def slugify(value):
    """
    Normalises a string: strips accents, removes non-alphanumeric
    characters, and converts spaces to hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode(
        'ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value

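# Worked examples for the helpers above (values illustrative):
#   strip_ws("my cool collection")   -> "my-cool-collection"
#   slugify("Gell-Mann's Wobbler!")  -> "Gell-Manns-Wobbler"
# Note that slugify() does not lowercase its input.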

class Grouping:
    """ Holds details of a group of things for download.

    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.
    """

    def __init__(self):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
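        # Each card link has an href of the form "/thing:1234" (assumed
        # format), so the numeric id is the part after the colon.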
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small grouping (fewer than 13 items); pull the
            # list straight from this request.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
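        # Page through the ajax endpoint one page at a time; each POST
        # returns the HTML for a single page of (apparently) 12 cards.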
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Download all the things in the grouping. """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError(
                "No download_dir set - invalidly initialised object?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            logging.info("Downloading thing {}".format(idx))
            Thing(thing).download(self.download_dir)


class Collection(Grouping):
    """ Holds details of a collection. """

    def __init__(self, user, name, directory):
        Grouping.__init__(self)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
        self.collection_url = URL_COLLECTION


class Designs(Grouping):
    """ Holds details of all of a user's designs. """

    def __init__(self, user, directory):
        Grouping.__init__(self)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(self.user)))
        self.collection_url = USER_COLLECTION


class Thing:
    """ An individual design on Thingiverse. """

    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        req = requests.get(url)
        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')
        self.title = slugify(soup.find_all('h1')[0].text.strip())
        self.download_dir = os.path.join(base_dir, self.title)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            # Not yet downloaded
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old-style download directory without a timestamp file.
            logging.warning(
                "Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info(
                "Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp; let's see if there is anything new to get.
        file_links = soup.find_all('a', {'class': 'file-download'})
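        # The 'datetime' attributes appear to be ISO-format strings, so
        # plain string comparison orders them chronologically.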
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            logging.debug("Checking {} (updated {})".format(
                file_link["title"], timestamp))
            if timestamp > self.last_time:
                logging.info(
                    "Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._needs_download:
            logging.info("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # Edge case: old-style dir without a timestamp.
                logging.warning(
                    "Old style download dir found for {}".format(self.title))
                os.rename(self.download_dir,
                          "{}_old".format(self.download_dir))
            else:
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            new_last_time = file_links[0].find_all('time')[0]['datetime']
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Found file {} from {}".format(
                    file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Checking {} (updated {})".format(
                    file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning(
                    "Unable to find {} in old archive, redownloading".format(file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"])
                 for x in new_file_links]
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(file_links)))
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                logging.debug("Downloading {} from {} to {}".format(
                    name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # People like images
        image_dir = os.path.join(self.download_dir, 'images')
        imagelinks = soup.find_all('span', {'class': 'gallery-slider'})[0] \
            .find_all('div', {'class': 'gallery-photo'})
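        # Each gallery photo div carries the full-resolution image URL in
        # its 'data-full' attribute.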
        logging.info("Downloading {} images.".format(len(imagelinks)))
        try:
            os.mkdir(image_dir)
            for imagelink in imagelinks:
                url = imagelink['data-full']
                filename = os.path.basename(url)
                if filename.endswith('stl'):
                    filename = "{}.png".format(filename)
                image_req = requests.get(url)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))

def do_batch(batch_file, download_dir):
    """ Read a file line by line, parsing each line as an instruction for this script. """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")

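# A batch file contains one instruction per line, mirroring the CLI
# subcommands. For example (ids and names illustrative):
#   thing 1234567
#   collection some_user their-collection-name
#   user some_user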


def main():
    """ Entry point for the script when run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download an entire collection")
    collection_parser.add_argument(
        "owner", help="The owner of the collection to get")
    collection_parser.add_argument(
        "collection", help="The name of the collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by a user")
    user_parser.add_argument("user", help="The user to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()
    logging.basicConfig(level=getattr(logging, args.log_level.upper()))

    if args.subcommand.startswith("collection"):
        Collection(args.owner, args.collection, args.directory).download()
    if args.subcommand == "thing":
        Thing(args.thing).download(args.directory)
    if args.subcommand == "user":
        Designs(args.user, args.directory).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory)


if __name__ == "__main__":
    main()
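
# Example invocations (illustrative):
#   python3 thingy_grabber.py thing 1234567
#   python3 thingy_grabber.py -d ~/things collection some_user their-collection-name
#   python3 thingy_grabber.py --log-level debug user some_user
#   python3 thingy_grabber.py batch batch.txt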