Add batch support
[clinton/thingy_grabber.git] / thingy_grabber.py
CommitLineData
975060c9
OM
1#!/usr/bin/env python3
2"""
3Thingiverse bulk downloader
4"""
5
6import re
4a98996b 7import sys
975060c9
OM
8import os
9import argparse
10import unicodedata
11import requests
fa2f3251 12import logging
3c82f75b 13from shutil import copyfile
975060c9
OM
14from bs4 import BeautifulSoup
15
16URL_BASE = "https://www.thingiverse.com"
17URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
948bd56f 18USER_COLLECTION = URL_BASE + "/ajax/user/designs"
975060c9
OM
19
20ID_REGEX = re.compile(r'"id":(\d*),')
21TOTAL_REGEX = re.compile(r'"total":(\d*),')
22LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
23# This appears to be fixed at 12, but if it changes would screw the rest up.
24PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
dd8c35f4
OM
25NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
26
db8066ec
OM
27VERSION = "0.4.0"
28
dd8c35f4
OM
def strip_ws(value):
    """ Collapse each run of whitespace (or hyphens) in value into a single hyphen. """
    return str(re.sub(r'[-\s]+', '-', value))
975060c9
OM
32
def slugify(value):
    """
    Normalise a string for use as a file/directory name: fold it to ASCII,
    strip characters that are not word characters, whitespace or hyphens,
    then collapse whitespace runs into single hyphens.
    """
    ascii_value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    cleaned = str(re.sub(r'[^\w\s-]', '', ascii_value).strip())
    return str(re.sub(r'[-\s]+', '-', cleaned))
43
class Grouping:
    """ Holds details of a group of things for download.

    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.
    """
    def __init__(self):
        self.things = []       # thing ids discovered by get()
        self.total = 0         # size of the grouping as reported by the site
        self.req_id = None     # internal thingiverse id used for paged requests
        self.last_page = 0     # number of result pages to walk
        self.per_page = None   # page size reported by the site (expected to be 12)
        # These should be set by child classes.
        self.url = None
        self.download_dir = None
        self.collection_url = None

    def _get_small_grouping(self, req):
        """ Handle small groupings (a single page with no pagination metadata). """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        # hrefs look like ".../thing:12345" - keep the id after the colon.
        self.things = [x['href'].split(':')[1] for x in links]
        self.total = len(self.things)

        return self.things

    def get(self):
        """ Retrieve the thing ids of the grouping, caching the result.

        Raises ValueError if the subclass did not set self.url.
        """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            logging.error("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        logging.debug("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(self.collection_url, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection.

        Raises ValueError if the subclass did not set self.download_dir.
        """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError("No download_dir set - invalidly initialised object?")

        # BUG FIX: removed unused local `base_dir = os.getcwd()`.
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            logging.info("Target directory {} already exists. Assuming a resume."
                         .format(self.download_dir))
        logging.info("Downloading {} thing(s).".format(self.total))
        for idx, thing in enumerate(self.things):
            # BUG FIX: report the actual thing id, not just the loop index.
            logging.info("Downloading thing {} ({} of {})".format(thing, idx + 1, self.total))
            Thing(thing).download(self.download_dir)
975060c9 124
3522a3bf
OM
class Collection(Grouping):
    """ A named collection of things belonging to one user. """

    def __init__(self, user, name, directory):
        super().__init__()
        self.user = user
        self.name = name
        # Collection pages live at /<user>/collections/<name-with-hyphens>.
        self.url = "{}/{}/collections/{}".format(URL_BASE, user, strip_ws(name))
        target = "{}-{}".format(slugify(user), slugify(name))
        self.download_dir = os.path.join(directory, target)
        self.collection_url = URL_COLLECTION
3522a3bf
OM
136
class Designs(Grouping):
    """ Every design published by a single user. """

    def __init__(self, user, directory):
        super().__init__()
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, user)
        self.download_dir = os.path.join(
            directory, "{} designs".format(slugify(user)))
        self.collection_url = USER_COLLECTION
975060c9 145
3c82f75b
OM
class Thing:
    """ An individual design on thingiverse. """
    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None          # timestamp of the last completed download, if any
        self._parsed = False           # has _parse() been run yet?
        self._needs_download = True    # assume work is needed until proven otherwise
        self.text = None               # raw html of the thing's files page
        self.title = None              # slugified page title, set by _parse()
        self.download_dir = None       # target directory, set by _parse()

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done for this thing. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        req = requests.get(url)
        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')

        self.title = slugify(soup.find_all('h1')[0].text.strip())
        self.download_dir = os.path.join(base_dir, self.title)

        logging.debug("Parsing {} ({})".format(self.thing_id, self.title))

        if not os.path.exists(self.download_dir):
            # Not yet downloaded
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were recorded.
            logging.warning("Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            logging.info("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            logging.info("Old-style download directory found. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp, lets see if there is anything new to get
        file_links = soup.find_all('a', {'class': 'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            logging.debug("Checking {} (updated {})".format(file_link["title"], timestamp))
            # NOTE: timestamps are compared lexically - assumes ISO-8601 format
            # from the site, where string order matches chronological order.
            if timestamp > self.last_time:
                logging.info("Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def download(self, base_dir):
        """ Download all files for a given thing into base_dir/<title>. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._needs_download:
            # BUG FIX: was a bare print(); use logging like the rest of the file.
            logging.info("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                logging.warning("Old style download dir found for {}".format(self.title))
                os.rename(self.download_dir, "{}_old".format(self.download_dir))
            else:
                # Keep the previous version around so unchanged files can be copied.
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            new_last_time = file_links[0].find_all('time')[0]['datetime']
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Found file {} from {}".format(file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                logging.debug("Checking {} (updated {})".format(file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        logging.debug("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                logging.debug("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                logging.warning("Unable to find {} in old archive, redownloading".format(file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]
        logging.info("Downloading {} new files of {}".format(len(new_file_links), len(file_links)))
        name = None  # BUG FIX: guard so the except clause can always report something
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                logging.debug("Downloading {} from {} to {}".format(name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        # People like images
        image_dir = os.path.join(self.download_dir, 'images')
        imagelinks = soup.find_all('span', {'class': 'gallery-slider'})[0] \
                         .find_all('div', {'class': 'gallery-photo'})
        logging.info("Downloading {} images.".format(len(imagelinks)))
        filename = None  # BUG FIX: was unbound in the handler if os.mkdir raised
        try:
            os.mkdir(image_dir)
            for imagelink in imagelinks:
                url = imagelink['data-full']
                filename = os.path.basename(url)
                if filename.endswith('stl'):
                    # Thumbnails of STL previews are actually PNGs.
                    filename = "{}.png".format(filename)
                image_req = requests.get(url)
                with open(os.path.join(image_dir, filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            # BUG FIX: was a bare print(); use logging like the rest of the file.
            logging.error("Failed to download {} - {}".format(filename, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        try:
            # Now write the timestamp so a later run can detect updates.
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            # BUG FIX: was a bare print(); use logging like the rest of the file.
            logging.error("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.title))
975060c9 324
1ab49020
OM
def do_batch(batch_file, download_dir):
    """ Read a file in line by line, parsing each as a set of calls to this script.

    Recognised instructions (one per line):
        thing <id>
        collection <owner> <name>
        user <name>
    Blank lines and lines starting with '#' are skipped; anything else
    is logged as unparseable and skipped.
    """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            # BUG FIX: a blank line used to raise IndexError on command_arr[0].
            if not line or line.startswith('#'):
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug("Handling batch thing instruction: {}".format(line))
                Thing(command_arr[1]).download(download_dir)
                continue
            if command_arr[0] == "collection":
                logging.debug("Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2], download_dir).download()
                continue
            if command_arr[0] == "user":
                # BUG FIX: this message previously said "collection".
                logging.debug("Handling batch user instruction: {}".format(line))
                Designs(command_arr[1], download_dir).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
345
975060c9
OM
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=['debug', 'info', 'warning'],
                        default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory", help="Target directory to download into")

    # One sub-command per kind of download.
    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")
    sub = subparsers.add_parser('collection', help="Download an entire collection")
    sub.add_argument("owner", help="The owner of the collection to get")
    sub.add_argument("collection", help="The name of the collection to get")
    sub = subparsers.add_parser('thing', help="Download a single thing.")
    sub.add_argument("thing", help="Thing ID to download")
    sub = subparsers.add_parser("user", help="Download all things by a user")
    sub.add_argument("user", help="The user to get the designs of")
    sub = subparsers.add_parser("batch", help="Perform multiple actions written in a text file")
    sub.add_argument("batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")

    args = parser.parse_args()
    if not args.subcommand:
        # No sub-command given: show usage and bail out.
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()

    logging.basicConfig(level=getattr(logging, args.log_level.upper()))

    command = args.subcommand
    if command.startswith("collection"):
        Collection(args.owner, args.collection, args.directory).download()
    elif command == "thing":
        Thing(args.thing).download(args.directory)
    elif command == "user":
        Designs(args.user, args.directory).download()
    elif command == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    elif command == "batch":
        do_batch(args.batch_file, args.directory)
382
975060c9
OM
383
384if __name__ == "__main__":
385 main()