Move old downloads sideways
[clinton/thingy_grabber.git] / thingy_grabber.py
#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""

import re
import sys
import os
import argparse
import unicodedata
import requests
from shutil import copyfile
from bs4 import BeautifulSoup

URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"

ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# per_page appears to be fixed at 12, but capture it anyway: if Thingiverse
# ever changes it, the pagination logic below depends on the real value.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
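# A hedged sketch of the JSON fragment embedded in a collection page that the
# regexes above are expected to match:
#   ..."id":1234,..."total":42,..."per_page":12,..."last_page":4,...
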
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

VERBOSE = False

def strip_ws(value):
    """ Replace runs of whitespace (and hyphens) with single hyphens """
    return str(NO_WHITESPACE_REGEX.sub('-', value))

def slugify(value):
    """
    Normalizes a string: strips accents, removes non-word characters,
    and converts runs of whitespace to hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = str(re.sub(r'[^\w\s-]', '', value).strip())
    value = str(NO_WHITESPACE_REGEX.sub('-', value))
    return value

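# A hedged example of slugify's effect (note that it does not lowercase):
#   slugify("Flexi Rex (improved!)")  ->  "Flexi-Rex-improved"
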
class Grouping:
    """ Holds details of a group of things.
    This is effectively (although not actually) an abstract class
    - use Collection or Designs instead.
    """
    def __init__(self):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These two should be set by child classes.
        self.url = None
        self.download_dir = None

    def _get_small_grouping(self, req):
        """ Handle small groupings """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        self.things = [x['href'].split(':')[1] for x in links]

        return self.things

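    # A hedged sketch of the card markup the scrape above assumes:
    #   <a class="card-img-holder" href="/thing:1234">...</a>
    # Splitting the href on ':' leaves the numeric thing ID.
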
    def get(self):
        """ Retrieve the things of the grouping. """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        if VERBOSE:
            print("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small grouping (fewer than one page of items).
            # Pull the list straight from this request.
            return self._get_small_grouping(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': self.url,
            'page': '1',
            'per_page': self.per_page,
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(URL_COLLECTION, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

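    # A hedged pagination sketch: with "total":30 and "per_page":12 the page
    # JSON reports "last_page":3, so get() POSTs pages 1..3 to URL_COLLECTION
    # and accumulates 12 + 12 + 6 thing IDs.
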
    def download(self):
        """ Downloads all the files in a collection """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError("No download_dir set - object not initialised properly?")

        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            print("Target directory {} already exists. Assuming a resume."
                  .format(self.download_dir))
        for thing in self.things:
            Thing(thing).download(self.download_dir)

class Collection(Grouping):
    """ Holds details of a collection. """
    def __init__(self, user, name):
        Grouping.__init__(self)
        self.user = user
        self.name = name
        self.url = "{}/{}/collections/{}".format(
            URL_BASE, self.user, strip_ws(self.name))
        self.download_dir = os.path.join(os.getcwd(),
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))

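# A hedged example: Collection("bob", "Cool Things") would scrape
# https://www.thingiverse.com/bob/collections/Cool-Things and download into
# ./bob-Cool-Things (the user and collection names here are illustrative).
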
class Designs(Grouping):
    """ Holds details of all of a user's designs. """
    def __init__(self, user):
        Grouping.__init__(self)
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        self.download_dir = os.path.join(os.getcwd(), "{} designs".format(slugify(self.user)))

class Thing:
    """ An individual design on thingiverse. """
    def __init__(self, thing_id):
        self.thing_id = thing_id
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.title = None
        self.download_dir = None

    def _parse(self, base_dir):
        """ Work out what, if anything, needs to be done. """
        if self._parsed:
            return

        url = "{}/thing:{}/files".format(URL_BASE, self.thing_id)
        req = requests.get(url)
        self.text = req.text
        soup = BeautifulSoup(self.text, features='lxml')

        self.title = slugify(soup.find_all('h1')[0].text.strip())
        self.download_dir = os.path.join(base_dir, self.title)

        if not os.path.exists(self.download_dir):
            # Not yet downloaded
            self._parsed = True
            return

        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        if not os.path.exists(timestamp_file):
            # Old download from before timestamps were recorded.
            if VERBOSE:
                print("Old-style download directory found. Assuming update required.")
            self._parsed = True
            return

        try:
            with open(timestamp_file, 'r') as timestamp_handle:
                self.last_time = timestamp_handle.readlines()[0]
            if VERBOSE:
                print("last downloaded version: {}".format(self.last_time))
        except FileNotFoundError:
            # Timestamp file vanished between the check above and the read.
            if VERBOSE:
                print("Timestamp file disappeared. Assuming update required.")
            self.last_time = None
            self._parsed = True
            return

        # OK, so we have a timestamp; let's see if there is anything new to get.
        file_links = soup.find_all('a', {'class': 'file-download'})
        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            if VERBOSE:
                print("Checking {} (updated {})".format(file_link["title"], timestamp))
            if timestamp > self.last_time:
                print("Found new/updated file {}".format(file_link["title"]))
                self._needs_download = True
                self._parsed = True
                return
        # Got here, so nope, no new files.
        print("Found no new files for {}".format(self.title))
        self._needs_download = False
        self._parsed = True

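    # Note: last_time and the scraped datetimes are compared as plain strings.
    # A hedged assumption: Thingiverse's datetime attributes sort
    # chronologically as text (e.g. "2020-02-01 10:00" > "2019-12-31 23:59"),
    # which is what makes the string comparisons above and below valid.
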
    def download(self, base_dir):
        """ Download all files for a given thing. """
        if not self._parsed:
            self._parse(base_dir)

        if not self._needs_download:
            if VERBOSE:
                print("{} already downloaded - skipping.".format(self.title))
            return

        # Have we already downloaded some things?
        timestamp_file = os.path.join(self.download_dir, 'timestamp.txt')
        prev_dir = None
        if os.path.exists(self.download_dir):
            if not os.path.exists(timestamp_file):
                # edge case: old style dir w/out timestamp.
                print("Old style download dir found for {}".format(self.title))
                os.rename(self.download_dir, "{}_old".format(self.download_dir))
            else:
                prev_dir = "{}_{}".format(self.download_dir, self.last_time)
                os.rename(self.download_dir, prev_dir)

        # Get the list of files to download
        soup = BeautifulSoup(self.text, features='lxml')
        file_links = soup.find_all('a', {'class': 'file-download'})

        new_file_links = []
        old_file_links = []
        new_last_time = None

        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            new_file_links = file_links
            new_last_time = file_links[0].find_all('time')[0]['datetime']
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Found file {} from {}".format(file_link["title"], timestamp))
                if timestamp > new_last_time:
                    new_last_time = timestamp
        else:
            for file_link in file_links:
                timestamp = file_link.find_all('time')[0]['datetime']
                if VERBOSE:
                    print("Checking {} (updated {})".format(file_link["title"], timestamp))
                if timestamp > self.last_time:
                    new_file_links.append(file_link)
                else:
                    old_file_links.append(file_link)
                if not new_last_time or timestamp > new_last_time:
                    new_last_time = timestamp

        if VERBOSE:
            print("new timestamp {}".format(new_last_time))

        # OK. Time to get to work.
        os.mkdir(self.download_dir)
        # First grab the cached files (if any)
        for file_link in old_file_links:
            old_file = os.path.join(prev_dir, file_link["title"])
            new_file = os.path.join(self.download_dir, file_link["title"])
            try:
                if VERBOSE:
                    print("Copying {} to {}".format(old_file, new_file))
                copyfile(old_file, new_file)
            except FileNotFoundError:
                print("Unable to find {} in old archive, redownloading".format(file_link["title"]))
                new_file_links.append(file_link)

        # Now download the new ones
        files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]
        try:
            for url, name in files:
                file_name = os.path.join(self.download_dir, name)
                if VERBOSE:
                    print("Downloading {} from {} to {}".format(name, url, file_name))
                data_req = requests.get(url)
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            print("Failed to download {} - {}".format(name, exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return

        try:
            # Now write the timestamp
            with open(timestamp_file, 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
        except Exception as exception:
            print("Failed to write timestamp file - {}".format(exception))
            os.rename(self.download_dir, "{}_failed".format(self.download_dir))
            return
        self._needs_download = False
        if VERBOSE:
            print("Download of {} finished".format(self.title))

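# A hedged sketch of the directory lifecycle Thing.download() implements,
# for a thing titled "widget" last fetched at time T:
#   widget/  ->  widget_T/   (or widget_old/ for pre-timestamp directories)
# then widget/ is recreated, unchanged files are copied over from widget_T/,
# and only new or updated files are re-downloaded.
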
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
    collection_parser.add_argument("owner", help="The owner of the collection to get")
    collection_parser.add_argument("collection", help="The name of the collection to get")
    thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")
    user_parser = subparsers.add_parser("user", help="Download all things by a user")
    user_parser.add_argument("user", help="The user to get the designs of")

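    # Example invocations (hedged; the user names and thing ID are illustrative):
    #   thingy_grabber.py collection someuser some-collection
    #   thingy_grabber.py -v thing 1234
    #   thingy_grabber.py user someuser
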
    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    global VERBOSE
    VERBOSE = args.verbose
    if args.subcommand == "collection":
        collection = Collection(args.owner, args.collection)
        print(collection.get())
        collection.download()
    if args.subcommand == "thing":
        Thing(args.thing).download(os.getcwd())
    if args.subcommand == "user":
        designs = Designs(args.user)
        print(designs.get())
        designs.download()


if __name__ == "__main__":
    main()