Allow downloading designs
[clinton/thingy_grabber.git] / thingy_grabber.py
CommitLineData
975060c9
OM
1#!/usr/bin/env python3
2"""
3Thingiverse bulk downloader
4"""
5
6import re
4a98996b 7import sys
975060c9
OM
8import os
9import argparse
10import unicodedata
11import requests
12from bs4 import BeautifulSoup
13
# Root of the site; every page and file URL used in this module is built from it.
URL_BASE = "https://www.thingiverse.com"
# Endpoint that Grouping.get() POSTs to, once per page of a paginated grouping.
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"

# Patterns that Grouping.get() searches for in the text of a grouping page to
# pull out its paging details. Each captures the digits following the key.
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# The page size appears to be fixed at 12, and Grouping.get() hard-codes that
# value when requesting pages, so paging would likely break if the site
# ever changes it.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
# Matches a run of hyphens and/or whitespace; strip_ws() and slugify() replace
# each such run with a single '-'.
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# Module-wide verbosity switch; main() sets it from the -v/--verbose flag.
VERBOSE = False
25
def strip_ws(value):
    """ Collapse every run of whitespace and/or hyphens in a string into one hyphen. """
    collapsed = NO_WHITESPACE_REGEX.sub('-', value)
    return str(collapsed)
975060c9
OM
29
def slugify(value):
    """
    Reduce a string to an ASCII slug suitable for use as a directory name.

    The text is NFKD-normalised and anything that cannot be encoded as ASCII
    is dropped. Characters other than word characters, whitespace and hyphens
    are then removed, surrounding whitespace is trimmed, and each remaining
    run of whitespace/hyphens becomes a single hyphen. Case is left as-is.
    """
    decomposed = unicodedata.normalize('NFKD', value)
    ascii_only = decomposed.encode('ascii', 'ignore').decode()
    cleaned = re.sub(r'[^\w\s-]', '', ascii_only)
    trimmed = str(cleaned.strip())
    return str(NO_WHITESPACE_REGEX.sub('-', trimmed))
40
3522a3bf
OM
class Grouping:
    """ Holds details of a group of things.

    Base class for a page (optionally paginated) that lists things. Child
    classes are expected to set ``url`` and ``download_dir``; ``get()`` raises
    ValueError when ``url`` is unset and ``download()`` raises ValueError when
    ``download_dir`` is unset.
    """
    def __init__(self):
        self.things = []
        self.total = 0
        self.req_id = None
        self.last_page = 0
        self.per_page = None
        # These two should be set by child classes.
        self.url = None
        self.download_dir = None

    @staticmethod
    def _extract_things(html):
        """ Return the thing ids found in the card links of a chunk of HTML. """
        soup = BeautifulSoup(html, features='lxml')
        links = soup.find_all('a', {'class':'card-img-holder'})
        # Presumably the hrefs look like "/thing:<id>"; the id is taken as the
        # text that follows the first colon.
        return [x['href'].split(':')[1] for x in links]

    def _get_small_grouping(self, req):
        """ Handle small groupings: every thing is listed in the response itself. """
        self.things = self._extract_things(req.text)

        return self.things

    def get(self):
        """ Retrieve the things of the grouping.

        Returns the list of thing ids, fetching it on first use and returning
        the stored list on later calls.

        Raises ValueError if no URL has been set, or if the grouping page
        reports a total but its paging details cannot be found.
        """
        if self.things:
            # We've already done it.
            return self.things

        # Check for initialisation:
        if not self.url:
            print("No URL set - object not initialised properly?")
            raise ValueError("No URL set - object not initialised properly?")

        # Get the internal details of the grouping.
        if VERBOSE:
            print("Querying {}".format(self.url))
        c_req = requests.get(self.url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items grouping. Pull the list from this req.
            return self._get_small_grouping(c_req)

        req_id = ID_REGEX.search(c_req.text)
        last_page = LAST_PAGE_REGEX.search(c_req.text)
        per_page = PER_PAGE_REGEX.search(c_req.text)
        if req_id is None or last_page is None or per_page is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("Could not find paging details in {}".format(self.url))

        self.total = total.groups()[0]
        self.req_id = req_id.groups()[0]
        self.last_page = int(last_page.groups()[0])
        self.per_page = per_page.groups()[0]
        parameters = {
            'base_url':self.url,
            'page':'1',
            'per_page':'12',
            'id':self.req_id
        }
        # Collect into a local list and publish it only once every page has
        # been read, so a failure part-way through cannot leave a partial list
        # behind that later calls would return as if it were complete.
        things = []
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(URL_COLLECTION, parameters)
            things += self._extract_things(req.text)
        self.things = things

        return self.things

    def download(self):
        """ Downloads all the files in a collection.

        Creates ``download_dir`` if needed (an existing directory is treated
        as a resume), downloads every thing into it, and restores the original
        working directory afterwards, even if a download raises.

        Raises ValueError if ``download_dir`` has not been set.
        """
        if not self.things:
            self.get()

        if not self.download_dir:
            raise ValueError("No download_dir set - invalidly initialised object?")

        base_dir = os.getcwd()
        try:
            os.mkdir(self.download_dir)
        except FileExistsError:
            print("Target directory {} already exists. Assuming a resume.".format(self.download_dir))
        os.chdir(self.download_dir)
        try:
            for thing in self.things:
                download_thing(thing)
        finally:
            # Never leave the process sitting inside the download directory.
            os.chdir(base_dir)
975060c9 116
3522a3bf
OM
class Collection(Grouping):
    """ A named collection of things belonging to a user. """
    def __init__(self, user, name):
        super().__init__()
        self.user = user
        self.name = name
        # The collection name goes into the URL with whitespace runs hyphenated.
        url_name = strip_ws(self.name)
        self.url = "{}/{}/collections/{}".format(URL_BASE, self.user, url_name)
        # Downloads land in "<user>-<name>" (both slugified) under the current directory.
        dir_name = "{}-{}".format(slugify(self.user), slugify(self.name))
        self.download_dir = os.path.join(os.getcwd(), dir_name)
125
class Designs(Grouping):
    """ All of the designs published by a single user. """
    def __init__(self, user):
        super().__init__()
        self.user = user
        self.url = "{}/{}/designs".format(URL_BASE, self.user)
        # Downloads land in "<slugified user> designs" under the current directory.
        dir_name = "{} designs".format(slugify(self.user))
        self.download_dir = os.path.join(os.getcwd(), dir_name)
975060c9
OM
133
def download_thing(thing):
    """ Downloads all the files for a given thing.

    The thing's files page is fetched and a directory named after the
    slugified page title is created (or reused) in the current working
    directory. Only files whose timestamp is newer than the one stored in
    that directory's timestamp.txt by an earlier run are downloaded; the
    newest timestamp seen is then written back. If anything goes wrong while
    downloading, the directory is renamed to "<title>_failed".

    The working directory is changed while files are being fetched and is
    always restored before the function returns or raises.
    """
    file_url = "{}/thing:{}/files".format(URL_BASE, thing)
    file_req = requests.get(file_url)
    file_soup = BeautifulSoup(file_req.text, features='lxml')

    title = slugify(file_soup.find_all('h1')[0].text.strip())
    if not title:
        # A title with nothing slugify() keeps (e.g. no ASCII characters)
        # would make os.mkdir('') fail, so fall back to the thing id.
        title = str(thing)
    base_dir = os.getcwd()
    try:
        os.mkdir(title)
    except FileExistsError:
        if VERBOSE:
            print('Directory for thing already exists, checking for update.')

    print("Downloading {} ({})".format(thing, title))
    os.chdir(title)
    failed = False
    try:
        last_time = None
        try:
            with open('timestamp.txt', 'r') as timestamp_handle:
                # An empty file counts as "no previous download".
                last_time = timestamp_handle.readline().strip() or None
            if VERBOSE:
                print("last downloaded version: {}".format(last_time))
        except FileNotFoundError:
            # Not run on this thing before.
            last_time = None

        file_links = file_soup.find_all('a', {'class':'file-download'})
        new_last_time = last_time
        new_file_links = []

        for file_link in file_links:
            timestamp = file_link.find_all('time')[0]['datetime']
            if VERBOSE:
                print("Checking {} (updated {})".format(file_link["title"], timestamp))
            if not last_time or timestamp > last_time:
                new_file_links.append(file_link)
            if not new_last_time or timestamp > new_last_time:
                new_last_time = timestamp

        if last_time and new_last_time <= last_time:
            # Nothing on the page is newer than what we already have.
            print("Thing already downloaded. Skipping.")
            return
        files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]

        # Pre-bind so the failure message works even if no file was started.
        name = None
        try:
            for url, name in files:
                if VERBOSE:
                    print("Downloading {} from {}".format(name, url))
                data_req = requests.get(url)
                # Treat an HTTP error as a failure rather than saving the
                # error page as if it were the file.
                data_req.raise_for_status()
                # The name comes from the remote page; keep only its final
                # component so it cannot point outside this directory.
                with open(os.path.basename(name), 'wb') as handle:
                    handle.write(data_req.content)
            # now write timestamp (there is none if the thing has no files)
            if new_last_time:
                with open('timestamp.txt', 'w') as timestamp_handle:
                    timestamp_handle.write(new_last_time)
        except Exception as exception:
            # Deliberately broad: any failure marks the whole thing as failed.
            print("Failed to download {} - {}".format(name, exception))
            failed = True
    finally:
        os.chdir(base_dir)

    if failed:
        os.rename(title, "{}_failed".format(title))
197
def main():
    """ Parse the command line and run the requested download. """
    global VERBOSE

    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    subparsers = parser.add_subparsers(help="Type of thing to download", dest="subcommand")

    # "collection <owner> <collection>"
    collection_parser = subparsers.add_parser('collection', help="Download an entire collection")
    collection_parser.add_argument("owner", help="The owner of the collection to get")
    collection_parser.add_argument("collection", help="The name of the collection to get")

    # "thing <thing>"
    thing_parser = subparsers.add_parser('thing', help="Download a single thing.")
    thing_parser.add_argument("thing", help="Thing ID to download")

    # "user <user>"
    user_parser = subparsers.add_parser("user", help="Download all things by a user")
    user_parser.add_argument("user", help="The user to get the designs of")

    args = parser.parse_args()
    command = args.subcommand
    if not command:
        # No subcommand given: show usage and exit with an error status.
        parser.print_help()
        sys.exit(1)

    VERBOSE = args.verbose

    if command.startswith("collection"):
        grouping = Collection(args.owner, args.collection)
    elif command == "thing":
        download_thing(args.thing)
        return
    elif command == "user":
        grouping = Designs(args.user)
    else:
        return

    # Collections and user designs share the same list-then-download flow.
    print(grouping.get())
    grouping.download()
227
975060c9 228
975060c9
OM
229
# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()