587d47ef9aae203b56d56de9d8f4bea9f717eeaf
[clinton/thingy_grabber.git] / thingy_grabber.py
1 #!/usr/bin/env python3
2 """
3 Thingiverse bulk downloader
4 """
5
6 import re
7 import os
8 import argparse
9 import unicodedata
10 import requests
11 from bs4 import BeautifulSoup
12
URL_BASE = "https://www.thingiverse.com"
# AJAX endpoint used to page through the things of a large collection.
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"

# These regexes scrape values out of the JSON-ish blob embedded in the
# collection page's HTML (the page is treated as text, not parsed as JSON).
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
# Collapses any run of whitespace and/or hyphens into a single hyphen.
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# Module-level verbosity flag, set from the -v option in main().
VERBOSE = False
24
def strip_ws(value):
    """ Remove whitespace from a string """
    # Any run of whitespace (or hyphens) collapses to a single hyphen.
    return re.sub(r'[-\s]+', '-', value)
28
def slugify(value):
    """
    Normalize a string for use as a directory name.

    Applies NFKD normalisation and drops non-ASCII characters, removes
    everything that is not a word character, whitespace or hyphen, then
    collapses whitespace/hyphen runs into single hyphens.

    NOTE(review): the old docstring claimed the result was lowercased, but
    the code never lowercases - case is preserved. Left as-is deliberately:
    lowercasing now would change directory names and break resuming into
    directories created by earlier runs.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    # re.sub already returns str, so the old str() wrappers were redundant.
    value = re.sub(r'[^\w\s-]', '', value).strip()
    return re.sub(r'[-\s]+', '-', value)
39
class Collection:
    """ Holds details of a Thingiverse collection and downloads its things. """

    def __init__(self, user, name):
        self.user = user
        self.name = name
        self.things = []       # thing ids, filled in by get_collection()
        self.total = 0         # total thing count scraped from the page
        self.req_id = None     # internal collection id used by the AJAX endpoint
        self.last_page = 0     # number of result pages to fetch
        self.per_page = None   # things per page as reported by the site (normally 12)

    def _get_small_collection(self, req):
        """ Handle small collections (fewer than one page of things), which
            are fully rendered in the initial page and need no AJAX paging. """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class':'card-img-holder'})
        # hrefs look like "/thing:12345" - keep just the numeric id.
        self.things = [x['href'].split(':')[1] for x in links]

        return self.things

    def get_collection(self):
        """ Retrieve (and cache) the list of thing ids in the collection. """
        if self.things:
            # We've already done it.
            return self.things

        # Get the internal details of the collection.
        c_url = "{}/{}/collections/{}".format(URL_BASE, self.user, strip_ws(self.name))
        if VERBOSE:
            print("Querying {}".format(c_url))
        c_req = requests.get(c_url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items collection. Pull the list from this req.
            return self._get_small_collection(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url':"{}/collections/{}".format(self.user, self.name),
            'page':'1',
            # BUGFIX: use the per_page value the site reported instead of a
            # hard-coded '12' (see the warning comment on PER_PAGE_REGEX).
            'per_page':self.per_page,
            'id':self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(URL_COLLECTION, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class':'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Download every thing in the collection into
            ./<user>-<collection>/, one subdirectory per thing.

            NOTE(review): this chdirs into the target directory and leaves
            the process cwd there on return - confirm no caller relies on
            the cwd being restored. """
        if not self.things:
            self.get_collection()
        base_dir = os.getcwd()
        new_dir = "{}-{}".format(slugify(self.user), slugify(self.name))
        target_dir = os.path.join(base_dir, new_dir)
        try:
            os.mkdir(target_dir)
        except FileExistsError:
            # Existing directory means a previous (possibly partial) run;
            # per-thing timestamps let download_thing() resume correctly.
            print("Target directory {} already exists. Assuming a resume.".format(new_dir))
        os.chdir(target_dir)
        for thing in self.things:
            download_thing(thing)
107
108
def download_thing(thing):
    """ Download all the files for a given thing id into ./<thing-title>/.

    A timestamp.txt in the thing's directory records the newest file
    timestamp seen, so re-runs only fetch files updated since the last
    download. On a failed download the directory is renamed to
    <title>_failed so a later run retries it from scratch.
    """
    file_url = "{}/thing:{}/files".format(URL_BASE, thing)
    file_req = requests.get(file_url)
    file_soup = BeautifulSoup(file_req.text, features='lxml')

    title = slugify(file_soup.find_all('h1')[0].text.strip())
    base_dir = os.getcwd()
    try:
        os.mkdir(title)
    except FileExistsError:
        # Directory left by a previous run - the stored timestamp below
        # decides whether anything actually needs re-downloading.
        pass

    print("Downloading {} ({})".format(thing, title))
    os.chdir(title)
    last_time = None

    try:
        with open('timestamp.txt', 'r') as fh:
            last_time = fh.readlines()[0]
        if VERBOSE:
            print("last downloaded version: {}".format(last_time))
    except FileNotFoundError:
        # Not run on this thing before.
        # BUGFIX: the old verbose message here wrongly said the directory
        # already existed; a missing timestamp means a fresh download.
        if VERBOSE:
            print('No timestamp found, downloading all files.')
        last_time = None

    file_links = file_soup.find_all('a', {'class':'file-download'})
    new_last_time = last_time
    new_file_links = []

    # Keep only files newer than the last recorded download, tracking the
    # newest timestamp seen across all files on the page.
    for file_link in file_links:
        timestamp = file_link.find_all('time')[0]['datetime']
        if VERBOSE:
            print("Checking {} (updated {})".format(file_link["title"], timestamp))
        if not last_time or timestamp > last_time:
            new_file_links.append(file_link)
        if not new_last_time or timestamp > new_last_time:
            new_last_time = timestamp

    if last_time and new_last_time <= last_time:
        # BUGFIX: previously this printed "Skipping" but fell through and
        # rewrote the timestamp anyway; now it genuinely skips.
        print("Thing already downloaded. Skipping.")
        os.chdir(base_dir)
        return

    files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]

    name = None  # keeps the failure message safe if no file was reached
    try:
        for url, name in files:
            if VERBOSE:
                print("Downloading {} from {}".format(name, url))
            data_req = requests.get(url)
            with open(name, 'wb') as handle:
                handle.write(data_req.content)
        # now write timestamp
        # BUGFIX: a thing with no files at all leaves new_last_time as None;
        # guard so we don't try to write None to the file.
        if new_last_time is not None:
            with open('timestamp.txt', 'w') as fh:
                fh.write(new_last_time)
    except Exception as exception:
        print("Failed to download {} - {}".format(name, exception))
        os.chdir(base_dir)
        os.rename(title, "{}_failed".format(title))
        return


    os.chdir(base_dir)
172
def main():
    """ Entry point for script being run as a command. """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("owner", help="The owner of the collection to get")
    arg_parser.add_argument("collection", help="The name of the collection to get")
    arg_parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    options = arg_parser.parse_args()

    # Propagate the verbosity choice to the module-level flag.
    global VERBOSE
    VERBOSE = options.verbose

    # Resolve the collection, show what was found, then download it.
    to_fetch = Collection(options.owner, options.collection)
    print(to_fetch.get_collection())
    to_fetch.download()
# Standard script guard: run only when executed directly, not on import.
if __name__ == "__main__":
    main()