add thing downloading
[clinton/thingy_grabber.git] / thingy_grabber.py
1 #!/usr/bin/env python3
2 """
3 Thingiverse bulk downloader
4 """
5
6 import re
7 import sys
8 import os
9 import argparse
10 import unicodedata
11 import requests
12 from bs4 import BeautifulSoup
13
14 URL_BASE = "https://www.thingiverse.com"
15 URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"
16
17 ID_REGEX = re.compile(r'"id":(\d*),')
18 TOTAL_REGEX = re.compile(r'"total":(\d*),')
19 LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
20 # This appears to be fixed at 12, but if it changes would screw the rest up.
21 PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
22 NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')
23
24 VERBOSE = False
25
def strip_ws(value):
    """Collapse each run of whitespace and/or hyphens in *value* into a
    single hyphen.

    Used to turn a collection's display name into its URL form.
    Note: re.sub already returns str, so no extra str() cast is needed.
    """
    return re.sub(r'[-\s]+', '-', value)
29
def slugify(value):
    """Normalize *value* into a filesystem-safe slug.

    Accented characters are ASCII-folded (NFKD + ignore), characters other
    than word chars / whitespace / hyphens are dropped, and whitespace runs
    become single hyphens.  Case is preserved (the previous docstring
    claimed lowercasing, but the code never did it, and existing download
    directories depend on the current casing).
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = re.sub(r'[^\w\s-]', '', value).strip()
    return re.sub(r'[-\s]+', '-', value)
40
class Collection:
    """ Holds details of a Thingiverse collection and downloads its things. """

    def __init__(self, user, name):
        """
        :param user: Thingiverse user name owning the collection.
        :param name: display name of the collection.
        """
        self.user = user
        self.name = name
        self.things = []      # thing IDs (strings); filled lazily by get_collection()
        self.total = 0        # item count reported by Thingiverse
        self.req_id = None    # internal collection id needed by the ajax endpoint
        self.last_page = 0    # number of ajax pages to fetch
        self.per_page = None  # page size reported by the site (expected to be 12)

    def _get_small_collection(self, req):
        """ Handle small (<13 item) collections: everything is on one page. """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        # hrefs look like ".../thing:12345" - keep only the numeric id.
        self.things = [x['href'].split(':')[1] for x in links]

        return self.things

    def get_collection(self):
        """ Retrieve and cache the list of thing IDs in the collection. """
        if self.things:
            # We've already done it.
            return self.things

        # Get the internal details of the collection.
        c_url = "{}/{}/collections/{}".format(URL_BASE, self.user, strip_ws(self.name))
        if VERBOSE:
            print("Querying {}".format(c_url))
        # Bug fix: time out instead of hanging forever on a dead server.
        c_req = requests.get(c_url, timeout=60)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items collection. Pull the list from this req.
            return self._get_small_collection(c_req)
        # Keep the numeric fields as ints (the regex captures are text).
        self.total = int(total.groups()[0])
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = int(PER_PAGE_REGEX.search(c_req.text).groups()[0])
        parameters = {
            # NOTE(review): c_url above strips whitespace from the name but
            # base_url here does not - presumably the endpoint only keys on
            # 'id'; confirm before changing.
            'base_url': "{}/collections/{}".format(self.user, self.name),
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(URL_COLLECTION, parameters, timeout=60)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Download every thing in the collection into ./<user>-<name>/. """
        if not self.things:
            self.get_collection()
        base_dir = os.getcwd()
        new_dir = "{}-{}".format(slugify(self.user), slugify(self.name))
        target_dir = os.path.join(base_dir, new_dir)
        try:
            os.mkdir(target_dir)
        except FileExistsError:
            print("Target directory {} already exists. Assuming a resume.".format(new_dir))
        os.chdir(target_dir)
        try:
            for thing in self.things:
                download_thing(thing)
        finally:
            # Bug fix: restore the original working directory; previously the
            # process was left inside the collection directory.
            os.chdir(base_dir)
108
109
def download_thing(thing):
    """Download all the files for a given thing ID.

    Files go into a subdirectory named after the thing's slugified title.
    A timestamp.txt in that directory records the newest file time seen, so
    a rerun only fetches files updated since.  On any download failure the
    directory is renamed <title>_failed and the function returns.
    """
    file_url = "{}/thing:{}/files".format(URL_BASE, thing)
    # Bug fix: time out instead of hanging forever on a dead server.
    file_req = requests.get(file_url, timeout=60)
    file_soup = BeautifulSoup(file_req.text, features='lxml')

    title = slugify(file_soup.find_all('h1')[0].text.strip())
    base_dir = os.getcwd()
    try:
        os.mkdir(title)
    except FileExistsError:
        pass

    print("Downloading {} ({})".format(thing, title))
    os.chdir(title)

    last_time = None
    try:
        with open('timestamp.txt', 'r') as timestamp_handle:
            last_time = timestamp_handle.readlines()[0]
        if VERBOSE:
            print("last downloaded version: {}".format(last_time))
    except FileNotFoundError:
        # Not run on this thing before.
        # Bug fix: the old message claimed we were checking for an update;
        # a missing timestamp actually means a first download.
        if VERBOSE:
            print('No timestamp found - first download of this thing.')

    file_links = file_soup.find_all('a', {'class': 'file-download'})
    new_last_time = last_time
    new_file_links = []

    for file_link in file_links:
        timestamp = file_link.find_all('time')[0]['datetime']
        if VERBOSE:
            print("Checking {} (updated {})".format(file_link["title"], timestamp))
        # ISO-8601 timestamps compare correctly as plain strings.
        if not last_time or timestamp > last_time:
            new_file_links.append(file_link)
        if not new_last_time or timestamp > new_last_time:
            new_last_time = timestamp

    if last_time and new_last_time <= last_time:
        # Bug fix: return here; previously we fell through and rewrote
        # timestamp.txt even though nothing was downloaded.
        print("Thing already downloaded. Skipping.")
        os.chdir(base_dir)
        return

    files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in new_file_links]

    name = None  # keeps the failure message sane if we fail before the first file
    try:
        for url, name in files:
            if VERBOSE:
                print("Downloading {} from {}".format(name, url))
            data_req = requests.get(url, timeout=60)
            with open(name, 'wb') as handle:
                handle.write(data_req.content)
        # now write timestamp
        # Bug fix: a thing with no files has new_last_time == None, and
        # writing None used to raise TypeError and mark the thing _failed.
        if new_last_time:
            with open('timestamp.txt', 'w') as timestamp_handle:
                timestamp_handle.write(new_last_time)
    except Exception as exception:
        print("Failed to download {} - {}".format(name, exception))
        os.chdir(base_dir)
        os.rename(title, "{}_failed".format(title))
        return

    os.chdir(base_dir)
173
def main():
    """ Entry point for script being run as a command. """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    commands = arg_parser.add_subparsers(help="Type of thing to download", dest="subcommand")

    coll_cmd = commands.add_parser('collection', help="Download an entire collection")
    coll_cmd.add_argument("owner", help="The owner of the collection to get")
    coll_cmd.add_argument("collection", help="The name of the collection to get")

    single_cmd = commands.add_parser('thing', help="Download a single thing.")
    single_cmd.add_argument("thing", help="Thing ID to download")

    args = arg_parser.parse_args()
    if not args.subcommand:
        # No subcommand given: show usage and bail out.
        arg_parser.print_help()
        sys.exit(1)

    global VERBOSE
    VERBOSE = args.verbose

    if args.subcommand.startswith("collection"):
        collection = Collection(args.owner, args.collection)
        print(collection.get_collection())
        collection.download()
    elif args.subcommand == "thing":
        download_thing(args.thing)
197
198
# Run as a script: parse the command line and dispatch.
if __name__ == "__main__":
    main()