Commit | Line | Data |
---|---|---|
975060c9 OM |
1 | #!/usr/bin/env python3 |
2 | """ | |
3 | Thingiverse bulk downloader | |
4 | """ | |
5 | ||
6 | import re | |
7 | import os | |
8 | import argparse | |
9 | import unicodedata | |
10 | import requests | |
11 | from bs4 import BeautifulSoup | |
12 | ||
URL_BASE = "https://www.thingiverse.com"
URL_COLLECTION = URL_BASE + "/ajax/thingcollection/list_collected_things"

# Patterns that scrape collection metadata out of JSON embedded in the
# collection page's HTML (see Collection.get_collection).
ID_REGEX = re.compile(r'"id":(\d*),')
TOTAL_REGEX = re.compile(r'"total":(\d*),')
LAST_PAGE_REGEX = re.compile(r'"last_page":(\d*),')
# This appears to be fixed at 12, but if it changes would screw the rest up.
PER_PAGE_REGEX = re.compile(r'"per_page":(\d*),')
# Collapses any run of whitespace and/or hyphens into a single hyphen.
NO_WHITESPACE_REGEX = re.compile(r'[-\s]+')

# Set from the -v/--verbose command-line flag in main().
VERBOSE = False
24 | ||
def strip_ws(value):
    """Collapse every run of whitespace and/or hyphens in *value* to one hyphen."""
    return re.sub(r'[-\s]+', '-', value)
975060c9 OM |
28 | |
def slugify(value):
    """
    Normalize *value* into a filesystem-friendly slug.

    Non-ASCII characters are decomposed (NFKD) and dropped, anything that is
    not a word character, whitespace or a hyphen is removed, and runs of
    whitespace/hyphens collapse to a single hyphen.

    NOTE: unlike classic Django slugify, case is deliberately preserved —
    the result is used for on-disk directory names, and lowercasing would
    break resuming into directories created by earlier runs.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
    value = re.sub(r'[^\w\s-]', '', value).strip()
    return re.sub(r'[-\s]+', '-', value)
39 | ||
class Collection:
    """ Holds details of a Thingiverse collection and downloads its things. """

    def __init__(self, user, name):
        self.user = user          # collection owner's username
        self.name = name          # collection name as shown on the site
        self.things = []          # thing ids (strings) once fetched
        self.total = 0            # reported total, scraped as a string from page JSON
        self.req_id = None        # internal collection id used by the ajax endpoint
        self.last_page = 0        # number of ajax pages to fetch
        self.per_page = None      # page size reported by the site (expected 12)

    def _get_small_collection(self, req):
        """ Handle small collections (< 13 things): the thing links are
        present directly in the collection page, no ajax paging needed. """
        soup = BeautifulSoup(req.text, features='lxml')
        links = soup.find_all('a', {'class': 'card-img-holder'})
        # hrefs look like ".../thing:12345"; keep the id after the colon.
        self.things = [x['href'].split(':')[1] for x in links]

        return self.things

    def get_collection(self):
        """ Retrieve the thing ids of the collection (cached after first call). """
        if self.things:
            # We've already done it.
            return self.things

        # Get the internal details of the collection.
        c_url = "{}/{}/collections/{}".format(URL_BASE, self.user, strip_ws(self.name))
        if VERBOSE:
            print("Querying {}".format(c_url))
        c_req = requests.get(c_url)
        total = TOTAL_REGEX.search(c_req.text)
        if total is None:
            # This is a small (<13) items collection. Pull the list from this req.
            return self._get_small_collection(c_req)
        self.total = total.groups()[0]
        self.req_id = ID_REGEX.search(c_req.text).groups()[0]
        self.last_page = int(LAST_PAGE_REGEX.search(c_req.text).groups()[0])
        self.per_page = PER_PAGE_REGEX.search(c_req.text).groups()[0]
        parameters = {
            'base_url': "{}/collections/{}".format(self.user, self.name),
            'page': '1',
            'per_page': '12',
            'id': self.req_id
        }
        # Walk every ajax page, accumulating thing ids.
        for current_page in range(1, self.last_page + 1):
            parameters['page'] = current_page
            req = requests.post(URL_COLLECTION, parameters)
            soup = BeautifulSoup(req.text, features='lxml')
            links = soup.find_all('a', {'class': 'card-img-holder'})
            self.things += [x['href'].split(':')[1] for x in links]

        return self.things

    def download(self):
        """ Downloads all the files in a collection into <user>-<name>/. """
        if not self.things:
            self.get_collection()
        base_dir = os.getcwd()
        new_dir = "{}-{}".format(slugify(self.user), slugify(self.name))
        target_dir = os.path.join(base_dir, new_dir)
        try:
            os.mkdir(target_dir)
        except FileExistsError:
            print("Target directory {} already exists. Assuming a resume.".format(new_dir))
        os.chdir(target_dir)
        try:
            for thing in self.things:
                download_thing(thing)
        finally:
            # Restore the caller's working directory even if a download
            # raises; previously the process was left chdir'd into the
            # collection directory.
            os.chdir(base_dir)
107 | ||
108 | ||
def download_thing(thing):
    """ Downloads all the files for a given thing.

    Creates a directory named after the thing's slugified title under the
    current working directory.  If the directory already exists the thing is
    skipped (resume support).  On any download failure the directory is
    renamed with a ``_failed`` suffix so a later run will retry it.
    """
    file_url = "{}/thing:{}/files".format(URL_BASE, thing)
    file_req = requests.get(file_url)
    file_soup = BeautifulSoup(file_req.text, features='lxml')

    title = slugify(file_soup.find_all('h1')[0].text.strip())
    base_dir = os.getcwd()
    try:
        os.mkdir(title)
    except FileExistsError:
        print("Directory for {} ({}) already exists, skipping".format(thing, title))
        return
    print("Downloading {} ({})".format(thing, title))
    os.chdir(title)

    file_links = file_soup.find_all('a', {'class':'file-download'})
    files = [("{}{}".format(URL_BASE, x['href']), x["title"]) for x in file_links]

    name = None  # keeps the failure message honest if the first request raises
    try:
        for url, name in files:
            data_req = requests.get(url)
            # Without this, a 4xx/5xx error page would be silently written
            # to disk as if it were the file's content.
            data_req.raise_for_status()
            with open(name, 'wb') as handle:
                handle.write(data_req.content)
    except Exception as exception:
        print("Failed to download {} - {}".format(name, exception))
        os.chdir(base_dir)
        os.rename(title, "{}_failed".format(title))
        return

    os.chdir(base_dir)
140 | ||
def main():
    """ Entry point for script being run as a command. """
    parser = argparse.ArgumentParser()
    positionals = (
        ("owner", "The owner of the collection to get"),
        ("collection", "The name of the collection to get"),
    )
    for arg_name, arg_help in positionals:
        parser.add_argument(arg_name, help=arg_help)
    parser.add_argument("-v", "--verbose", help="Be more verbose", action="store_true")
    opts = parser.parse_args()

    global VERBOSE
    VERBOSE = opts.verbose

    coll = Collection(opts.owner, opts.collection)
    print(coll.get_collection())
    coll.download()

if __name__ == "__main__":
    main()