Commit | Line | Data |
---|---|---|
c38e0c97 | 1 | ;;; html2text.el --- a simple html to plain text converter -*- coding: utf-8 -*- |
e84b4b86 | 2 | |
ba318903 | 3 | ;; Copyright (C) 2002-2014 Free Software Foundation, Inc. |
23f87bed MB |
4 | |
5 | ;; Author: Joakim Hove <hove@phys.ntnu.no> | |
6 | ||
7 | ;; This file is part of GNU Emacs. | |
8 | ||
5e809f55 | 9 | ;; GNU Emacs is free software: you can redistribute it and/or modify |
23f87bed | 10 | ;; it under the terms of the GNU General Public License as published by |
5e809f55 GM |
11 | ;; the Free Software Foundation, either version 3 of the License, or |
12 | ;; (at your option) any later version. | |
23f87bed MB |
13 | |
14 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
15 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
5e809f55 | 16 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23f87bed MB |
17 | ;; GNU General Public License for more details. |
18 | ||
19 | ;; You should have received a copy of the GNU General Public License | |
5e809f55 | 20 | ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
23f87bed MB |
21 | |
22 | ;;; Commentary: | |
23 | ||
24 | ;; These functions provide a simple way to wash/clean html infected | |
25 | ;; mails. Definitely do not work in all cases, but some improvement | |
e2642250 | 26 | ;; in readability is generally obtained. Formatting is only done in |
23f87bed MB |
27 | ;; the buffer, so the next time you enter the article it will be |
28 | ;; "re-htmlized". | |
29 | ;; | |
e2642250 | 30 | ;; The main function is `html2text'. |
23f87bed MB |
31 | |
32 | ;;; Code: | |
33 | ||
34 | ;; | |
35 | ;; <Global variables> | |
36 | ;; | |
37 | ||
38 | (eval-when-compile | |
39 | (require 'cl)) | |
40 | ||
41 | (defvar html2text-format-single-element-list '(("hr" . html2text-clean-hr))) | |
42 | ||
43 | (defvar html2text-replace-list | |
01c52d31 MB |
44 | '(("´" . "`") |
45 | ("&" . "&") | |
46 | ("'" . "'") | |
47 | ("¦" . "|") | |
48 | ("¢" . "c") | |
49 | ("ˆ" . "^") | |
50 | ("©" . "(C)") | |
51 | ("¤" . "(#)") | |
52 | ("°" . "degree") | |
53 | ("÷" . "/") | |
54 | ("€" . "e") | |
55 | ("½" . "1/2") | |
56 | (">" . ">") | |
57 | ("¿" . "?") | |
58 | ("«" . "<<") | |
59 | ("&ldquo" . "\"") | |
60 | ("‹" . "(") | |
61 | ("‘" . "`") | |
62 | ("<" . "<") | |
63 | ("—" . "--") | |
64 | (" " . " ") | |
65 | ("–" . "-") | |
66 | ("‰" . "%%") | |
67 | ("±" . "+-") | |
c38e0c97 | 68 | ("£" . "£") |
01c52d31 MB |
69 | (""" . "\"") |
70 | ("»" . ">>") | |
71 | ("&rdquo" . "\"") | |
72 | ("®" . "(R)") | |
73 | ("›" . ")") | |
74 | ("’" . "'") | |
c38e0c97 | 75 | ("§" . "§") |
01c52d31 MB |
76 | ("¹" . "^1") |
77 | ("²" . "^2") | |
78 | ("³" . "^3") | |
79 | ("˜" . "~")) | |
23f87bed MB |
80 | "The map of entity to text. |
81 | ||
82 | This is an alist were each element is a dotted pair consisting of an | |
e2642250 MB |
83 | old string, and a replacement string. This replacement is done by the |
84 | function `html2text-substitute' which basically performs a | |
85 | `replace-string' operation for every element in the list. This is | |
23f87bed MB |
86 | completely verbatim - without any use of REGEXP.") |
87 | ||
88 | (defvar html2text-remove-tag-list | |
89 | '("html" "body" "p" "img" "dir" "head" "div" "br" "font" "title" "meta") | |
90 | "A list of removable tags. | |
91 | ||
92 | This is a list of tags which should be removed, without any | |
e2642250 | 93 | formatting. Note that tags in the list are presented *without* |
338ecb71 | 94 | any \"<\" or \">\". All occurrences of a tag appearing in this |
e2642250 MB |
95 | list are removed, irrespective of whether it is a closing or |
96 | opening tag, or if the tag has additional attributes. The | |
97 | deletion is done by the function `html2text-remove-tags'. | |
23f87bed MB |
98 | |
99 | For instance the text: | |
100 | ||
101 | \"Here comes something <font size\"+3\" face=\"Helvetica\"> big </font>.\" | |
102 | ||
103 | will be reduced to: | |
104 | ||
105 | \"Here comes something big.\" | |
106 | ||
107 | If this list contains the element \"font\".") | |
108 | ||
109 | (defvar html2text-format-tag-list | |
110 | '(("b" . html2text-clean-bold) | |
e2642250 | 111 | ("strong" . html2text-clean-bold) |
23f87bed MB |
112 | ("u" . html2text-clean-underline) |
113 | ("i" . html2text-clean-italic) | |
e2642250 | 114 | ("em" . html2text-clean-italic) |
23f87bed MB |
115 | ("blockquote" . html2text-clean-blockquote) |
116 | ("a" . html2text-clean-anchor) | |
117 | ("ul" . html2text-clean-ul) | |
118 | ("ol" . html2text-clean-ol) | |
119 | ("dl" . html2text-clean-dl) | |
120 | ("center" . html2text-clean-center)) | |
121 | "An alist of tags and processing functions. | |
122 | ||
123 | This is an alist where each dotted pair consists of a tag, and then | |
e2642250 | 124 | the name of a function to be called when this tag is found. The |
23f87bed | 125 | function is called with the arguments p1, p2, p3 and p4. These are |
4c36be58 | 126 | demonstrated below: |
23f87bed MB |
127 | |
128 | \"<b> This is bold text </b>\" | |
129 | ^ ^ ^ ^ | |
130 | | | | | | |
131 | p1 p2 p3 p4 | |
132 | ||
133 | Then the called function will typically format the text somewhat and | |
134 | remove the tags.") | |
135 | ||
136 | (defvar html2text-remove-tag-list2 '("li" "dt" "dd" "meta") | |
137 | "Another list of removable tags. | |
138 | ||
139 | This is a list of tags which are removed similarly to the list | |
140 | `html2text-remove-tag-list' - but these tags are retained for the | |
141 | formatting, and then moved afterward.") | |
142 | ||
143 | ;; | |
144 | ;; </Global variables> | |
145 | ;; | |
146 | ||
147 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
148 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
149 | ||
150 | ;; | |
151 | ;; <Utility functions> | |
152 | ;; | |
153 | ||
23f87bed | 154 | |
e2642250 MB |
155 | (defun html2text-replace-string (from-string to-string min max) |
156 | "Replace FROM-STRING with TO-STRING in region from MIN to MAX." | |
157 | (goto-char min) | |
23f87bed MB |
158 | (let ((delta (- (string-width to-string) (string-width from-string))) |
159 | (change 0)) | |
e2642250 | 160 | (while (search-forward from-string max t) |
23f87bed | 161 | (replace-match to-string) |
e2642250 MB |
162 | (setq change (+ change delta))) |
163 | change)) | |
23f87bed MB |
164 | |
165 | ;; | |
166 | ;; </Utility functions> | |
167 | ;; | |
168 | ||
169 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
170 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
171 | ||
172 | ;; | |
173 | ;; <Functions related to attributes> i.e. <font size=+3> | |
174 | ;; | |
175 | ||
e2642250 MB |
176 | (defun html2text-attr-value (list attribute) |
177 | "Get value of ATTRIBUTE from LIST." | |
178 | (nth 1 (assoc attribute list))) | |
23f87bed | 179 | |
0683d241 | 180 | (defun html2text-get-attr (p1 p2) |
23f87bed | 181 | (goto-char p1) |
f76c98dd AS |
182 | (re-search-forward "\\s-+" p2 t) |
183 | (let (attr-list) | |
184 | (while (re-search-forward "[-a-z0-9._]+" p2 t) | |
185 | (setq attr-list | |
186 | (cons | |
187 | (list (match-string 0) | |
188 | (when (looking-at "\\s-*=") | |
189 | (goto-char (match-end 0)) | |
190 | (skip-chars-forward "[:space:]") | |
191 | (when (or (looking-at "\"[^\"]*\"\\|'[^']*'") | |
192 | (looking-at "[-a-z0-9._:]+")) | |
193 | (goto-char (match-end 0)) | |
194 | (match-string 0)))) | |
195 | attr-list))) | |
e2642250 | 196 | attr-list)) |
23f87bed MB |
197 | |
198 | ;; | |
199 | ;; </Functions related to attributes> | |
200 | ;; | |
201 | ||
202 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
203 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
204 | ||
205 | ;; | |
206 | ;; <Functions to be called to format a tag-pair> | |
207 | ;; | |
208 | (defun html2text-clean-list-items (p1 p2 list-type) | |
209 | (goto-char p1) | |
210 | (let ((item-nr 0) | |
211 | (items 0)) | |
01c52d31 | 212 | (while (search-forward "<li>" p2 t) |
23f87bed MB |
213 | (setq items (1+ items))) |
214 | (goto-char p1) | |
215 | (while (< item-nr items) | |
216 | (setq item-nr (1+ item-nr)) | |
01c52d31 | 217 | (search-forward "<li>" (point-max) t) |
23f87bed MB |
218 | (cond |
219 | ((string= list-type "ul") (insert " o ")) | |
220 | ((string= list-type "ol") (insert (format " %s: " item-nr))) | |
e2642250 | 221 | (t (insert " x ")))))) |
23f87bed MB |
222 | |
223 | (defun html2text-clean-dtdd (p1 p2) | |
224 | (goto-char p1) | |
225 | (let ((items 0) | |
226 | (item-nr 0)) | |
01c52d31 | 227 | (while (search-forward "<dt>" p2 t) |
23f87bed MB |
228 | (setq items (1+ items))) |
229 | (goto-char p1) | |
230 | (while (< item-nr items) | |
231 | (setq item-nr (1+ item-nr)) | |
232 | (re-search-forward "<dt>\\([ ]*\\)" (point-max) t) | |
233 | (when (match-string 1) | |
234 | (delete-region (point) (- (point) (string-width (match-string 1))))) | |
235 | (let ((def-p1 (point)) | |
236 | (def-p2 0)) | |
237 | (re-search-forward "\\([ ]*\\)\\(</dt>\\|<dd>\\)" (point-max) t) | |
238 | (if (match-string 1) | |
239 | (progn | |
240 | (let* ((mw1 (string-width (match-string 1))) | |
241 | (mw2 (string-width (match-string 2))) | |
242 | (mw (+ mw1 mw2))) | |
243 | (goto-char (- (point) mw)) | |
244 | (delete-region (point) (+ (point) mw1)) | |
245 | (setq def-p2 (point)))) | |
246 | (setq def-p2 (- (point) (string-width (match-string 2))))) | |
247 | (put-text-property def-p1 def-p2 'face 'bold))))) | |
248 | ||
249 | (defun html2text-delete-tags (p1 p2 p3 p4) | |
250 | (delete-region p1 p2) | |
251 | (delete-region (- p3 (- p2 p1)) (- p4 (- p2 p1)))) | |
252 | ||
253 | (defun html2text-delete-single-tag (p1 p2) | |
254 | (delete-region p1 p2)) | |
255 | ||
256 | (defun html2text-clean-hr (p1 p2) | |
257 | (html2text-delete-single-tag p1 p2) | |
258 | (goto-char p1) | |
259 | (newline 1) | |
e2642250 | 260 | (insert (make-string fill-column ?-))) |
23f87bed MB |
261 | |
262 | (defun html2text-clean-ul (p1 p2 p3 p4) | |
263 | (html2text-delete-tags p1 p2 p3 p4) | |
e2642250 | 264 | (html2text-clean-list-items p1 (- p3 (- p1 p2)) "ul")) |
23f87bed MB |
265 | |
266 | (defun html2text-clean-ol (p1 p2 p3 p4) | |
267 | (html2text-delete-tags p1 p2 p3 p4) | |
e2642250 | 268 | (html2text-clean-list-items p1 (- p3 (- p1 p2)) "ol")) |
23f87bed MB |
269 | |
270 | (defun html2text-clean-dl (p1 p2 p3 p4) | |
271 | (html2text-delete-tags p1 p2 p3 p4) | |
e2642250 | 272 | (html2text-clean-dtdd p1 (- p3 (- p1 p2)))) |
23f87bed MB |
273 | |
274 | (defun html2text-clean-center (p1 p2 p3 p4) | |
275 | (html2text-delete-tags p1 p2 p3 p4) | |
e2642250 | 276 | (center-region p1 (- p3 (- p2 p1)))) |
23f87bed MB |
277 | |
278 | (defun html2text-clean-bold (p1 p2 p3 p4) | |
279 | (put-text-property p2 p3 'face 'bold) | |
e2642250 | 280 | (html2text-delete-tags p1 p2 p3 p4)) |
23f87bed MB |
281 | |
282 | (defun html2text-clean-title (p1 p2 p3 p4) | |
283 | (put-text-property p2 p3 'face 'bold) | |
e2642250 | 284 | (html2text-delete-tags p1 p2 p3 p4)) |
23f87bed MB |
285 | |
286 | (defun html2text-clean-underline (p1 p2 p3 p4) | |
287 | (put-text-property p2 p3 'face 'underline) | |
e2642250 | 288 | (html2text-delete-tags p1 p2 p3 p4)) |
23f87bed MB |
289 | |
290 | (defun html2text-clean-italic (p1 p2 p3 p4) | |
291 | (put-text-property p2 p3 'face 'italic) | |
e2642250 | 292 | (html2text-delete-tags p1 p2 p3 p4)) |
23f87bed MB |
293 | |
294 | (defun html2text-clean-font (p1 p2 p3 p4) | |
e2642250 | 295 | (html2text-delete-tags p1 p2 p3 p4)) |
23f87bed MB |
296 | |
297 | (defun html2text-clean-blockquote (p1 p2 p3 p4) | |
e2642250 | 298 | (html2text-delete-tags p1 p2 p3 p4)) |
23f87bed MB |
299 | |
300 | (defun html2text-clean-anchor (p1 p2 p3 p4) | |
e2642250 MB |
301 | ;; If someone can explain how to make the URL clickable I will surely |
302 | ;; improve upon this. | |
303 | ;; Maybe `goto-addr.el' can be used here. | |
0683d241 | 304 | (let* ((attr-list (html2text-get-attr p1 p2)) |
23f87bed MB |
305 | (href (html2text-attr-value attr-list "href"))) |
306 | (delete-region p1 p4) | |
307 | (when href | |
308 | (goto-char p1) | |
75d52912 AS |
309 | (insert (if (string-match "\\`['\"].*['\"]\\'" href) |
310 | (substring href 1 -1) href)) | |
23f87bed MB |
311 | (put-text-property p1 (point) 'face 'bold)))) |
312 | ||
313 | ;; | |
314 | ;; </Functions to be called to format a tag-pair> | |
315 | ;; | |
316 | ||
317 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
318 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
319 | ||
320 | ;; | |
321 | ;; <Functions to be called to fix up paragraphs> | |
322 | ;; | |
323 | ||
324 | (defun html2text-fix-paragraph (p1 p2) | |
325 | (goto-char p1) | |
01c52d31 | 326 | (let ((refill-start) |
23f87bed | 327 | (refill-stop)) |
e2642250 MB |
328 | (when (re-search-forward "<br>$" p2 t) |
329 | (goto-char p1) | |
330 | (when (re-search-forward ".+[^<][^b][^r][^>]$" p2 t) | |
331 | (beginning-of-line) | |
332 | (setq refill-start (point)) | |
333 | (goto-char p2) | |
334 | (re-search-backward ".+[^<][^b][^r][^>]$" refill-start t) | |
fdc90613 | 335 | (forward-line 1) |
e2642250 MB |
336 | (end-of-line) |
337 | ;; refill-stop should ideally be adjusted to | |
cc4a96c6 | 338 | ;; accommodate the "<br>" strings which are removed |
e2642250 MB |
339 | ;; between refill-start and refill-stop. Can simply |
340 | ;; be returned from my-replace-string | |
341 | (setq refill-stop (+ (point) | |
342 | (html2text-replace-string | |
343 | "<br>" "" | |
344 | refill-start (point)))) | |
345 | ;; (message "Point = %s refill-stop = %s" (point) refill-stop) | |
346 | ;; (sleep-for 4) | |
347 | (fill-region refill-start refill-stop)))) | |
348 | (html2text-replace-string "<br>" "" p1 p2)) | |
23f87bed MB |
349 | |
350 | ;; | |
351 | ;; This one is interactive ... | |
352 | ;; | |
353 | (defun html2text-fix-paragraphs () | |
354 | "This _tries_ to fix up the paragraphs - this is done in quite a ad-hook | |
355 | fashion, quite close to pure guess-work. It does work in some cases though." | |
356 | (interactive) | |
86713405 | 357 | (goto-char (point-min)) |
4a43ee9b MB |
358 | (while (re-search-forward "^<br>$" nil t) |
359 | (delete-region (match-beginning 0) (match-end 0))) | |
23f87bed | 360 | ;; Removing lonely <br> on a single line, if they are left intact we |
c7015153 | 361 | ;; don't have any paragraphs at all. |
86713405 | 362 | (goto-char (point-min)) |
23f87bed MB |
363 | (while (not (eobp)) |
364 | (let ((p1 (point))) | |
365 | (forward-paragraph 1) | |
366 | ;;(message "Kaller fix med p1=%s p2=%s " p1 (1- (point))) (sleep-for 5) | |
367 | (html2text-fix-paragraph p1 (1- (point))) | |
368 | (goto-char p1) | |
369 | (when (not (eobp)) | |
370 | (forward-paragraph 1))))) | |
371 | ||
372 | ;; | |
373 | ;; </Functions to be called to fix up paragraphs> | |
374 | ;; | |
375 | ||
376 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
377 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
378 | ||
379 | ;; | |
380 | ;; <Interactive functions> | |
381 | ;; | |
382 | ||
383 | (defun html2text-remove-tags (tag-list) | |
e2642250 | 384 | "Removes the tags listed in the list `html2text-remove-tag-list'. |
23f87bed MB |
385 | See the documentation for that variable." |
386 | (interactive) | |
387 | (dolist (tag tag-list) | |
86713405 | 388 | (goto-char (point-min)) |
23f87bed MB |
389 | (while (re-search-forward (format "\\(</?%s[^>]*>\\)" tag) (point-max) t) |
390 | (delete-region (match-beginning 0) (match-end 0))))) | |
391 | ||
392 | (defun html2text-format-tags () | |
e2642250 | 393 | "See the variable `html2text-format-tag-list' for documentation." |
23f87bed MB |
394 | (interactive) |
395 | (dolist (tag-and-function html2text-format-tag-list) | |
396 | (let ((tag (car tag-and-function)) | |
397 | (function (cdr tag-and-function))) | |
86713405 | 398 | (goto-char (point-min)) |
23f87bed MB |
399 | (while (re-search-forward (format "\\(<%s\\( [^>]*\\)?>\\)" tag) |
400 | (point-max) t) | |
401 | (let ((p1) | |
402 | (p2 (point)) | |
0683d241 | 403 | (p3) (p4)) |
23f87bed MB |
404 | (search-backward "<" (point-min) t) |
405 | (setq p1 (point)) | |
b193caa3 MB |
406 | (unless (search-forward (format "</%s>" tag) (point-max) t) |
407 | (goto-char p2) | |
408 | (insert (format "</%s>" tag))) | |
23f87bed MB |
409 | (setq p4 (point)) |
410 | (search-backward "</" (point-min) t) | |
411 | (setq p3 (point)) | |
412 | (funcall function p1 p2 p3 p4) | |
e2642250 | 413 | (goto-char p1)))))) |
23f87bed MB |
414 | |
415 | (defun html2text-substitute () | |
e2642250 | 416 | "See the variable `html2text-replace-list' for documentation." |
23f87bed MB |
417 | (interactive) |
418 | (dolist (e html2text-replace-list) | |
86713405 | 419 | (goto-char (point-min)) |
23f87bed MB |
420 | (let ((old-string (car e)) |
421 | (new-string (cdr e))) | |
e2642250 | 422 | (html2text-replace-string old-string new-string (point-min) (point-max))))) |
23f87bed MB |
423 | |
424 | (defun html2text-format-single-elements () | |
23f87bed MB |
425 | (interactive) |
426 | (dolist (tag-and-function html2text-format-single-element-list) | |
427 | (let ((tag (car tag-and-function)) | |
428 | (function (cdr tag-and-function))) | |
86713405 | 429 | (goto-char (point-min)) |
23f87bed MB |
430 | (while (re-search-forward (format "\\(<%s\\( [^>]*\\)?>\\)" tag) |
431 | (point-max) t) | |
432 | (let ((p1) | |
433 | (p2 (point))) | |
434 | (search-backward "<" (point-min) t) | |
435 | (setq p1 (point)) | |
e2642250 | 436 | (funcall function p1 p2)))))) |
23f87bed MB |
437 | |
438 | ;; | |
439 | ;; Main function | |
440 | ;; | |
441 | ||
442 | ;;;###autoload | |
443 | (defun html2text () | |
444 | "Convert HTML to plain text in the current buffer." | |
445 | (interactive) | |
446 | (save-excursion | |
447 | (let ((case-fold-search t) | |
448 | (buffer-read-only)) | |
449 | (html2text-remove-tags html2text-remove-tag-list) | |
450 | (html2text-format-tags) | |
451 | (html2text-remove-tags html2text-remove-tag-list2) | |
452 | (html2text-substitute) | |
453 | (html2text-format-single-elements) | |
454 | (html2text-fix-paragraphs)))) | |
455 | ||
456 | ;; | |
457 | ;; </Interactive functions> | |
458 | ;; | |
e2642250 | 459 | (provide 'html2text) |
53080505 | 460 | |
23f87bed | 461 | ;;; html2text.el ends here |