Commit | Line | Data |
---|---|---|
23f87bed | 1 | ;;; html2text.el --- a simple html to plain text converter |
f4dd4ae8 | 2 | ;; Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc. |
23f87bed MB |
3 | |
4 | ;; Author: Joakim Hove <hove@phys.ntnu.no> | |
5 | ||
6 | ;; This file is part of GNU Emacs. | |
7 | ||
8 | ;; GNU Emacs is free software; you can redistribute it and/or modify | |
9 | ;; it under the terms of the GNU General Public License as published by | |
10 | ;; the Free Software Foundation; either version 2, or (at your option) | |
11 | ;; any later version. | |
12 | ||
13 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
14 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | ;; GNU General Public License for more details. | |
17 | ||
18 | ;; You should have received a copy of the GNU General Public License | |
19 | ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
20 | ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
21 | ;; Boston, MA 02111-1307, USA. | |
22 | ||
23 | ;;; Commentary: | |
24 | ||
25 | ;; These functions provide a simple way to wash/clean html infected | |
26 | ;; mails. Definitely do not work in all cases, but some improvement | |
27 | ;; in readability is generally obtained. Formatting is only done in | |
28 | ;; the buffer, so the next time you enter the article it will be | |
29 | ;; "re-htmlized". | |
30 | ;; | |
31 | ;; The main function is "html2text" | |
32 | ||
33 | ;;; Code: | |
34 | ||
35 | ;; | |
36 | ;; <Global variables> | |
37 | ;; | |
38 | ||
39 | (eval-when-compile | |
40 | (require 'cl)) | |
41 | ||
42 | (defvar html2text-format-single-element-list '(("hr" . html2text-clean-hr))) | |
43 | ||
44 | (defvar html2text-replace-list | |
f4dd4ae8 MB |
45 | '((" " . " ") (">" . ">") ("<" . "<") (""" . "\"") |
46 | ("&" . "&") ("'" . "'")) | |
23f87bed MB |
47 | "The map of entity to text. |
48 | ||
49 | This is an alist were each element is a dotted pair consisting of an | |
50 | old string, and a replacement string. This replacement is done by the | |
51 | function \"html2text-substitute\" which basically performs a | |
52 | replace-string operation for every element in the list. This is | |
53 | completely verbatim - without any use of REGEXP.") | |
54 | ||
55 | (defvar html2text-remove-tag-list | |
56 | '("html" "body" "p" "img" "dir" "head" "div" "br" "font" "title" "meta") | |
57 | "A list of removable tags. | |
58 | ||
59 | This is a list of tags which should be removed, without any | |
60 | formatting. Observe that if you the tags in the list are presented | |
61 | *without* any \"<\" or \">\". All occurences of a tag appearing in | |
62 | this list are removed, irrespective of whether it is a closing or | |
63 | opening tag, or if the tag has additional attributes. The actual | |
64 | deletion is done by the function \"html2text-remove-tags\". | |
65 | ||
66 | For instance the text: | |
67 | ||
68 | \"Here comes something <font size\"+3\" face=\"Helvetica\"> big </font>.\" | |
69 | ||
70 | will be reduced to: | |
71 | ||
72 | \"Here comes something big.\" | |
73 | ||
74 | If this list contains the element \"font\".") | |
75 | ||
76 | (defvar html2text-format-tag-list | |
77 | '(("b" . html2text-clean-bold) | |
78 | ("u" . html2text-clean-underline) | |
79 | ("i" . html2text-clean-italic) | |
80 | ("blockquote" . html2text-clean-blockquote) | |
81 | ("a" . html2text-clean-anchor) | |
82 | ("ul" . html2text-clean-ul) | |
83 | ("ol" . html2text-clean-ol) | |
84 | ("dl" . html2text-clean-dl) | |
85 | ("center" . html2text-clean-center)) | |
86 | "An alist of tags and processing functions. | |
87 | ||
88 | This is an alist where each dotted pair consists of a tag, and then | |
89 | the name of a function to be called when this tag is found. The | |
90 | function is called with the arguments p1, p2, p3 and p4. These are | |
91 | demontrated below: | |
92 | ||
93 | \"<b> This is bold text </b>\" | |
94 | ^ ^ ^ ^ | |
95 | | | | | | |
96 | p1 p2 p3 p4 | |
97 | ||
98 | Then the called function will typically format the text somewhat and | |
99 | remove the tags.") | |
100 | ||
101 | (defvar html2text-remove-tag-list2 '("li" "dt" "dd" "meta") | |
102 | "Another list of removable tags. | |
103 | ||
104 | This is a list of tags which are removed similarly to the list | |
105 | `html2text-remove-tag-list' - but these tags are retained for the | |
106 | formatting, and then moved afterward.") | |
107 | ||
108 | ;; | |
109 | ;; </Global variables> | |
110 | ;; | |
111 | ||
112 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
113 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
114 | ||
115 | ;; | |
116 | ;; <Utility functions> | |
117 | ;; | |
118 | ||
119 | (defun html2text-buffer-head () | |
120 | (if (string= mode-name "Article") | |
121 | (beginning-of-buffer) | |
122 | (beginning-of-buffer) | |
123 | ) | |
124 | ) | |
125 | ||
126 | (defun html2text-replace-string (from-string to-string p1 p2) | |
127 | (goto-char p1) | |
128 | (let ((delta (- (string-width to-string) (string-width from-string))) | |
129 | (change 0)) | |
130 | (while (search-forward from-string p2 t) | |
131 | (replace-match to-string) | |
132 | (setq change (+ change delta)) | |
133 | ) | |
134 | change | |
135 | ) | |
136 | ) | |
137 | ||
138 | ;; | |
139 | ;; </Utility functions> | |
140 | ;; | |
141 | ||
142 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
143 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
144 | ||
145 | ;; | |
146 | ;; <Functions related to attributes> i.e. <font size=+3> | |
147 | ;; | |
148 | ||
149 | (defun html2text-attr-value (attr-list attr) | |
150 | (nth 1 (assoc attr attr-list)) | |
151 | ) | |
152 | ||
153 | (defun html2text-get-attr (p1 p2 tag) | |
154 | (goto-char p1) | |
155 | (re-search-forward " +[^ ]" p2 t) | |
156 | (let* ((attr-string (buffer-substring-no-properties (1- (point)) (1- p2))) | |
157 | (tmp-list (split-string attr-string)) | |
158 | (attr-list) | |
159 | (counter 0) | |
160 | (prev (car tmp-list)) | |
161 | (this (nth 1 tmp-list)) | |
162 | (next (nth 2 tmp-list)) | |
163 | (index 1)) | |
164 | ||
165 | (cond | |
166 | ;; size=3 | |
167 | ((string-match "[^ ]=[^ ]" prev) | |
168 | (let ((attr (nth 0 (split-string prev "="))) | |
169 | (value (nth 1 (split-string prev "=")))) | |
170 | (setq attr-list (cons (list attr value) attr-list)) | |
171 | ) | |
172 | ) | |
173 | ;; size= 3 | |
174 | ((string-match "[^ ]=\\'" prev) | |
175 | (setq attr-list (cons (list (substring prev 0 -1) this) attr-list)) | |
176 | ) | |
177 | ) | |
178 | ||
179 | (while (< index (length tmp-list)) | |
180 | (cond | |
181 | ;; size=3 | |
182 | ((string-match "[^ ]=[^ ]" this) | |
183 | (let ((attr (nth 0 (split-string this "="))) | |
184 | (value (nth 1 (split-string this "=")))) | |
185 | (setq attr-list (cons (list attr value) attr-list)) | |
186 | ) | |
187 | ) | |
188 | ;; size =3 | |
189 | ((string-match "\\`=[^ ]" this) | |
190 | (setq attr-list (cons (list prev (substring this 1)) attr-list))) | |
191 | ||
192 | ;; size= 3 | |
193 | ((string-match "[^ ]=\\'" this) | |
194 | (setq attr-list (cons (list (substring this 0 -1) next) attr-list)) | |
195 | ) | |
196 | ||
197 | ;; size = 3 | |
198 | ((string= "=" this) | |
199 | (setq attr-list (cons (list prev next) attr-list)) | |
200 | ) | |
201 | ) | |
202 | (setq index (1+ index)) | |
203 | (setq prev this) | |
204 | (setq this next) | |
205 | (setq next (nth (1+ index) tmp-list)) | |
206 | ) | |
207 | ||
208 | ;; | |
209 | ;; Tags with no accompanying "=" i.e. value=nil | |
210 | ;; | |
211 | (setq prev (car tmp-list)) | |
212 | (setq this (nth 1 tmp-list)) | |
213 | (setq next (nth 2 tmp-list)) | |
214 | (setq index 1) | |
215 | ||
216 | (if (not (string-match "=" prev)) | |
217 | (progn | |
218 | (if (not (string= (substring this 0 1) "=")) | |
219 | (setq attr-list (cons (list prev nil) attr-list)) | |
220 | ) | |
221 | ) | |
222 | ) | |
223 | ||
224 | (while (< index (1- (length tmp-list))) | |
225 | (if (not (string-match "=" this)) | |
226 | (if (not (or (string= (substring next 0 1) "=") | |
227 | (string= (substring prev -1) "="))) | |
228 | (setq attr-list (cons (list this nil) attr-list)) | |
229 | ) | |
230 | ) | |
231 | (setq index (1+ index)) | |
232 | (setq prev this) | |
233 | (setq this next) | |
234 | (setq next (nth (1+ index) tmp-list)) | |
235 | ) | |
236 | ||
237 | (if this | |
238 | (progn | |
239 | (if (not (string-match "=" this)) | |
240 | (progn | |
241 | (if (not (string= (substring prev -1) "=")) | |
242 | (setq attr-list (cons (list this nil) attr-list)) | |
243 | ) | |
244 | ) | |
245 | ) | |
246 | ) | |
247 | ) | |
248 | attr-list ;; return - value | |
249 | ) | |
250 | ) | |
251 | ||
252 | ;; | |
253 | ;; </Functions related to attributes> | |
254 | ;; | |
255 | ||
256 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
257 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
258 | ||
259 | ;; | |
260 | ;; <Functions to be called to format a tag-pair> | |
261 | ;; | |
262 | (defun html2text-clean-list-items (p1 p2 list-type) | |
263 | (goto-char p1) | |
264 | (let ((item-nr 0) | |
265 | (items 0)) | |
266 | (while (re-search-forward "<li>" p2 t) | |
267 | (setq items (1+ items))) | |
268 | (goto-char p1) | |
269 | (while (< item-nr items) | |
270 | (setq item-nr (1+ item-nr)) | |
271 | (re-search-forward "<li>" (point-max) t) | |
272 | (cond | |
273 | ((string= list-type "ul") (insert " o ")) | |
274 | ((string= list-type "ol") (insert (format " %s: " item-nr))) | |
275 | (t (insert " x "))) | |
276 | ) | |
277 | ) | |
278 | ) | |
279 | ||
280 | (defun html2text-clean-dtdd (p1 p2) | |
281 | (goto-char p1) | |
282 | (let ((items 0) | |
283 | (item-nr 0)) | |
284 | (while (re-search-forward "<dt>" p2 t) | |
285 | (setq items (1+ items))) | |
286 | (goto-char p1) | |
287 | (while (< item-nr items) | |
288 | (setq item-nr (1+ item-nr)) | |
289 | (re-search-forward "<dt>\\([ ]*\\)" (point-max) t) | |
290 | (when (match-string 1) | |
291 | (delete-region (point) (- (point) (string-width (match-string 1))))) | |
292 | (let ((def-p1 (point)) | |
293 | (def-p2 0)) | |
294 | (re-search-forward "\\([ ]*\\)\\(</dt>\\|<dd>\\)" (point-max) t) | |
295 | (if (match-string 1) | |
296 | (progn | |
297 | (let* ((mw1 (string-width (match-string 1))) | |
298 | (mw2 (string-width (match-string 2))) | |
299 | (mw (+ mw1 mw2))) | |
300 | (goto-char (- (point) mw)) | |
301 | (delete-region (point) (+ (point) mw1)) | |
302 | (setq def-p2 (point)))) | |
303 | (setq def-p2 (- (point) (string-width (match-string 2))))) | |
304 | (put-text-property def-p1 def-p2 'face 'bold))))) | |
305 | ||
306 | (defun html2text-delete-tags (p1 p2 p3 p4) | |
307 | (delete-region p1 p2) | |
308 | (delete-region (- p3 (- p2 p1)) (- p4 (- p2 p1)))) | |
309 | ||
310 | (defun html2text-delete-single-tag (p1 p2) | |
311 | (delete-region p1 p2)) | |
312 | ||
313 | (defun html2text-clean-hr (p1 p2) | |
314 | (html2text-delete-single-tag p1 p2) | |
315 | (goto-char p1) | |
316 | (newline 1) | |
317 | (insert (make-string fill-column ?-)) | |
318 | ) | |
319 | ||
320 | (defun html2text-clean-ul (p1 p2 p3 p4) | |
321 | (html2text-delete-tags p1 p2 p3 p4) | |
322 | (html2text-clean-list-items p1 (- p3 (- p1 p2)) "ul") | |
323 | ) | |
324 | ||
325 | (defun html2text-clean-ol (p1 p2 p3 p4) | |
326 | (html2text-delete-tags p1 p2 p3 p4) | |
327 | (html2text-clean-list-items p1 (- p3 (- p1 p2)) "ol") | |
328 | ) | |
329 | ||
330 | (defun html2text-clean-dl (p1 p2 p3 p4) | |
331 | (html2text-delete-tags p1 p2 p3 p4) | |
332 | (html2text-clean-dtdd p1 (- p3 (- p1 p2))) | |
333 | ) | |
334 | ||
335 | (defun html2text-clean-center (p1 p2 p3 p4) | |
336 | (html2text-delete-tags p1 p2 p3 p4) | |
337 | (center-region p1 (- p3 (- p2 p1))) | |
338 | ) | |
339 | ||
340 | (defun html2text-clean-bold (p1 p2 p3 p4) | |
341 | (put-text-property p2 p3 'face 'bold) | |
342 | (html2text-delete-tags p1 p2 p3 p4) | |
343 | ) | |
344 | ||
345 | (defun html2text-clean-title (p1 p2 p3 p4) | |
346 | (put-text-property p2 p3 'face 'bold) | |
347 | (html2text-delete-tags p1 p2 p3 p4) | |
348 | ) | |
349 | ||
350 | (defun html2text-clean-underline (p1 p2 p3 p4) | |
351 | (put-text-property p2 p3 'face 'underline) | |
352 | (html2text-delete-tags p1 p2 p3 p4) | |
353 | ) | |
354 | ||
355 | (defun html2text-clean-italic (p1 p2 p3 p4) | |
356 | (put-text-property p2 p3 'face 'italic) | |
357 | (html2text-delete-tags p1 p2 p3 p4) | |
358 | ) | |
359 | ||
360 | (defun html2text-clean-font (p1 p2 p3 p4) | |
361 | (html2text-delete-tags p1 p2 p3 p4) | |
362 | ) | |
363 | ||
364 | (defun html2text-clean-blockquote (p1 p2 p3 p4) | |
365 | (html2text-delete-tags p1 p2 p3 p4) | |
366 | ) | |
367 | ||
368 | (defun html2text-clean-anchor (p1 p2 p3 p4) | |
369 | ;; If someone can explain how to make the URL clickable I will | |
370 | ;; surely improve upon this. | |
371 | (let* ((attr-list (html2text-get-attr p1 p2 "a")) | |
372 | (href (html2text-attr-value attr-list "href"))) | |
373 | (delete-region p1 p4) | |
374 | (when href | |
375 | (goto-char p1) | |
376 | (insert (substring href 1 -1 )) | |
377 | (put-text-property p1 (point) 'face 'bold)))) | |
378 | ||
379 | ;; | |
380 | ;; </Functions to be called to format a tag-pair> | |
381 | ;; | |
382 | ||
383 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
384 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
385 | ||
386 | ;; | |
387 | ;; <Functions to be called to fix up paragraphs> | |
388 | ;; | |
389 | ||
390 | (defun html2text-fix-paragraph (p1 p2) | |
391 | (goto-char p1) | |
392 | (let ((has-br-line) | |
393 | (refill-start) | |
394 | (refill-stop)) | |
395 | (if (re-search-forward "<br>$" p2 t) | |
396 | (setq has-br-line t) | |
397 | ) | |
398 | (if has-br-line | |
399 | (progn | |
400 | (goto-char p1) | |
401 | (if (re-search-forward ".+[^<][^b][^r][^>]$" p2 t) | |
402 | (progn | |
403 | (beginning-of-line) | |
404 | (setq refill-start (point)) | |
405 | (goto-char p2) | |
406 | (re-search-backward ".+[^<][^b][^r][^>]$" refill-start t) | |
407 | (next-line 1) | |
408 | (end-of-line) | |
409 | ;; refill-stop should ideally be adjusted to | |
410 | ;; accomodate the "<br>" strings which are removed | |
411 | ;; between refill-start and refill-stop. Can simply | |
412 | ;; be returned from my-replace-string | |
413 | (setq refill-stop (+ (point) | |
414 | (html2text-replace-string | |
415 | "<br>" "" | |
416 | refill-start (point)))) | |
417 | ;; (message "Point = %s refill-stop = %s" (point) refill-stop) | |
418 | ;; (sleep-for 4) | |
419 | (fill-region refill-start refill-stop) | |
420 | ) | |
421 | ) | |
422 | ) | |
423 | ) | |
424 | ) | |
425 | (html2text-replace-string "<br>" "" p1 p2) | |
426 | ) | |
427 | ||
428 | ;; | |
429 | ;; This one is interactive ... | |
430 | ;; | |
431 | (defun html2text-fix-paragraphs () | |
432 | "This _tries_ to fix up the paragraphs - this is done in quite a ad-hook | |
433 | fashion, quite close to pure guess-work. It does work in some cases though." | |
434 | (interactive) | |
435 | (html2text-buffer-head) | |
436 | (replace-regexp "^<br>$" "") | |
437 | ;; Removing lonely <br> on a single line, if they are left intact we | |
438 | ;; dont have any paragraphs at all. | |
439 | (html2text-buffer-head) | |
440 | (while (not (eobp)) | |
441 | (let ((p1 (point))) | |
442 | (forward-paragraph 1) | |
443 | ;;(message "Kaller fix med p1=%s p2=%s " p1 (1- (point))) (sleep-for 5) | |
444 | (html2text-fix-paragraph p1 (1- (point))) | |
445 | (goto-char p1) | |
446 | (when (not (eobp)) | |
447 | (forward-paragraph 1))))) | |
448 | ||
449 | ;; | |
450 | ;; </Functions to be called to fix up paragraphs> | |
451 | ;; | |
452 | ||
453 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
454 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
455 | ||
456 | ;; | |
457 | ;; <Interactive functions> | |
458 | ;; | |
459 | ||
460 | (defun html2text-remove-tags (tag-list) | |
461 | "Removes the tags listed in the list \"html2text-remove-tag-list\". | |
462 | See the documentation for that variable." | |
463 | (interactive) | |
464 | (dolist (tag tag-list) | |
465 | (html2text-buffer-head) | |
466 | (while (re-search-forward (format "\\(</?%s[^>]*>\\)" tag) (point-max) t) | |
467 | (delete-region (match-beginning 0) (match-end 0))))) | |
468 | ||
469 | (defun html2text-format-tags () | |
470 | "See the variable \"html2text-format-tag-list\" for documentation" | |
471 | (interactive) | |
472 | (dolist (tag-and-function html2text-format-tag-list) | |
473 | (let ((tag (car tag-and-function)) | |
474 | (function (cdr tag-and-function))) | |
475 | (html2text-buffer-head) | |
476 | (while (re-search-forward (format "\\(<%s\\( [^>]*\\)?>\\)" tag) | |
477 | (point-max) t) | |
478 | (let ((p1) | |
479 | (p2 (point)) | |
480 | (p3) (p4) | |
481 | (attr (match-string 1))) | |
482 | (search-backward "<" (point-min) t) | |
483 | (setq p1 (point)) | |
484 | (re-search-forward (format "</%s>" tag) (point-max) t) | |
485 | (setq p4 (point)) | |
486 | (search-backward "</" (point-min) t) | |
487 | (setq p3 (point)) | |
488 | (funcall function p1 p2 p3 p4) | |
489 | (goto-char p1) | |
490 | ) | |
491 | ) | |
492 | ) | |
493 | ) | |
494 | ) | |
495 | ||
496 | (defun html2text-substitute () | |
497 | "See the variable \"html2text-replace-list\" for documentation" | |
498 | (interactive) | |
499 | (dolist (e html2text-replace-list) | |
500 | (html2text-buffer-head) | |
501 | (let ((old-string (car e)) | |
502 | (new-string (cdr e))) | |
503 | (html2text-replace-string old-string new-string (point-min) (point-max)) | |
504 | ) | |
505 | ) | |
506 | ) | |
507 | ||
508 | (defun html2text-format-single-elements () | |
509 | "" | |
510 | (interactive) | |
511 | (dolist (tag-and-function html2text-format-single-element-list) | |
512 | (let ((tag (car tag-and-function)) | |
513 | (function (cdr tag-and-function))) | |
514 | (html2text-buffer-head) | |
515 | (while (re-search-forward (format "\\(<%s\\( [^>]*\\)?>\\)" tag) | |
516 | (point-max) t) | |
517 | (let ((p1) | |
518 | (p2 (point))) | |
519 | (search-backward "<" (point-min) t) | |
520 | (setq p1 (point)) | |
521 | (funcall function p1 p2) | |
522 | ) | |
523 | ) | |
524 | ) | |
525 | ) | |
526 | ) | |
527 | ||
528 | ;; | |
529 | ;; Main function | |
530 | ;; | |
531 | ||
532 | ;;;###autoload | |
533 | (defun html2text () | |
534 | "Convert HTML to plain text in the current buffer." | |
535 | (interactive) | |
536 | (save-excursion | |
537 | (let ((case-fold-search t) | |
538 | (buffer-read-only)) | |
539 | (html2text-remove-tags html2text-remove-tag-list) | |
540 | (html2text-format-tags) | |
541 | (html2text-remove-tags html2text-remove-tag-list2) | |
542 | (html2text-substitute) | |
543 | (html2text-format-single-elements) | |
544 | (html2text-fix-paragraphs)))) | |
545 | ||
546 | ;; | |
547 | ;; </Interactive functions> | |
548 | ;; | |
549 | ||
550 | ;;; arch-tag: e9e57b79-35d4-4de1-a647-e7e01fe56d1e | |
551 | ;;; html2text.el ends here |