improving eword-decode.el

2005年 10月 17日 (月) 15:49:08 JST

こんにちは山岡です。

先週 emacs-pretest-bug メーリングリストに、以下のような subject
が Gnus で正しくデコードされないという報告があり、半田さんが巧妙
なやり方で解決して下さいました。

Subject: =?UTF-8?B?W2lwdC5ydSAjMTYzXSDQkNCy0YLQvtCe0YLQstC10YI6INCc0KHQmjog0KHQ?= =?UTF-8?B?nyDRgtC10YHRgg==?=

これは、テキストを utf-8 でエンコードしたデータを、文字の境界で
はない場所で分割し、それぞれを B エンコードしたものです。Gnus の
rfc2047.el だけでなく、FLIM の eword-decode.el も、このようなも
のを扱うことができません。RFC2047 には次のような記述があります。

5. Use of encoded-words in message headers

[...]

   The 'encoded-text' in an 'encoded-word' must be self-contained;
   'encoded-text' MUST NOT be continued from one 'encoded-word' to
   another.  This implies that the 'encoded-text' portion of a "B"
   'encoded-word' will be a multiple of 4 characters long; for a "Q"
   'encoded-word', any "=" character that appears in the 'encoded-text'
   portion will be followed by two hexadecimal characters.

ぼくは最初、上記のようなものはこの「MUST NOT」に該当するのではな
いかと思ったのですが、半田さんからは以下の見解をいただきました。

This example doesn't violate the above restriction.  Each
'encoded-word' is surely "multiple of 4 characters long".

Please note that the above restriction is for
'encoded-text', not for the underlining coded character set.
So, I think the above document doesn't prohibit diviging
UTF-8 byte sequence at non-character boundary.

加えて、半田さんのコードの効率が良いこと、Gnus には何がなんでも
対応してしまう傾向があること、それに RMS からの命があることをもっ
て、ぼくは同意したのでした。:)

FLIM のデコーダはどうしましょうか?  分割された encoded words の
境界がテキストの文字単位になっていないものを他には見た記憶が無い
ので、現在のままでもたぶん問題は無いと思うのですが、いちおう書い
てみました。ただし、ご要望が無ければ CVS commit しないつもりです。

2005-10-17  Katsumi Yamaoka  <yamaoka ＠ jpl.org>

	* eword-decode.el: Change the way to decode successive
	encoded-words: decode B- or Q-encoding in each encoded-word,
	concatenate them, and decode it as charset.  See the following
	threads for more information:
	http://news.gmane.org/group/gmane.emacs.pretest.bugs/thread=9541
	http://news.gmane.org/group/gmane.emacs.gnus.general/thread=61176
	(eword-decode-encoded-words): New function.
	(eword-decode-string): Use it.
	(eword-decode-region): Use it.
	(eword-analyze-encoded-word): Use it.
	(eword-decode-encoded-word): Abolish.
	(eword-decode-encoded-text): Abolish.
	(eword-decode-encoded-word-error-handler): Abolish.
	(eword-warning-face): Abolish.
	(eword-decode-encoded-word-default-error-handler): Abolish.

-------------- next part --------------

--- eword-decode.el~	2005-07-06 01:55:39 +0000
+++ eword-decode.el	2005-10-17 06:45:12 +0000
@@ -88,30 +88,31 @@
 if there are in decoded encoded-words (generated by bad manner MUA
 such as a version of Net$cape)."
   (setq string (std11-unfold-string string))
-  (let ((dest "")(ew nil)
-	beg end)
-    (while (and (string-match eword-encoded-word-regexp string)
-		(setq beg (match-beginning 0)
-		      end (match-end 0))
-		)
-      (if (> beg 0)
-	  (if (not
-	       (and (eq ew t)
-		    (string-match "^[ \t]+$" (substring string 0 beg))
-		    ))
-	      (setq dest (concat dest (substring string 0 beg)))
-	    )
-	)
-      (setq dest
-	    (concat dest
-		    (eword-decode-encoded-word
-		     (substring string beg end) must-unfold)
-		    ))
-      (setq string (substring string end))
-      (setq ew t)
-      )
-    (concat dest string)
-    ))
+  (let (start end words)
+    (while (string-match eword-encoded-word-regexp string)
+      (setq start (match-beginning 0)
+	    end (match-end 0)
+	    words (list (list (match-string 1 string) ;; charset
+			      (match-string 2 string) ;; language
+			      (match-string 3 string) ;; encoding
+			      (match-string 4 string)))) ;; word
+      (while (and (string-match (eval-when-compile
+				  (concat "\\([\n\t ]*\\)"
+					  eword-encoded-word-regexp))
+				string end)
+		  (= end (match-beginning 0)))
+	(setq end (match-end 0))
+	(push (list (match-string 2 string) ;; charset
+		    (match-string 3 string) ;; language
+		    (match-string 4 string) ;; encoding
+		    (match-string 5 string)) ;; word
+	      words))
+      (when (setq words (eword-decode-encoded-words (nreverse words)
+						    must-unfold))
+	(setq string (concat (substring string 0 start)
+			     words
+			     (substring string end))))))
+  string)
 
 (defun eword-decode-structured-field-body (string
 					   &optional start-column max-column
@@ -223,24 +224,29 @@
     (save-restriction
       (narrow-to-region start end)
       (if unfolding
-	  (eword-decode-unfold)
-	)
+	  (eword-decode-unfold))
       (goto-char (point-min))
-      (while (re-search-forward (concat "\\(" eword-encoded-word-regexp "\\)"
-                                        "\\(\n?[ \t]\\)+"
-                                        "\\(" eword-encoded-word-regexp "\\)")
-                                nil t)
-	(replace-match "\\1\\7")
-        (goto-char (point-min))
-	)
-      (while (re-search-forward eword-encoded-word-regexp nil t)
-	(insert (eword-decode-encoded-word
-		 (prog1
-		     (buffer-substring (match-beginning 0) (match-end 0))
-		   (delete-region (match-beginning 0) (match-end 0))
-		   ) must-unfold))
-	)
-      )))
+      (let (start end words)
+	(while (re-search-forward eword-encoded-word-regexp nil t)
+	  (setq start (match-beginning 0)
+		end (match-end 0)
+		words (list (list (match-string 1) ;; charset
+				  (match-string 2) ;; language
+				  (match-string 3) ;; encoding
+				  (match-string 4)))) ;; word
+	  (while (looking-at (eval-when-compile
+			       (concat "\\([\n\t ]*\\)"
+				       eword-encoded-word-regexp)))
+	    (goto-char (setq end (match-end 0)))
+	    (push (list (match-string 2) ;; charset
+			(match-string 3) ;; language
+			(match-string 4) ;; encoding
+			(match-string 5)) ;; word
+		  words))
+	  (when (setq words (eword-decode-encoded-words (nreverse words)
+							must-unfold))
+	    (delete-region start end)
+	    (insert words)))))))
 
 (defun eword-decode-unfold ()
   (goto-char (point-min))
@@ -511,86 +517,53 @@
 (make-obsolete 'eword-decode-header 'mime-decode-header-in-buffer)
 
 
-;;; @ encoded-word decoder
-;;;
-
-(defvar eword-decode-encoded-word-error-handler
-  'eword-decode-encoded-word-default-error-handler)
-
-(defvar eword-warning-face nil
-  "Face used for invalid encoded-word.")
-
-(defun eword-decode-encoded-word-default-error-handler (word signal)
-  (and (add-text-properties 0 (length word)
-			    (and eword-warning-face
-				 (list 'face eword-warning-face))
-			    word)
-       word))
-
-(defun eword-decode-encoded-word (word &optional must-unfold)
-  "Decode WORD as an encoded-word.
-
-If charset is unknown or unsupported, return WORD.
-If encoding is unknown, or some error occurs while decoding,
-`eword-decode-encoded-word-error-handler' is called with WORD and an
-error condition.
-
-If MUST-UNFOLD is non-nil, unfold decoded WORD."
-  (or (and (string-match eword-encoded-word-regexp word)
-	   (condition-case err
-	       (eword-decode-encoded-text
-		;; charset
-		(substring word (match-beginning 1)(match-end 1))
-		;; language
-		(when (match-beginning 2)
-		  (intern
-		   (downcase
-		    (substring word (1+ (match-beginning 2))(match-end 2)))))
-		;; encoding
-		(upcase
-		 (substring word (match-beginning 3)(match-end 3)))
-		;; encoded-text
-		(substring word (match-beginning 4)(match-end 4))
-		must-unfold)
-	     (error
-	      (funcall eword-decode-encoded-word-error-handler word err))))
-      word))
-
-
-;;; @ encoded-text decoder
+;;; @ encoded-words decoder
 ;;;
 
-(defun eword-decode-encoded-text (charset language encoding string
-					  &optional must-unfold)
-  "Decode STRING as an encoded-text.
-
-If your emacs implementation can not decode CHARSET, it returns nil.
-
-If LANGUAGE is non-nil, it is put to `mime-language' text-property.
-If ENCODING is not \"B\" or \"Q\", it occurs error.
-So you should write error-handling code if you don't want break by errors.
+(defun eword-decode-encoded-words (words must-unfold)
+  "Decode successive encoded-words in WORDS and return a decoded string.
+Each element of WORDS looks like (CHARSET LANGUAGE ENCODING DATA).
 
 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
-if there are in decoded encoded-text (generated by bad manner MUA such
-as a version of Net$cape)."
-  (when (mime-charset-to-coding-system charset)
-    (let ((dest (encoded-text-decode-string string encoding)))
-      (when dest
-	(setq dest (decode-mime-charset-string dest charset))
+if there are in decoded encoded-words (generated by bad manner MUA
+such as a version of Net$cape)."
+  (let (word charset encoding language rest)
+    (catch 'invalid
+      (while words
+	(setq word (pop words))
+	(unless (mime-charset-to-coding-system (setq charset (car word)))
+	  (message "Invalid charset: %s" charset)
+	  (throw 'invalid nil))
+	(setq encoding (nth 2 word))
+	(cond ((member encoding '("B" "Q")))
+	      ((member encoding '("b" "q"))
+	       (setq encoding (upcase encoding)))
+	      (t
+	       (message "Invalid encoding: $s" encoding)
+	       (throw 'invalid nil)))
+	(setq language (nth 1 word)
+	      word (encoded-text-decode-string (nth 3 word) encoding))
+	(if (and rest
+		 (string-equal charset (caaar rest))
+		 (equal language (cdaar rest)))
+	    (setcdr (car rest) (concat (cdar rest) word))
+	  (push (cons (cons charset language) word) rest)))
+      (setq words "")
+      (while rest
+	(setq word (decode-mime-charset-string (cdar rest) (caaar rest)))
 	(when must-unfold
-	  (setq dest
-		(mapconcat
-		 (function
-		  (lambda (chr)
-		    (cond ((eq chr ?\n) "")
-			  ((eq chr ?\r) "")
-			  ((eq chr ?\t) " ")
-			  (t (char-to-string chr)))))
-		 (std11-unfold-string dest) "")))
-	(when language
-	  (put-text-property 0 (length dest) 'mime-language language dest))
-	dest))))
-
+	  (setq word (mapconcat (lambda (chr)
+				  (cond ((eq chr ?\n) "")
+					((eq chr ?\r) "")
+					((eq chr ?\t) " ")
+					(t (char-to-string chr))))
+				(std11-unfold-string word)
+				"")))
+	(when (setq language (cdaar rest))
+	  (put-text-property 0 (length word) 'mime-language language word))
+	(setq words (concat word words)
+	      rest (cdr rest)))
+      words)))
 
 ;;; @ lexical analyze
 ;;;
@@ -713,28 +686,24 @@
   (if (and (string-match eword-encoded-word-regexp string start)
 	   (= (match-beginning 0) start))
       (let ((end (match-end 0))
-	    (dest (eword-decode-encoded-word (match-string 0 string)
-					     must-unfold))
-	    )
-	;;(setq string (substring string end))
-	(setq start end)
+	    (words (list (list (match-string 1 string) ;; charset
+			       (match-string 2 string) ;; language
+			       (match-string 3 string) ;; encoding
+			       (match-string 4 string))))) ;; word
 	(while (and (string-match (eval-when-compile
-				    (concat "[ \t\n]*\\("
-					    eword-encoded-word-regexp
-					    "\\)"))
-				  string start)
-		    (= (match-beginning 0) start))
+				    (concat "\\([\n\t ]*\\)"
+					    eword-encoded-word-regexp))
+				  string end)
+		    (= end (match-beginning 0)))
 	  (setq end (match-end 0))
-	  (setq dest
-		(concat dest
-			(eword-decode-encoded-word (match-string 1 string)
-						   must-unfold))
-		;;string (substring string end))
-		start end)
-	  )
-	(cons (cons 'atom dest) ;;string)
-	      end)
-	)))
+	  (push (list (match-string 2 string) ;; charset
+		      (match-string 3 string) ;; language
+		      (match-string 4 string) ;; encoding
+		      (match-string 5 string)) ;; word
+		words))
+	(when (setq words (eword-decode-encoded-words (nreverse words)
+						      must-unfold))
+	  (cons (cons 'atom words) end)))))
 
 (defun eword-analyze-atom (string start &optional must-unfold)
   (if (and (string-match std11-atom-regexp string start)