Modify the coding system compound-text-with-extensions to conform to the spec of Compound Text.

2024-11-25 07:28:20 +00:00 · 2010-08-04 17:06:52 +09:00 · 2010-08-04 17:06:52 +09:00 · 6b4d96c2f0
commit 6b4d96c2f0
parent 4ce5a4ccd4
4 changed files with 104 additions and 81 deletions
--- a/lisp/ChangeLog
+++ b/lisp/ChangeLog
@ -1,3 +1,20 @@
+2010-08-04  Kenichi Handa  <handa@m17n.org>
+
+	* language/cyrillic.el: Don't add "microsoft-cp1251" to
+	ctext-non-standard-encodings-alist here.
+
+	* international/mule.el (ctext-non-standard-encodings-alist): Add
+	"koi8-r" and "microsoft-cp1251".
+	(ctext-standard-encodings): New variable.
+	(ctext-non-standard-encodings-table): List only elements for
+	non-standard encodings.
+	(ctext-pre-write-conversion): Adjusted for the above change.
+	Check ctext-standard-encodings.
+
+	* international/mule-conf.el (compound-text): Doc fix.
+	(ctext-no-compositions): Doc fix.
+	(compound-text-with-extensions): Doc fix.
+
 2010-07-23  Juanma Barranquero  <lekktu@gmail.com>

 	* help-fns.el (find-lisp-object-file-name): Doc fix (bug#6494).
--- a/lisp/international/mule-conf.el
+++ b/lisp/international/mule-conf.el
@ -1410,9 +1410,10 @@ is treated as a character."
  :flags '(ascii-at-eol ascii-at-cntl designation single-shift composition))

 (define-coding-system 'compound-text
-  "Compound text based generic encoding for decoding unknown messages.
-
-This coding system does not support extended segments of CTEXT."
+  "Compound text based generic encoding.
+This coding system is an extension of X's \"Compound Text Encoding\".
+It encodes many characters using the normal ISO-2022 designation sequences,
+but it doesn't support extended segments of CTEXT."
  :coding-type 'iso-2022
  :mnemonic ?x
  :charset-list 'iso-2022
@ -1432,7 +1433,7 @@ This coding system does not support extended segments of CTEXT."
 ;; not have a mime-charset property, to prevent it from showing up
 ;; close to the beginning of coding systems ordered by priority.
 (define-coding-system 'ctext-no-compositions
- "Compound text based generic encoding for decoding unknown messages.
+ "Compound text based generic encoding.

 Like `compound-text', but does not produce escape sequences for compositions."
  :coding-type 'iso-2022
@ -1445,8 +1446,9 @@ Like `compound-text', but does not produce escape sequences for compositions."
 (define-coding-system 'compound-text-with-extensions
 "Compound text encoding with ICCCM Extended Segment extensions.

-See the variable `ctext-non-standard-encodings-alist' for the
-detail about how extended segments are handled.
+See the variables `ctext-standard-encodings' and
+`ctext-non-standard-encodings-alist' for the detail about how
+extended segments are handled.

 This coding system should be used only for X selections.  It is inappropriate
 for decoding and encoding files, process I/O, etc."
--- a/lisp/international/mule.el
+++ b/lisp/international/mule.el
@ -1408,7 +1408,9 @@ This function is provided for backward compatibility."
  '(("big5-0" big5 2 big5)
    ("ISO8859-14" iso-8859-14 1 latin-iso8859-14)
    ("ISO8859-15" iso-8859-15 1 latin-iso8859-15)
-    ("gbk-0" gbk 2 chinese-gbk)))
+    ("gbk-0" gbk 2 chinese-gbk)
+    ("koi8-r" koi8-r 1 koi8-r)
+    ("microsoft-cp1251" windows-1251 1 windows-1251)))
  "Alist of non-standard encoding names vs the corresponding usages in CTEXT.

 It controls how extended segments of a compound text are handled
@ -1497,6 +1499,20 @@ Each element must be one of the names listed in the variable
      (goto-char (point-min))
      (- (point-max) (point)))))

+(defvar ctext-standard-encodings
+  '(ascii latin-jisx0201 katakana-jisx0201
+	  latin-iso8859-1 latin-iso8859-2 latin-iso8859-3 latin-iso8859-4
+	  greek-iso8859-7 arabic-iso8859-6 hebrew-iso8859-8 cyrillic-iso8859-5
+	  latin-iso8859-9
+	  chinese-gb2312 japanese-jisx0208 korean-ksc5601)
+  "List of approved standard encodings (i.e. charsets) of X's Compound Text.
+Coding-system `compound-text-with-extensions' encodes a character
+belonging to any of those charsets using the normal ISO2022
+designation sequence unless the current language environment or
+the variable `ctext-non-standard-encodings' decide to use an extended
+segment of CTEXT for that character.  See also the documentation
+of `ctext-non-standard-encodings-alist'.")
+
 ;; Return an alist of CHARSET vs CTEXT-USAGE-INFO generated from
 ;; `ctext-non-standard-encodings' and a list specified by the key
 ;; `ctext-non-standard-encodings' for the currrent language
@ -1508,77 +1524,74 @@ Each element must be one of the names listed in the variable
 ;; is encoded using UTF-8 encoding extention.

 (defun ctext-non-standard-encodings-table ()
-  (let (table)
-    ;; Setup charsets specified by the key
-    ;; `ctext-non-standard-encodings' for the current language
-    ;; environment and in `ctext-non-standard-encodings'.
-    (dolist (encoding (append
-			(get-language-info current-language-environment
-					   'ctext-non-standard-encodings)
-			ctext-non-standard-encodings))
-      (let* ((slot (assoc encoding ctext-non-standard-encodings-alist))
+  (let* ((table (append ctext-non-standard-encodings
+			(copy-sequence
+			 (get-language-info current-language-environment
+					    'ctext-non-standard-encodings))))
+	 (tail table)
+	 elt)
+    (while tail
+      (setq elt (car tail))
+      (let* ((slot (assoc elt ctext-non-standard-encodings-alist))
 	     (charset (nth 3 slot)))
 	(if (charsetp charset)
-	    (push (cons charset slot) table)
-	  (dolist (cs charset)
-	    (push (cons cs slot) table)))))
-
-    ;; Next prepend charsets for ISO2022 designation sequence.
-    (dolist (charset charset-list)
-      (let ((final (plist-get (charset-plist charset) :iso-final-char)))
-	(if (and (integerp final)
-		 (>= final #x40) (<= final #x7e)
-		 ;; Exclude ascii and chinese-cns11643-X.
-		 (not (eq charset 'ascii))
-		 (not (string-match "cns11643" (symbol-name charset))))
-	    (push (cons charset nil) table))))
-
-    ;; Returned reversed list so that the charsets specified by the
-    ;; key `ctext-non-standard-encodings' for the current language
-    ;; have the highest priority.
-    (nreverse table)))
+	    (setcar tail (cons charset slot))
+	  (setcar tail (cons (car charset) slot))
+	  (dolist (cs (cdr charset))
+	    (setcdr tail
+		    (cons (cons (car cs) slot) (cdr tail)))
+	    (setq tail (cdr tail))))
+	(setq tail (cdr tail))))
+    table))

 (defun ctext-pre-write-conversion (from to)
  "Encode characters between FROM and TO as Compound Text w/Extended Segments.

-If FROM is a string, or if the current buffer is not the one set up for us
-by `encode-coding-string', generate a new temp buffer, insert the text,
-and convert it in the temporary buffer.  Otherwise, convert in-place."
+If FROM is a string, generate a new temp buffer, insert the text,
+and convert it in the temporary buffer.  Otherwise, convert
+in-place."
  (save-match-data
    ;; Setup a working buffer if necessary.
    (when (stringp from)
      (set-buffer (generate-new-buffer " *temp"))
      (set-buffer-multibyte (multibyte-string-p from))
-      (insert from))
-
-    ;; Now we can encode the whole buffer.
-    (let ((encoding-table (ctext-non-standard-encodings-table))
-	  last-coding-system-used
-	  last-pos last-encoding-info
-	  encoding-info end-pos ch)
-      (goto-char (setq last-pos (point-min)))
-      (setq end-pos (point-marker))
-      (while (re-search-forward "[^\000-\177]+" nil t)
-	;; Found a sequence of non-ASCII characters.
-	(setq last-pos (match-beginning 0)
-	      ch (char-after last-pos)
-	      last-encoding-info (catch 'tag
-				   (dolist (elt encoding-table)
-				     (if (encode-char ch (car elt))
-					 (throw 'tag (cdr elt))))
-				   'utf-8))
-	(set-marker end-pos (match-end 0))
-	(goto-char (1+ last-pos))
-	(catch 'tag
-	  (while t
-	    (setq encoding-info
-		  (if (< (point) end-pos)
-		      (catch 'tag
-			(setq ch (following-char))
-			(dolist (elt encoding-table)
-			  (if (encode-char ch (car elt))
-			      (throw 'tag (cdr elt))))
-			'utf-8)))
+      (insert from)
+      (setq from 1 to (point-max)))
+    (save-restriction
+      (narrow-to-region from to)
+      (let ((encoding-table (ctext-non-standard-encodings-table))
+	    (charset-list ctext-standard-encodings)
+	    last-coding-system-used
+	    last-pos last-encoding-info
+	    encoding-info end-pos ch charset)
+	(dolist (elt encoding-table)
+	  (push (car elt) charset-list))
+	(goto-char (setq last-pos from))
+	(setq end-pos (point-marker))
+	(while (re-search-forward "[^\000-\177]+" nil t)
+	  ;; Found a sequence of non-ASCII characters.
+	  (setq last-pos (match-beginning 0)
+		ch (char-after last-pos)
+		charset (char-charset ch charset-list)
+		last-encoding-info
+		(if charset
+		    (or (cdr (assq charset encoding-table))
+			charset)
+		  'utf-8))
+	  (set-marker end-pos (match-end 0))
+	  (goto-char (1+ last-pos))
+	  (while (marker-position end-pos)
+	    (if (< (point) end-pos)
+		(progn
+		  (setq charset (char-charset (following-char) charset-list)
+			encoding-info
+			(if charset
+			    (or (cdr (assq charset encoding-table))
+				charset)
+			  'utf-8))
+		  (forward-char 1))
+	      (setq encoding-info nil)
+	      (set-marker end-pos nil))
 	    (unless (eq last-encoding-info encoding-info)
 	      (cond ((consp last-encoding-info)
 		     ;; Encode the previous range using an extended
@ -1609,14 +1622,12 @@ and convert it in the temporary buffer.  Otherwise, convert in-place."
 		     (save-excursion
 		       (goto-char last-pos)
 		       (insert "\e%G"))
-		     (insert "\e%@")))
+		     (insert "\e%@"))
+		    (t
+		     (put-text-property last-pos (point) 'charset charset)))
 	      (setq last-pos (point)
-		    last-encoding-info encoding-info))
-	    (if (< (point) end-pos)
-		(forward-char 1)
-	      (throw 'tag nil)))))
-      (set-marker end-pos nil)
-      (goto-char (point-min))))
+		    last-encoding-info encoding-info))))
+	(goto-char (point-min)))))
  ;; Must return nil, as build_annotations_2 expects that.
  nil)

--- a/lisp/language/cyrillic.el
+++ b/lisp/language/cyrillic.el
@ -239,13 +239,6 @@ Support for Russian using koi8-r and the russian-computer input method.")
 	   (documentation . "Support for Tajik using KOI8-T."))
 '("Cyrillic"))

-(let ((elt `("microsoft-cp1251" windows-1251 1
-	     ,(get 'encode-windows-1251 'translation-table)))
-      (slot (assoc "microsoft-cp1251" ctext-non-standard-encodings-alist)))
-  (if slot
-      (setcdr slot (cdr elt))
-    (push elt ctext-non-standard-encodings-alist)))
-
 (set-language-info-alist
 "Bulgarian" `((coding-system windows-1251)
 	       (coding-priority windows-1251)