mirror of
https://git.savannah.gnu.org/git/emacs.git
synced 2024-11-23 07:19:15 +00:00
3d0e328b95
invalid UTF-8 sequences.
378 lines
11 KiB
EmacsLisp
378 lines
11 KiB
EmacsLisp
;;; utf-8.el --- Limited UTF-8 decoding/encoding support
|
|
|
|
;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
|
|
;; Licensed to the Free Software Foundation.
|
|
;; Copyright (C) 2001 Free Software Foundation, Inc.
|
|
|
|
;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
|
|
;; Keywords: multilingual, Unicode, UTF-8, i18n
|
|
|
|
;; This file is part of GNU Emacs.
|
|
|
|
;; GNU Emacs is free software; you can redistribute it and/or modify
|
|
;; it under the terms of the GNU General Public License as published by
|
|
;; the Free Software Foundation; either version 2, or (at your option)
|
|
;; any later version.
|
|
|
|
;; GNU Emacs is distributed in the hope that it will be useful,
|
|
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
;; GNU General Public License for more details.
|
|
|
|
;; You should have received a copy of the GNU General Public License
|
|
;; along with GNU Emacs; see the file COPYING. If not, write to the
|
|
;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
;; Boston, MA 02111-1307, USA.
|
|
|
|
;;; Commentary:

;; The coding-system `mule-utf-8' supports encoding/decoding of the
;; following character sets to and from UTF-8:
;;
;;   ascii
;;   eight-bit-control
;;   latin-iso8859-1
;;   mule-unicode-0100-24ff
;;   mule-unicode-2500-33ff
;;   mule-unicode-e000-ffff
;;
;; Characters of other character sets cannot be encoded with
;; mule-utf-8.  Note that the mule-unicode charsets currently lack
;; case and syntax information, so things like `downcase' will only
;; work for characters from ASCII and Latin-1.
;;
;; On decoding, Unicode characters that do not fit into the above
;; character sets are handled as `eight-bit-control' or
;; `eight-bit-graphic' characters to retain the information about the
;; original byte sequence.
|
|
|
|
;; UTF-8 is defined in RFC 2279 (since obsoleted by RFC 3629).  A
;; sketch of the encoding, for scalar values up to U+FFFF, is:

;;        scalar       |               utf-8
;;        value        | 1st byte  | 2nd byte  | 3rd byte
;; --------------------+-----------+-----------+----------
;; 0000 0000 0xxx xxxx | 0xxx xxxx |           |
;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
|
|
|
|
;;; Code:
|
|
|
|
(define-ccl-program ccl-decode-mule-utf-8
  ;;
  ;;        charset         | bytes in utf-8 | bytes in emacs
  ;; -----------------------+----------------+---------------
  ;;         ascii          |       1        |       1
  ;; -----------------------+----------------+---------------
  ;;    eight-bit-control   |       2        |       2
  ;;     latin-iso8859-1    |       2        |       2
  ;; -----------------------+----------------+---------------
  ;; mule-unicode-0100-24ff |       2        |       4
  ;;        (< 0800)        |                |
  ;; -----------------------+----------------+---------------
  ;; mule-unicode-0100-24ff |       3        |       4
  ;;        (>= 0800)       |                |
  ;; mule-unicode-2500-33ff |       3        |       4
  ;; mule-unicode-e000-ffff |       3        |       4
  ;;
  ;; Thus magnification factor is two.
  ;;
  ;; Register usage:
  ;;   r0      lead byte; later reused for the target charset id
  ;;   r1..r3  trailing bytes; r1 is reused for the Emacs code point
  ;;   r3      scalar value of a 3-byte sequence
  ;;   r4      1 when a 3-byte sequence is found to be invalid
  ;;   r5, r6  charset ids of eight-bit-control / eight-bit-graphic
  ;;   r7      remainder left behind by the `//=' operator
  ;;
  ;; Bytes that cannot be decoded are kept one by one: bytes below
  ;; #x80 are written as they are, bytes in #x80..#x9f become
  ;; eight-bit-control and bytes from #xa0 up become eight-bit-graphic.
  ;;
  `(2
    ((r5 = ,(charset-id 'eight-bit-control))
     (r6 = ,(charset-id 'eight-bit-graphic))
     (loop
      (read r0)

      ;; 1byte encoding, i.e., ascii
      (if (r0 < #x80)
          (write r0)

        ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
        ;; NOTE(review): only `r0 < #xe0' is tested here, so lead bytes
        ;; in #x80..#xc1 (stray continuation bytes and overlong forms)
        ;; also take this branch when a continuation byte follows.
        (if (r0 < #xe0)
            ((read r1)

             (if ((r1 & #b11000000) != #b10000000)
                 ;; Invalid 2-byte sequence
                 ((if (r0 < #xa0)
                      (write-multibyte-character r5 r0)
                    (write-multibyte-character r6 r0))
                  (if (r1 < #x80)
                      (write r1)
                    (if (r1 < #xa0)
                        (write-multibyte-character r5 r1)
                      (write-multibyte-character r6 r1))))

               ((r0 &= #x1f)
                (r0 <<= 6)
                (r1 &= #x3f)
                (r1 += r0)
                ;; Now r1 holds scalar value

                ;; eight-bit-control
                (if (r1 < 160)
                    ((write-multibyte-character r5 r1))

                  ;; latin-iso8859-1
                  (if (r1 < 256)
                      ((r0 = ,(charset-id 'latin-iso8859-1))
                       (r1 -= 128)
                       (write-multibyte-character r0 r1))

                    ;; mule-unicode-0100-24ff (< 0800)
                    ;; The offset into the charset is split into a
                    ;; 96x96 row/column pair, each biased by 32.
                    ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
                     (r1 -= #x0100)
                     (r2 = (((r1 / 96) + 32) << 7))
                     (r1 %= 96)
                     (r1 += (r2 + 32))
                     (write-multibyte-character r0 r1)))))))

          ;; 3byte encoding
          ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
          (if (r0 < #xf0)
              ((read r1 r2)

               ;; This is set to 1 if the encoding is invalid.
               (r4 = 0)

               ;; Gather the top two bits of r1 and of r2 into r3; both
               ;; bytes are continuation bytes (10xxxxxx) exactly when
               ;; the result is #b10100000.
               (r3 = (r1 & #b11000000))
               (r3 |= ((r2 >> 2) & #b00110000))
               (if (r3 != #b10100000)
                   (r4 = 1)
                 ((r3 = ((r0 & #x0f) << 12))
                  (r3 += ((r1 & #x3f) << 6))
                  (r3 += (r2 & #x3f))
                  ;; A scalar below #x0800 needs fewer than 3 bytes.
                  (if (r3 < #x0800)
                      (r4 = 1))))

               (if (r4 != 0)
                   ;; Invalid 3-byte sequence
                   ((if (r0 < #xa0)
                        (write-multibyte-character r5 r0)
                      (write-multibyte-character r6 r0))
                    (if (r1 < #x80)
                        (write r1)
                      (if (r1 < #xa0)
                          (write-multibyte-character r5 r1)
                        (write-multibyte-character r6 r1)))
                    (if (r2 < #x80)
                        (write r2)
                      (if (r2 < #xa0)
                          (write-multibyte-character r5 r2)
                        (write-multibyte-character r6 r2))))

                 ;; mule-unicode-0100-24ff (>= 0800)
                 ((if (r3 < #x2500)
                      ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
                       (r3 -= #x0100)
                       (r3 //= 96)      ; quotient in r3, remainder in r7
                       (r1 = (r7 + 32))
                       (r1 += ((r3 + 32) << 7))
                       (write-multibyte-character r0 r1))

                    ;; mule-unicode-2500-33ff
                    (if (r3 < #x3400)
                        ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
                         (r3 -= #x2500)
                         (r3 //= 96)
                         (r1 = (r7 + 32))
                         (r1 += ((r3 + 32) << 7))
                         (write-multibyte-character r0 r1))

                      ;; U+3400 .. U+DFFF
                      ;; keep those bytes as eight-bit-{control|graphic}
                      (if (r3 < #xe000)
                          ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
                           (r3 = r6)
                           (write-multibyte-character r3 r0)
                           (if (r1 < #xa0)
                               (r3 = r5))
                           (write-multibyte-character r3 r1)
                           (if (r2 < #xa0)
                               (r3 = r5)
                             (r3 = r6))
                           (write-multibyte-character r3 r2))

                        ;; mule-unicode-e000-ffff
                        ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
                         (r3 -= #xe000)
                         (r3 //= 96)
                         (r1 = (r7 + 32))
                         (r1 += ((r3 + 32) << 7))
                         (write-multibyte-character r0 r1))))))))

            ;; 4byte encoding
            ;; keep those bytes as eight-bit-{control|graphic}
            ;; The trailing bytes are not validated, so a byte below
            ;; #x80 may show up here; write it unchanged, as the invalid
            ;; 2- and 3-byte cases above do.
            ((read r1 r2 r3)
             ;; r0 >= #xf0, thus eight-bit-graphic
             (write-multibyte-character r6 r0)
             (if (r1 < #x80)
                 (write r1)
               (if (r1 < #xa0)
                   (write-multibyte-character r5 r1)
                 (write-multibyte-character r6 r1)))
             (if (r2 < #x80)
                 (write r2)
               (if (r2 < #xa0)
                   (write-multibyte-character r5 r2)
                 (write-multibyte-character r6 r2)))
             (if (r3 < #x80)
                 (write r3)
               (if (r3 < #xa0)
                   (write-multibyte-character r5 r3)
                 (write-multibyte-character r6 r3)))))))

      (repeat))))

  "CCL program to decode UTF-8.
Basic decoding is done into the charsets ascii, latin-iso8859-1 and
mule-unicode-*.  Encodings of un-representable Unicode characters are
decoded as is into eight-bit-control and eight-bit-graphic
characters.")
|
|
|
|
(define-ccl-program ccl-encode-mule-utf-8
  ;; Register usage:
  ;;   r0      charset id of the current character; reused for the
  ;;           first UTF-8 byte
  ;;   r1      code point within the charset; reused for the scalar
  ;;           value and then for the second UTF-8 byte
  ;;   r2      third UTF-8 byte, or a look-ahead code point
  ;;   r5, r6  charset id and code point of a character that has been
  ;;           read ahead but not yet encoded; r5 is -1 when none
  ;;
  ;; The last form of the program (after the main block) is the one
  ;; run when the input is exhausted.
  `(1
    ((r5 = -1)
     (loop
      (if (r5 < 0)
          ((r1 = -1)
           (read-multibyte-character r0 r1))
        (;; We have already done read-multibyte-character.
         (r0 = r5)
         (r1 = r6)
         (r5 = -1)))

      (if (r0 == ,(charset-id 'ascii))
          (write r1)

        (if (r0 == ,(charset-id 'latin-iso8859-1))
            ;; r1          scalar                  utf-8
            ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
            ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
            ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
            ((r0 = (((r1 & #x40) >> 6) | #xc2))
             (r1 &= #x3f)
             (r1 |= #x80)
             (write r0 r1))

          (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
              ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
               ;; #x3f80 == (0011 1111 1000 0000)b
               (r1 &= #x7f)
               (r1 += (r0 + 224))       ; 224 == -32 + #x0100
               ;; now r1 holds scalar value
               (if (r1 < #x0800)
                   ;; 2byte encoding
                   ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
                    ;; #x07c0 == (0000 0111 1100 0000)b
                    (r1 &= #x3f)
                    (r1 |= #x80)
                    (write r0 r1))
                 ;; 3byte encoding
                 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
                  (r2 = ((r1 & #x3f) | #x80))
                  (r1 &= #x0fc0)
                  (r1 >>= 6)
                  (r1 |= #x80)
                  (write r0 r1 r2))))

            (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
                ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
                 (r1 &= #x7f)
                 (r1 += (r0 + 9440))    ; 9440 == -32 + #x2500
                 ;; now r1 holds scalar value; always 3byte encoding
                 (r0 = (((r1 & #xf000) >> 12) | #xe0))
                 (r2 = ((r1 & #x3f) | #x80))
                 (r1 &= #x0fc0)
                 (r1 >>= 6)
                 (r1 |= #x80)
                 (write r0 r1 r2))

              (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
                  ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
                   (r1 &= #x7f)
                   (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
                   ;; now r1 holds scalar value; always 3byte encoding
                   (r0 = (((r1 & #xf000) >> 12) | #xe0))
                   (r2 = ((r1 & #x3f) | #x80))
                   (r1 &= #x0fc0)
                   (r1 >>= 6)
                   (r1 |= #x80)
                   (write r0 r1 r2))

                (if (r0 == ,(charset-id 'eight-bit-control))
                    ;; r1          scalar                  utf-8
                    ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
                    ;; 80    0000 0000 1000 0000    1100 0010 1000 0000
                    ;; 9f    0000 0000 1001 1111    1100 0010 1001 1111
                    ((write #xc2)
                     (write r1))

                  (if (r0 == ,(charset-id 'eight-bit-graphic))
                      ;; The byte itself is written unchanged, then up
                      ;; to two more characters are read ahead.  A
                      ;; character that is neither eight-bit-graphic nor
                      ;; eight-bit-control is parked in r5/r6 and
                      ;; encoded on the next iteration.
                      ((write r1)
                       (r1 = -1)
                       (read-multibyte-character r0 r1)
                       (if (r0 != ,(charset-id 'eight-bit-graphic))
                           (if (r0 != ,(charset-id 'eight-bit-control))
                               ((r5 = r0)
                                (r6 = r1))))
                       (if (r5 < 0)
                           ((read-multibyte-character r0 r2)
                            (if (r0 != ,(charset-id 'eight-bit-graphic))
                                (if (r0 != ,(charset-id 'eight-bit-control))
                                    ((r5 = r0)
                                     (r6 = r2))))
                            (if (r5 < 0)
                                ;; Two more eight-bit characters: write
                                ;; both bytes unchanged.
                                (write r1 r2)
                              ;; NOTE(review): this test and the
                              ;; end-of-input handler at the bottom of
                              ;; the program make opposite choices for
                              ;; the same pending byte in r1 (below #xa0
                              ;; is written raw here but prefixed with
                              ;; #xc2 there, and vice versa).  Confirm
                              ;; which one is intended.
                              (if (r1 < #xa0)
                                  (write r1)
                                ((write #xc2)
                                 (write r1)))))))

                    ;; Unsupported character.
                    ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
                    ((write #xef)
                     (write #xbf)
                     (write #xbd)))))))))
      (repeat)))

    ;; End of input.  r1 is -1 unless an eight-bit character read ahead
    ;; in the eight-bit-graphic branch is still waiting to be written.
    (if (r1 >= #xa0)
        (write r1)
      (if (r1 >= #x80)
          ((write #xc2)
           (write r1)))))

  "CCL program to encode into UTF-8.
Only characters from the charsets ascii, eight-bit-control,
eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
Others are encoded as U+FFFD.")
|
|
|
|
;; Define the `mule-utf-8' coding system.  Type 4 selects a CCL-based
;; coding system, `?u' is its mode-line mnemonic, and the FLAGS
;; argument pairs the decoder with the encoder.
(make-coding-system
 'mule-utf-8 4 ?u
 "UTF-8 encoding for Emacs-supported Unicode characters.
The supported Emacs character sets are:
   ascii
   eight-bit-control
   eight-bit-graphic
   latin-iso8859-1
   mule-unicode-0100-24ff
   mule-unicode-2500-33ff
   mule-unicode-e000-ffff

Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF
are decoded into sequences of eight-bit-control and eight-bit-graphic
characters to preserve their byte sequences.  Emacs characters out of
these ranges are encoded into U+FFFD.

Note that, currently, characters in the mule-unicode charsets have no
syntax and case information.  Thus, for instance, upper- and
lower-casing commands won't work with them."

 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
 '((safe-charsets
    ascii
    eight-bit-control
    eight-bit-graphic
    latin-iso8859-1
    mule-unicode-0100-24ff
    mule-unicode-2500-33ff
    mule-unicode-e000-ffff)
   (mime-charset . utf-8)
   (coding-category . coding-category-utf-8)
   (valid-codes (0 . 255))))
|
|
|
|
;; Let the name `utf-8' refer to the `mule-utf-8' coding system.
(define-coding-system-alias 'utf-8 'mule-utf-8)
|