1
0
mirror of https://git.savannah.gnu.org/git/emacs.git synced 2024-12-27 10:54:40 +00:00

Improve [:alpha:] and [:alnum:] for multibyte characters (Bug#19878)

src/character.c (alphabeticp, decimalnump): New functions.
 src/character.h (alphabeticp, decimalnump): Add prototypes.
 src/regex.c (ISALNUM, ISALPHA): Check Unicode character properties
 for multibyte characters by calling alphabeticp and decimalnump.
 (BIT_ALPHA, BIT_ALNUM): New bit masks.
 (re_wctype_to_bit): Return them when the class is RECC_ALPHA or
 RECC_ALNUM.
 (re_match_2_internal): Call ISALPHA and ISALNUM when appropriate.

 doc/lispref/searching.texi (Char Classes): Update the documentation of
 [:alpha:] and [:alnum:].

 etc/NEWS: Mention the changes in [:alpha:] and [:alnum:].
This commit is contained in:
Eli Zaretskii 2015-02-28 14:25:35 +02:00
parent 31ecbf8d51
commit 1a50945fa4
7 changed files with 89 additions and 10 deletions

View File

@ -1,3 +1,8 @@
2015-02-28 Eli Zaretskii <eliz@gnu.org>
* searching.texi (Char Classes): Update the documentation of
[:alpha:] and [:alnum:]. (Bug#19878)
2015-02-27 Eli Zaretskii <eliz@gnu.org>
* os.texi (Startup Summary):

View File

@ -541,11 +541,15 @@ and what they mean:
@item [:ascii:]
This matches any @acronym{ASCII} character (codes 0--127).
@item [:alnum:]
This matches any letter or digit. (At present, for multibyte
characters, it matches anything that has word syntax.)
This matches any letter or digit. For multibyte characters, it
matches characters whose Unicode @samp{general-category} property
(@pxref{Character Properties}) indicates they are alphabetic or
decimal number characters.
@item [:alpha:]
This matches any letter. (At present, for multibyte characters, it
matches anything that has word syntax.)
This matches any letter. For multibyte characters, it matches
characters whose Unicode @samp{general-category} property
(@pxref{Character Properties}) indicates they are alphabetic
characters.
@item [:blank:]
This matches space and tab only.
@item [:cntrl:]

View File

@ -612,6 +612,12 @@ when signaling a file error. For example, it now reports "Permission
denied" instead of "permission denied". The old behavior was problematic
in languages like German where downcasing rules depend on grammar.
+++
** The character classes [:alpha:] and [:alnum:] in regular expressions
now match multibyte characters using Unicode character properties.
If you want the old behavior where they matched any character with
word syntax, use `\sw' instead.
* Lisp Changes in Emacs 25.1

View File

@ -1,3 +1,16 @@
2015-02-28 Eli Zaretskii <eliz@gnu.org>
* character.c (alphabeticp, decimalnump): New functions.
* character.h (alphabeticp, decimalnump): Add prototypes.
* regex.c (ISALNUM, ISALPHA): Check Unicode character properties
for multibyte characters by calling alphabeticp and decimalnump.
(BIT_ALPHA, BIT_ALNUM): New bit masks.
(re_wctype_to_bit): Return them when the class is RECC_ALPHA or
RECC_ALNUM.
(re_match_2_internal): Call ISALPHA and ISALNUM when appropriate.
(Bug#19878)
2015-02-27 Jan Djärv <jan.h.d@swipnet.se>
* xterm.h (x_real_pos_and_offsets): Take outer_border as arg also.

View File

@ -984,6 +984,48 @@ character is not ASCII nor 8-bit character, an error is signaled. */)
#ifdef emacs
/* Return true if C is an alphabetic character as defined by its
   Unicode general-category property, false otherwise.

   The category is looked up in Vunicode_category_table.  If the
   entry found there for C is not an integer, return false.  */
bool
alphabeticp (int c)
{
  Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);

  /* Return explicitly here: otherwise control would reach the end of
     a non-void function without a return value, and a caller using
     that value invokes undefined behavior.  */
  if (! INTEGERP (category))
    return false;

  unicode_category_t gen_cat = XINT (category);

  /* See UTS #18.  There are additional characters that should be
     here, those designated as Other_uppercase, Other_lowercase,
     and Other_alphabetic; FIXME.  */
  return (gen_cat == UNICODE_CATEGORY_Lu
	  || gen_cat == UNICODE_CATEGORY_Ll
	  || gen_cat == UNICODE_CATEGORY_Lt
	  || gen_cat == UNICODE_CATEGORY_Lm
	  || gen_cat == UNICODE_CATEGORY_Lo
	  || gen_cat == UNICODE_CATEGORY_Mn
	  || gen_cat == UNICODE_CATEGORY_Mc
	  || gen_cat == UNICODE_CATEGORY_Me
	  || gen_cat == UNICODE_CATEGORY_Nl);
}
/* Return true if C is a decimal-number character as defined by its
   Unicode general-category property, false otherwise.

   The category is looked up in Vunicode_category_table.  If the
   entry found there for C is not an integer, return false.  */
bool
decimalnump (int c)
{
  Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);

  /* Return explicitly here: otherwise control would reach the end of
     a non-void function without a return value, and a caller using
     that value invokes undefined behavior.  */
  if (! INTEGERP (category))
    return false;

  unicode_category_t gen_cat = XINT (category);

  /* See UTS #18.  */
  return gen_cat == UNICODE_CATEGORY_Nd;
}
void
syms_of_character (void)
{

View File

@ -660,6 +660,9 @@ extern ptrdiff_t lisp_string_width (Lisp_Object, ptrdiff_t,
extern Lisp_Object Vchar_unify_table;
extern Lisp_Object string_escape_byte8 (Lisp_Object);
extern bool alphabeticp (int);
extern bool decimalnump (int);
/* Return a translation table of id number ID. */
#define GET_TRANSLATION_TABLE(id) \
(XCDR (XVECTOR (Vtranslation_table_vector)->contents[(id)]))

View File

@ -324,12 +324,12 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
? (((c) >= 'a' && (c) <= 'z') \
|| ((c) >= 'A' && (c) <= 'Z') \
|| ((c) >= '0' && (c) <= '9')) \
: SYNTAX (c) == Sword)
: (alphabeticp (c) || decimalnump (c)))
# define ISALPHA(c) (IS_REAL_ASCII (c) \
? (((c) >= 'a' && (c) <= 'z') \
|| ((c) >= 'A' && (c) <= 'Z')) \
: SYNTAX (c) == Sword)
: alphabeticp (c))
# define ISLOWER(c) lowercasep (c)
@ -1872,6 +1872,8 @@ struct range_table_work_area
#define BIT_SPACE 0x8
#define BIT_UPPER 0x10
#define BIT_MULTIBYTE 0x20
#define BIT_ALPHA 0x40
#define BIT_ALNUM 0x80
/* Set the bit for character C in a list. */
@ -2072,7 +2074,9 @@ re_wctype_to_bit (re_wctype_t cc)
{
case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
case RECC_MULTIBYTE: return BIT_MULTIBYTE;
case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
case RECC_ALPHA: return BIT_ALPHA;
case RECC_ALNUM: return BIT_ALNUM;
case RECC_WORD: return BIT_WORD;
case RECC_LOWER: return BIT_LOWER;
case RECC_UPPER: return BIT_UPPER;
case RECC_PUNCT: return BIT_PUNCT;
@ -2930,7 +2934,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
#endif /* emacs */
/* In most cases the matching rule for char classes
only uses the syntax table for multibyte chars,
so that the content of the syntax-table it is not
so that the content of the syntax-table is not
hardcoded in the range_table. SPACE and WORD are
the two exceptions. */
if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
@ -2945,7 +2949,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
p = class_beg;
SET_LIST_BIT ('[');
/* Because the `:' may starts the range, we
/* Because the `:' may start the range, we
can't simply set bit and repeat the loop.
Instead, just set it to C and handle below. */
c = ':';
@ -5513,7 +5517,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
| (class_bits & BIT_PUNCT && ISPUNCT (c))
| (class_bits & BIT_SPACE && ISSPACE (c))
| (class_bits & BIT_UPPER && ISUPPER (c))
| (class_bits & BIT_WORD && ISWORD (c)))
| (class_bits & BIT_WORD && ISWORD (c))
| (class_bits & BIT_ALPHA && ISALPHA (c))
| (class_bits & BIT_ALNUM && ISALNUM (c)))
not = !not;
else
CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);