mirror of
https://git.savannah.gnu.org/git/emacs.git
synced 2024-12-27 10:54:40 +00:00
Improve [:alpha:] and [:alnum:] for multibyte characters (Bug#19878)
src/character.c (alphabeticp, decimalnump): New functions. src/character.h (alphabeticp, decimalnump): Add prototypes. src/regex.c (ISALNUM, ISALPHA): Check Unicode character properties for multibyte characters by calling alphabeticp and decimalnump. (BIT_ALPHA, BIT_ALNUM): New bit masks. (re_wctype_to_bit): Return them when the class is RECC_ALPHA or RECC_ALNUM. (re_match_2_internal): Call ISALPHA and ISALNUM when appropriate. doc/lispref/searching.texi (Char Classes): Update the documentation of [:alpha:] and [:alnum:]. etc/NEWS: Mention the changes in [:alpha:] and [:alnum:].
This commit is contained in:
parent
31ecbf8d51
commit
1a50945fa4
@ -1,3 +1,8 @@
|
||||
2015-02-28 Eli Zaretskii <eliz@gnu.org>
|
||||
|
||||
* searching.texi (Char Classes): Update the documentation of
|
||||
[:alpha:] and [:alnum:]. (Bug#19878)
|
||||
|
||||
2015-02-27 Eli Zaretskii <eliz@gnu.org>
|
||||
|
||||
* os.texi (Startup Summary):
|
||||
|
@ -541,11 +541,15 @@ and what they mean:
|
||||
@item [:ascii:]
|
||||
This matches any @acronym{ASCII} character (codes 0--127).
|
||||
@item [:alnum:]
|
||||
This matches any letter or digit. (At present, for multibyte
|
||||
characters, it matches anything that has word syntax.)
|
||||
This matches any letter or digit. For multibyte characters, it
|
||||
matches characters whose Unicode @samp{general-category} property
|
||||
(@pxref{Character Properties}) indicates they are alphabetic or
|
||||
decimal number characters.
|
||||
@item [:alpha:]
|
||||
This matches any letter. (At present, for multibyte characters, it
|
||||
matches anything that has word syntax.)
|
||||
This matches any letter. For multibyte characters, it matches
|
||||
characters whose Unicode @samp{general-category} property
|
||||
(@pxref{Character Properties}) indicates they are alphabetic
|
||||
characters.
|
||||
@item [:blank:]
|
||||
This matches space and tab only.
|
||||
@item [:cntrl:]
|
||||
|
6
etc/NEWS
6
etc/NEWS
@ -612,6 +612,12 @@ when signaling a file error. For example, it now reports "Permission
|
||||
denied" instead of "permission denied". The old behavior was problematic
|
||||
in languages like German where downcasing rules depend on grammar.
|
||||
|
||||
+++
|
||||
** The character classes [:alpha:] and [:alnum:] in regular expressions
|
||||
now match multibyte characters using Unicode character properties.
|
||||
If you want the old behavior where they matched any character with
|
||||
word syntax, use `\sw' instead.
|
||||
|
||||
|
||||
* Lisp Changes in Emacs 25.1
|
||||
|
||||
|
@ -1,3 +1,16 @@
|
||||
2015-02-28 Eli Zaretskii <eliz@gnu.org>
|
||||
|
||||
* character.c (alphabeticp, decimalnump): New functions.
|
||||
* character.h (alphabeticp, decimalnump): Add prototypes.
|
||||
|
||||
* regex.c (ISALNUM, ISALPHA): Check Unicode character properties
|
||||
for multibyte characters by calling alphabeticp and decimalnump.
|
||||
(BIT_ALPHA, BIT_ALNUM): New bit masks.
|
||||
(re_wctype_to_bit): Return them when the class is RECC_ALPHA or
|
||||
RECC_ALNUM.
|
||||
(re_match_2_internal): Call ISALPHA and ISALNUM when appropriate.
|
||||
(Bug#19878)
|
||||
|
||||
2015-02-27 Jan Djärv <jan.h.d@swipnet.se>
|
||||
|
||||
* xterm.h (x_real_pos_and_offsets): Take outer_border as arg also.
|
||||
|
@ -984,6 +984,48 @@ character is not ASCII nor 8-bit character, an error is signaled. */)
|
||||
|
||||
#ifdef emacs
|
||||
|
||||
/* Return 'true' if C is an alphabetic character as defined by its
|
||||
Unicode properties. */
|
||||
bool
|
||||
alphabeticp (int c)
|
||||
{
|
||||
Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
|
||||
|
||||
if (INTEGERP (category))
|
||||
{
|
||||
unicode_category_t gen_cat = XINT (category);
|
||||
|
||||
/* See UTS #18. There are additional characters that should be
|
||||
here, those designated as Other_uppercase, Other_lowercase,
|
||||
and Other_alphabetic; FIXME. */
|
||||
return (gen_cat == UNICODE_CATEGORY_Lu
|
||||
|| gen_cat == UNICODE_CATEGORY_Ll
|
||||
|| gen_cat == UNICODE_CATEGORY_Lt
|
||||
|| gen_cat == UNICODE_CATEGORY_Lm
|
||||
|| gen_cat == UNICODE_CATEGORY_Lo
|
||||
|| gen_cat == UNICODE_CATEGORY_Mn
|
||||
|| gen_cat == UNICODE_CATEGORY_Mc
|
||||
|| gen_cat == UNICODE_CATEGORY_Me
|
||||
|| gen_cat == UNICODE_CATEGORY_Nl) ? true : false;
|
||||
}
|
||||
}
|
||||
|
||||
/* Return 'true' if C is an decimal-number character as defined by its
|
||||
Unicode properties. */
|
||||
bool
|
||||
decimalnump (int c)
|
||||
{
|
||||
Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
|
||||
|
||||
if (INTEGERP (category))
|
||||
{
|
||||
unicode_category_t gen_cat = XINT (category);
|
||||
|
||||
/* See UTS #18. */
|
||||
return (gen_cat == UNICODE_CATEGORY_Nd) ? true : false;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
syms_of_character (void)
|
||||
{
|
||||
|
@ -660,6 +660,9 @@ extern ptrdiff_t lisp_string_width (Lisp_Object, ptrdiff_t,
|
||||
extern Lisp_Object Vchar_unify_table;
|
||||
extern Lisp_Object string_escape_byte8 (Lisp_Object);
|
||||
|
||||
extern bool alphabeticp (int);
|
||||
extern bool decimalnump (int);
|
||||
|
||||
/* Return a translation table of id number ID. */
|
||||
#define GET_TRANSLATION_TABLE(id) \
|
||||
(XCDR (XVECTOR (Vtranslation_table_vector)->contents[(id)]))
|
||||
|
18
src/regex.c
18
src/regex.c
@ -324,12 +324,12 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
|
||||
? (((c) >= 'a' && (c) <= 'z') \
|
||||
|| ((c) >= 'A' && (c) <= 'Z') \
|
||||
|| ((c) >= '0' && (c) <= '9')) \
|
||||
: SYNTAX (c) == Sword)
|
||||
: (alphabeticp (c) || decimalnump (c)))
|
||||
|
||||
# define ISALPHA(c) (IS_REAL_ASCII (c) \
|
||||
? (((c) >= 'a' && (c) <= 'z') \
|
||||
|| ((c) >= 'A' && (c) <= 'Z')) \
|
||||
: SYNTAX (c) == Sword)
|
||||
: alphabeticp (c))
|
||||
|
||||
# define ISLOWER(c) lowercasep (c)
|
||||
|
||||
@ -1872,6 +1872,8 @@ struct range_table_work_area
|
||||
#define BIT_SPACE 0x8
|
||||
#define BIT_UPPER 0x10
|
||||
#define BIT_MULTIBYTE 0x20
|
||||
#define BIT_ALPHA 0x40
|
||||
#define BIT_ALNUM 0x80
|
||||
|
||||
|
||||
/* Set the bit for character C in a list. */
|
||||
@ -2072,7 +2074,9 @@ re_wctype_to_bit (re_wctype_t cc)
|
||||
{
|
||||
case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
|
||||
case RECC_MULTIBYTE: return BIT_MULTIBYTE;
|
||||
case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
|
||||
case RECC_ALPHA: return BIT_ALPHA;
|
||||
case RECC_ALNUM: return BIT_ALNUM;
|
||||
case RECC_WORD: return BIT_WORD;
|
||||
case RECC_LOWER: return BIT_LOWER;
|
||||
case RECC_UPPER: return BIT_UPPER;
|
||||
case RECC_PUNCT: return BIT_PUNCT;
|
||||
@ -2930,7 +2934,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
|
||||
#endif /* emacs */
|
||||
/* In most cases the matching rule for char classes
|
||||
only uses the syntax table for multibyte chars,
|
||||
so that the content of the syntax-table it is not
|
||||
so that the content of the syntax-table is not
|
||||
hardcoded in the range_table. SPACE and WORD are
|
||||
the two exceptions. */
|
||||
if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
|
||||
@ -2945,7 +2949,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
|
||||
p = class_beg;
|
||||
SET_LIST_BIT ('[');
|
||||
|
||||
/* Because the `:' may starts the range, we
|
||||
/* Because the `:' may start the range, we
|
||||
can't simply set bit and repeat the loop.
|
||||
Instead, just set it to C and handle below. */
|
||||
c = ':';
|
||||
@ -5513,7 +5517,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
|
||||
| (class_bits & BIT_PUNCT && ISPUNCT (c))
|
||||
| (class_bits & BIT_SPACE && ISSPACE (c))
|
||||
| (class_bits & BIT_UPPER && ISUPPER (c))
|
||||
| (class_bits & BIT_WORD && ISWORD (c)))
|
||||
| (class_bits & BIT_WORD && ISWORD (c))
|
||||
| (class_bits & BIT_ALPHA && ISALPHA (c))
|
||||
| (class_bits & BIT_ALNUM && ISALNUM (c)))
|
||||
not = !not;
|
||||
else
|
||||
CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
|
||||
|
Loading…
Reference in New Issue
Block a user