From 90aab73f8d6b5fd0a8adb706c8ae669564f23c56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20Engdeg=C3=A5rd?= Date: Thu, 19 Nov 2020 14:24:24 +0100 Subject: [PATCH] More string-search optimisations All-ASCII strings cannot have substrings with non-ASCII characters in them; use this fact to avoid searching entirely. * src/fns.c (Fstring_search): For multibyte non-ASCII needle and unibyte haystack, don't check if the haystack is all-ASCII; it's a waste of time. For multibyte non-ASCII needle and multibyte all-ASCII haystack, fail immediately. * test/src/fns-tests.el (string-search): Add more test cases. --- src/fns.c | 23 +++++++++++++++-------- test/src/fns-tests.el | 7 +++++++ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/fns.c b/src/fns.c index f50bf8ecb77..e4c9acc3163 100644 --- a/src/fns.c +++ b/src/fns.c @@ -5502,25 +5502,32 @@ Case is always significant and text properties are ignored. */) haybytes = SBYTES (haystack) - start_byte; /* We can do a direct byte-string search if both strings have the - same multibyteness, or if at least one of them consists of ASCII - characters only. */ + same multibyteness, or if the needle consists of ASCII characters only. */ if (STRING_MULTIBYTE (haystack) ? (STRING_MULTIBYTE (needle) || SCHARS (haystack) == SBYTES (haystack) || string_ascii_p (needle)) : (!STRING_MULTIBYTE (needle) - || SCHARS (needle) == SBYTES (needle) || string_ascii_p (haystack))) - res = memmem (haystart, haybytes, - SSDATA (needle), SBYTES (needle)); - else if (STRING_MULTIBYTE (haystack)) /* unibyte needle */ + || SCHARS (needle) == SBYTES (needle))) + { + if (STRING_MULTIBYTE (haystack) && STRING_MULTIBYTE (needle) + && SCHARS (haystack) == SBYTES (haystack) + && SCHARS (needle) != SBYTES (needle)) + /* Multibyte non-ASCII needle, multibyte ASCII haystack: impossible. */ + return Qnil; + else + res = memmem (haystart, haybytes, + SSDATA (needle), SBYTES (needle)); + } + else if (STRING_MULTIBYTE (haystack)) /* unibyte non-ASCII needle */ { Lisp_Object multi_needle = string_to_multibyte (needle); res = memmem (haystart, haybytes, SSDATA (multi_needle), SBYTES (multi_needle)); } - else /* unibyte haystack, multibyte needle */ + else /* unibyte haystack, multibyte non-ASCII needle */ { /* The only possible way we can find the multibyte needle in the - unibyte stack (since we know that neither are pure-ASCII) is + unibyte stack (since we know that the needle is non-ASCII) is if they contain "raw bytes" (and no other non-ASCII chars.) */ ptrdiff_t nbytes = SBYTES (needle); for (ptrdiff_t i = 0; i < nbytes; i++) diff --git a/test/src/fns-tests.el b/test/src/fns-tests.el index d3c22f966e6..86b8d655d26 100644 --- a/test/src/fns-tests.el +++ b/test/src/fns-tests.el @@ -938,6 +938,13 @@ (should (equal (string-search "\303" "aøb") nil)) (should (equal (string-search "\270" "aøb") nil)) (should (equal (string-search "ø" "\303\270") nil)) + (should (equal (string-search "ø" (make-string 32 ?a)) nil)) + (should (equal (string-search "ø" (string-to-multibyte (make-string 32 ?a))) + nil)) + (should (equal (string-search "o" (string-to-multibyte + (apply #'string + (number-sequence ?a ?z)))) + 14)) (should (equal (string-search "a\U00010f98z" "a\U00010f98a\U00010f98z") 2))