From 6a6d7925c9ddbf558f70932661ee943262aea4ca Mon Sep 17 00:00:00 2001 From: Yuan Fu Date: Fri, 13 Sep 2024 21:42:17 -0700 Subject: [PATCH] Fix range handling so it works for multibyte buffer (bug#73204) Here by multibyte buffer I mean buffer that includes non-ASCII characters. The problem is illustrated by this comment, which I copied from the source: ====================================================================== (ref:bytepos-range-pitfall) Suppose we have the following buffer content ([ ] is a unibyte char, [ ] is a multibyte char): [a][b][c][d][e][ f ] and the following ranges (denoted by braces): [a][b][c][d][e][ f ] { }{ } So far so good, now user deletes a unibyte char at the beginning: [b][c][d][e][ f ] { }{ } Oops, now our range cuts into the multibyte char, bad! ====================================================================== * src/treesit.c (treesit_debug_print_parser_list): Minor fix. (treesit_sync_visible_region): Change the way we fixup ranges, instead of using the bytepos ranges from tree-sitter, we use the cached lisp charpos ranges. (treesit_make_ts_ranges): New function. (Ftreesit_parser_set_included_ranges): Refactor out the new function treesit_make_ts_ranges. (Ftreesit_parser_included_ranges): Rather than getting the ranges from tree-sitter, just return the cached lisp ranges. * src/treesit.h (Lisp_TS_Parser): Add some comment. * test/src/treesit-tests.el (treesit-range-fixup-after-edit): New test. --- src/treesit.c | 213 +++++++++++++++++++++++--------------- src/treesit.h | 23 ++-- test/src/treesit-tests.el | 27 +++++ 3 files changed, 173 insertions(+), 90 deletions(-) diff --git a/src/treesit.c b/src/treesit.c index 2b43c505dfa..9958d8a4c2a 100644 --- a/src/treesit.c +++ b/src/treesit.c @@ -499,7 +499,7 @@ treesit_debug_print_parser_list (char *msg, Lisp_Object parser) SSDATA (SYMBOL_NAME (Vthis_command)), SSDATA (SYMBOL_NAME (XTS_PARSER (parser)->language_symbol)), buf_name, BUF_BEG (buf), - BUF_BEGV (buf), BUF_Z (buf), BUF_ZV (buf)); + BUF_BEGV (buf), BUF_ZV (buf), BUF_Z (buf)); Lisp_Object tail = BVAR (buf, ts_parser_list); FOR_EACH_TAIL (tail) @@ -922,6 +922,9 @@ treesit_record_change (ptrdiff_t start_byte, ptrdiff_t old_end_byte, } } +static TSRange *treesit_make_ts_ranges (Lisp_Object, Lisp_Object, + uint32_t *); + /* Comment (ref:visible-beg-null) The purpose of visible_beg/end is to keep track of "which part of the buffer does the tree-sitter tree see", in order to update the tree correctly. Visible_beg/end have @@ -1050,48 +1053,85 @@ treesit_sync_visible_region (Lisp_Object parser) XTS_PARSER (parser)->visible_end = visible_end; /* Fix ranges so that the ranges stays with in visible_end. Here we - try to do minimal work so that the ranges is minimally correct such - that there's no OOB error. Usually treesit-update-ranges should - update the parser with actually correct ranges. */ - if (NILP (XTS_PARSER (parser)->last_set_ranges)) return; - uint32_t len; - const TSRange *ranges - = ts_parser_included_ranges (XTS_PARSER (parser)->parser, &len); - /* We might need to discard some ranges that exceeds visible_end, in - that case, new_len is the length of the new ranges array (which - will be shorter than len). */ - uint32_t new_len = 0; - uint32_t new_end = 0; - for (int idx = 0; idx < len; idx++) - { - TSRange range = ranges[idx]; - /* If this range starts after visible_end, we don't include this - range and the ranges after it in the new ranges. */ - if (range.start_byte + visible_beg >= visible_end) + try to do minimal work so that the ranges is minimally correct and + there's no OOB error. Usually treesit-update-ranges should update + the parser with semantically correct ranges. + + We start with the charpos ranges, because for bytepos ranges, after + user edits, the ranges start/end might end up inside a multibyte + char! See (ref:bytepos-range-pitfall) below. */ + Lisp_Object lisp_ranges = XTS_PARSER (parser)->last_set_ranges; + if (NILP (lisp_ranges)) return; + + Lisp_Object new_ranges_head = lisp_ranges; + + FOR_EACH_TAIL_SAFE (lisp_ranges) + { + Lisp_Object range = XCAR (lisp_ranges); + ptrdiff_t beg = XFIXNUM (XCAR (range)); + ptrdiff_t end = XFIXNUM (XCDR (range)); + + if (end <= visible_beg) + /* Even the end is before visible_beg, discard this range. */ + new_ranges_head = XCDR (new_ranges_head); + else if (beg >= visible_end) + { + /* Even the beg is after visible_end, dicard this range and all + the ranges after it. */ + XSETCDR (range, Qnil); break; - /* If this range's end is after visible_end, we don't include any - ranges after it, and changes the end of this range to - visible_end. */ - if (range.end_byte + visible_beg > visible_end) - { - new_end = visible_end - visible_beg; - new_len++; - break; - } - new_len++; - } - if (new_len != len || new_end != 0) + } + else + { + /* At this point, the range overlaps with the visible portion of + the buffer in some way (in front / in back / completely + encased / completely encases). */ + if (beg < visible_beg) + XSETCAR (range, make_fixnum (visible_beg)); + if (end > visible_end) + XSETCDR (range, make_fixnum (visible_end)); + } + } + + XTS_PARSER (parser)->last_set_ranges = new_ranges_head; + + if (NILP (new_ranges_head)) { - TSRange *new_ranges = xmalloc (sizeof (TSRange) * new_len); - memcpy (new_ranges, ranges, sizeof (TSRange) * new_len); - new_ranges[new_len - 1].end_byte = new_end; - /* TODO: What should we do if this fails? */ - ts_parser_set_included_ranges (XTS_PARSER (parser)->parser, - new_ranges, new_len); - xfree (new_ranges); + bool success; + success = ts_parser_set_included_ranges (XTS_PARSER (parser)->parser, + NULL, 0); + eassert (success); + } + else + { + uint32_t len = 0; + TSRange *ts_ranges = treesit_make_ts_ranges (new_ranges_head, parser, + &len); + bool success; + success = ts_parser_set_included_ranges (XTS_PARSER (parser)->parser, + ts_ranges, len); + xfree (ts_ranges); + eassert (success); } } +/* (ref:bytepos-range-pitfall) Suppose we have the following buffer + content ([ ] is a unibyte char, [ ] is a multibyte char): + + [a][b][c][d][e][ f ] + + and the following ranges (denoted by braces): + + [a][b][c][d][e][ f ] + { }{ } + + So far so good, now user deletes a unibyte char at the beginning: + + [b][c][d][e][ f ] + { }{ } + + Oops, now our range cuts into the multibyte char, bad! */ + static void treesit_check_buffer_size (struct buffer *buffer) { @@ -1228,7 +1268,12 @@ treesit_read_buffer (void *parser, uint32_t byte_index, beg = NULL; len = 0; } - /* Normal case, read a character. */ + /* Normal case, read a character. We can't give tree-sitter the + whole buffer range because we move the gap around, realloc the + buffer, etc; and there's no way to invalidate the previously + given range in tree-sitter. Move over, benchmark shows there's + very little difference between passing a whole chunk vs passing a + single char at once. The only cost is funcall I guess. */ else { beg = (char *) BUF_BYTE_ADDRESS (buffer, byte_pos); @@ -1739,6 +1784,48 @@ treesit_make_ranges (const TSRange *ranges, uint32_t len, return Fnreverse (list); } +/* Convert lisp ranges to tree-sitter ranges. Set LEN to the length of + the ranges. RANGES must be a valid ranges list, (cons of numbers, no + overlap, etc). PARSER must be a parser. This function doesn't check + for types. Caller must free the returned ranges. */ +static TSRange * +treesit_make_ts_ranges (Lisp_Object ranges, Lisp_Object parser, uint32_t *len) +{ + ptrdiff_t ranges_len = list_length (ranges); + if (ranges_len > UINT32_MAX) + xsignal (Qargs_out_of_range, list2 (ranges, Flength (ranges))); + + *len = (uint32_t) ranges_len; + TSRange *treesit_ranges = xmalloc (sizeof (TSRange) * ranges_len); + + struct buffer *buffer = XBUFFER (XTS_PARSER (parser)->buffer); + + for (int idx = 0; idx < ranges_len; idx++, ranges = XCDR (ranges)) + { + Lisp_Object range = XCAR (ranges); + ptrdiff_t beg_byte = buf_charpos_to_bytepos (buffer, + XFIXNUM (XCAR (range))); + ptrdiff_t end_byte = buf_charpos_to_bytepos (buffer, + XFIXNUM (XCDR (range))); + + /* Shouldn't violate assertion since we just checked for + buffer size at the beginning of this function. */ + eassert (beg_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX); + eassert (end_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX); + + /* We don't care about points, put in dummy values. */ + TSRange rg = + { + {0, 0}, {0, 0}, + (uint32_t) beg_byte - XTS_PARSER (parser)->visible_beg, + (uint32_t) end_byte - XTS_PARSER (parser)->visible_beg + }; + treesit_ranges[idx] = rg; + } + + return treesit_ranges; +} + DEFUN ("treesit-parser-set-included-ranges", Ftreesit_parser_set_included_ranges, Streesit_parser_set_included_ranges, @@ -1778,33 +1865,8 @@ buffer. */) } else { - /* Set ranges for PARSER. */ - if (list_length (ranges) > UINT32_MAX) - xsignal (Qargs_out_of_range, list2 (ranges, Flength (ranges))); - uint32_t len = (uint32_t) list_length (ranges); - TSRange *treesit_ranges = xmalloc (sizeof (TSRange) * len); - struct buffer *buffer = XBUFFER (XTS_PARSER (parser)->buffer); - - /* We can use XFIXNUM, XCAR, XCDR freely because we have checked - the input by treesit_check_range_argument. */ - for (int idx = 0; !NILP (ranges); idx++, ranges = XCDR (ranges)) - { - Lisp_Object range = XCAR (ranges); - ptrdiff_t beg_byte = buf_charpos_to_bytepos (buffer, - XFIXNUM (XCAR (range))); - ptrdiff_t end_byte = buf_charpos_to_bytepos (buffer, - XFIXNUM (XCDR (range))); - /* Shouldn't violate assertion since we just checked for - buffer size at the beginning of this function. */ - eassert (beg_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX); - eassert (end_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX); - /* We don't care about start and end points, put in dummy - values. */ - TSRange rg = {{0, 0}, {0, 0}, - (uint32_t) beg_byte - BUF_BEGV_BYTE (buffer), - (uint32_t) end_byte - BUF_BEGV_BYTE (buffer)}; - treesit_ranges[idx] = rg; - } + uint32_t len = 0; + TSRange *treesit_ranges = treesit_make_ts_ranges (ranges, parser, &len); success = ts_parser_set_included_ranges (XTS_PARSER (parser)->parser, treesit_ranges, len); xfree (treesit_ranges); @@ -1831,26 +1893,9 @@ See also `treesit-parser-set-included-ranges'. */) treesit_check_parser (parser); treesit_initialize (); - /* Our return value depends on the buffer state (BUF_BEGV_BYTE, - etc), so we need to sync up. */ - treesit_check_buffer_size (XBUFFER (XTS_PARSER (parser)->buffer)); treesit_sync_visible_region (parser); - /* When the parser doesn't have a range set and we call - ts_parser_included_ranges on it, it doesn't return an empty list, - but rather return DEFAULT_RANGE. (A single range where start_byte - = 0, end_byte = UINT32_MAX). So we need to track whether the - parser is ranged ourselves. */ - if (NILP (XTS_PARSER (parser)->last_set_ranges)) - return Qnil; - uint32_t len; - const TSRange *ranges - = ts_parser_included_ranges (XTS_PARSER (parser)->parser, &len); - - struct buffer *buffer = XBUFFER (XTS_PARSER (parser)->buffer); - - - return treesit_make_ranges (ranges, len, parser, buffer); + return XTS_PARSER (parser)->last_set_ranges; } DEFUN ("treesit-parser-notifiers", Ftreesit_parser_notifiers, diff --git a/src/treesit.h b/src/treesit.h index f8227a64b0a..3fc42a4ba24 100644 --- a/src/treesit.h +++ b/src/treesit.h @@ -45,12 +45,23 @@ struct Lisp_TS_Parser same tag. A tag is primarily used to differentiate between parsers for the same language. */ Lisp_Object tag; - /* The Lisp ranges last set. This is use to compare to the new ranges - the users wants to set, and avoid reparse if the new ranges is the - same as the last set one. This might go out of sync with the - ranges we return from Ftreesit_parser_included_ranges, if we did a - ranges fix in treesit_sync_visible_region, but I don't think - that'll cause any harm. */ + /* The Lisp ranges last set. One purpose for it is to compare to the + new ranges the users wants to set, and avoid reparse if the new + ranges is the same as the current one. Another purpose is to store + the ranges in charpos (ts api returns ranges in bytepos). We need + to use charpos so we don't end up having a range cut into a + multibyte character. (See (ref:bytepos-range-pitfall) in treesit.c + for more detail.) + + treesit-parser-set-included-ranges sets this field; + treesit-parser-included-ranges directly returns this field, and + before each reparse, treesit_sync_visible_region uses this to + calculate a range for the parser that fits in the visible region. + + Trivia: when the parser doesn't have a range set and we call + ts_parser_included_ranges on it, it doesn't return an empty list, + but rather return DEFAULT_RANGE. (A single range where start_byte + = 0, end_byte = UINT32_MAX). */ Lisp_Object last_set_ranges; /* The buffer associated with this parser. */ Lisp_Object buffer; diff --git a/test/src/treesit-tests.el b/test/src/treesit-tests.el index 3431ba5f4dd..98aaeb62781 100644 --- a/test/src/treesit-tests.el +++ b/test/src/treesit-tests.el @@ -684,6 +684,33 @@ visible_end.)" (should (equal '((16 . 28)) (treesit-query-range 'javascript query nil nil '(1 . -1))))))) +(ert-deftest treesit-range-fixup-after-edit () + "Tests if Emacs can fix OOB ranges after deleting text or narrowing." + (skip-unless (treesit-language-available-p 'json)) + (with-temp-buffer + (let ((parser (treesit-parser-create 'json))) + (insert "11111111111111111111") + (treesit-parser-set-included-ranges parser '((1 . 20))) + (treesit-parser-root-node parser) + (should (equal (treesit-parser-included-ranges parser) + '((1 . 20)))) + + (narrow-to-region 5 15) + (should (equal (treesit-parser-included-ranges parser) + '((5 . 15)))) + + (widen) + ;; Trickier ranges + ;; 11111111111111111111 + ;; [ ] [ ] + ;; { narrow } + (treesit-parser-set-included-ranges parser '((1 . 7) (10 . 15))) + (should (equal (treesit-parser-included-ranges parser) + '((1 . 7) (10 . 15)))) + (narrow-to-region 5 13) + (should (equal (treesit-parser-included-ranges parser) + '((5 . 7) (10 . 13))))))) + ;;; Multiple language (ert-deftest treesit-multi-lang ()