From 6a6d7925c9ddbf558f70932661ee943262aea4ca Mon Sep 17 00:00:00 2001
From: Yuan Fu <casouri@gmail.com>
Date: Fri, 13 Sep 2024 21:42:17 -0700
Subject: [PATCH] Fix range handling so it works for multibyte buffer
 (bug#73204)

Here by multibyte buffer I mean buffer that includes non-ASCII
characters.

The problem is illustrated by this comment, which I copied from the
source:

======================================================================
(ref:bytepos-range-pitfall) Suppose we have the following buffer
content ([ ] is a unibyte char, [    ] is a multibyte char):

    [a][b][c][d][e][ f  ]

and the following ranges (denoted by braces):

    [a][b][c][d][e][ f  ]
    {       }{    }

So far so good, now user deletes a unibyte char at the beginning:

    [b][c][d][e][ f  ]
    {       }{    }

Oops, now our range cuts into the multibyte char, bad!
======================================================================

* src/treesit.c (treesit_debug_print_parser_list): Minor fix.
(treesit_sync_visible_region): Change the way we fixup ranges, instead
of using the bytepos ranges from tree-sitter, we use the cached lisp
charpos ranges.
(treesit_make_ts_ranges): New function.
(Ftreesit_parser_set_included_ranges): Refactor out the new function
treesit_make_ts_ranges.
(Ftreesit_parser_included_ranges): Rather than getting the ranges from
tree-sitter, just return the cached lisp ranges.

* src/treesit.h (Lisp_TS_Parser): Add some comment.
* test/src/treesit-tests.el (treesit-range-fixup-after-edit): New test.
---
 src/treesit.c             | 213 +++++++++++++++++++++++---------------
 src/treesit.h             |  23 ++--
 test/src/treesit-tests.el |  27 +++++
 3 files changed, 173 insertions(+), 90 deletions(-)

diff --git a/src/treesit.c b/src/treesit.c
index 2b43c505dfa..9958d8a4c2a 100644
--- a/src/treesit.c
+++ b/src/treesit.c
@@ -499,7 +499,7 @@ treesit_debug_print_parser_list (char *msg, Lisp_Object parser)
 	  SSDATA (SYMBOL_NAME (Vthis_command)),
 	  SSDATA (SYMBOL_NAME (XTS_PARSER (parser)->language_symbol)),
 	  buf_name, BUF_BEG (buf),
-	  BUF_BEGV (buf), BUF_Z (buf), BUF_ZV (buf));
+	  BUF_BEGV (buf), BUF_ZV (buf), BUF_Z (buf));
   Lisp_Object tail = BVAR (buf, ts_parser_list);
 
   FOR_EACH_TAIL (tail)
@@ -922,6 +922,9 @@ treesit_record_change (ptrdiff_t start_byte, ptrdiff_t old_end_byte,
     }
 }
 
+static TSRange *treesit_make_ts_ranges (Lisp_Object, Lisp_Object,
+					uint32_t *);
+
 /* Comment (ref:visible-beg-null)  The purpose of visible_beg/end is to
    keep track of "which part of the buffer does the tree-sitter tree
    see", in order to update the tree correctly.  Visible_beg/end have
@@ -1050,48 +1053,85 @@ treesit_sync_visible_region (Lisp_Object parser)
   XTS_PARSER (parser)->visible_end = visible_end;
 
   /* Fix ranges so that the ranges stays with in visible_end.  Here we
-     try to do minimal work so that the ranges is minimally correct such
-     that there's no OOB error.  Usually treesit-update-ranges should
-     update the parser with actually correct ranges.  */
-  if (NILP (XTS_PARSER (parser)->last_set_ranges)) return;
-  uint32_t len;
-  const TSRange *ranges
-    = ts_parser_included_ranges (XTS_PARSER (parser)->parser, &len);
-  /* We might need to discard some ranges that exceeds visible_end, in
-     that case, new_len is the length of the new ranges array (which
-     will be shorter than len).  */
-  uint32_t new_len = 0;
-  uint32_t new_end = 0;
-  for (int idx = 0; idx < len; idx++)
-    {
-      TSRange range = ranges[idx];
-      /* If this range starts after visible_end, we don't include this
-         range and the ranges after it in the new ranges.  */
-      if (range.start_byte + visible_beg >= visible_end)
+     try to do minimal work so that the ranges is minimally correct and
+     there's no OOB error.  Usually treesit-update-ranges should update
+     the parser with semantically correct ranges.
+
+     We start with the charpos ranges, because for bytepos ranges, after
+     user edits, the ranges start/end might end up inside a multibyte
+     char!  See (ref:bytepos-range-pitfall) below.  */
+  Lisp_Object lisp_ranges = XTS_PARSER (parser)->last_set_ranges;
+  if (NILP (lisp_ranges)) return;
+
+  Lisp_Object new_ranges_head = lisp_ranges;
+
+  FOR_EACH_TAIL_SAFE (lisp_ranges)
+  {
+    Lisp_Object range = XCAR (lisp_ranges);
+    ptrdiff_t beg = XFIXNUM (XCAR (range));
+    ptrdiff_t end = XFIXNUM (XCDR (range));
+
+    if (end <= visible_beg)
+      /* Even the end is before visible_beg, discard this range.  */
+      new_ranges_head = XCDR (new_ranges_head);
+    else if (beg >= visible_end)
+      {
+	/* Even the beg is after visible_end, dicard this range and all
+           the ranges after it.  */
+	XSETCDR (range, Qnil);
 	break;
-      /* If this range's end is after visible_end, we don't include any
-         ranges after it, and changes the end of this range to
-         visible_end.  */
-      if (range.end_byte + visible_beg > visible_end)
-	{
-	  new_end = visible_end - visible_beg;
-	  new_len++;
-	  break;
-	}
-      new_len++;
-    }
-  if (new_len != len || new_end != 0)
+      }
+    else
+      {
+	/* At this point, the range overlaps with the visible portion of
+	   the buffer in some way (in front / in back / completely
+	   encased / completely encases).  */
+	if (beg < visible_beg)
+	  XSETCAR (range, make_fixnum (visible_beg));
+	if (end > visible_end)
+	  XSETCDR (range, make_fixnum (visible_end));
+      }
+  }
+
+  XTS_PARSER (parser)->last_set_ranges = new_ranges_head;
+
+  if (NILP (new_ranges_head))
     {
-      TSRange *new_ranges = xmalloc (sizeof (TSRange) * new_len);
-      memcpy (new_ranges, ranges, sizeof (TSRange) * new_len);
-      new_ranges[new_len - 1].end_byte = new_end;
-      /* TODO: What should we do if this fails?  */
-      ts_parser_set_included_ranges (XTS_PARSER (parser)->parser,
-				     new_ranges, new_len);
-      xfree (new_ranges);
+      bool success;
+      success = ts_parser_set_included_ranges (XTS_PARSER (parser)->parser,
+					       NULL, 0);
+      eassert (success);
+    }
+  else
+    {
+      uint32_t len = 0;
+      TSRange *ts_ranges = treesit_make_ts_ranges (new_ranges_head, parser,
+						   &len);
+      bool success;
+      success = ts_parser_set_included_ranges (XTS_PARSER (parser)->parser,
+					       ts_ranges, len);
+      xfree (ts_ranges);
+      eassert (success);
     }
 }
 
+/* (ref:bytepos-range-pitfall) Suppose we have the following buffer
+   content ([ ] is a unibyte char, [    ] is a multibyte char):
+
+       [a][b][c][d][e][ f  ]
+
+   and the following ranges (denoted by braces):
+
+       [a][b][c][d][e][ f  ]
+       {       }{    }
+
+   So far so good, now user deletes a unibyte char at the beginning:
+
+       [b][c][d][e][ f  ]
+       {       }{    }
+
+   Oops, now our range cuts into the multibyte char, bad!  */
+
 static void
 treesit_check_buffer_size (struct buffer *buffer)
 {
@@ -1228,7 +1268,12 @@ treesit_read_buffer (void *parser, uint32_t byte_index,
       beg = NULL;
       len = 0;
     }
-  /* Normal case, read a character.  */
+    /* Normal case, read a character.  We can't give tree-sitter the
+       whole buffer range because we move the gap around, realloc the
+       buffer, etc; and there's no way to invalidate the previously
+       given range in tree-sitter.  Move over, benchmark shows there's
+       very little difference between passing a whole chunk vs passing a
+       single char at once.  The only cost is funcall I guess.  */
   else
     {
       beg = (char *) BUF_BYTE_ADDRESS (buffer, byte_pos);
@@ -1739,6 +1784,48 @@ treesit_make_ranges (const TSRange *ranges, uint32_t len,
   return Fnreverse (list);
 }
 
+/* Convert lisp ranges to tree-sitter ranges.  Set LEN to the length of
+   the ranges.  RANGES must be a valid ranges list, (cons of numbers, no
+   overlap, etc).  PARSER must be a parser.  This function doesn't check
+   for types.  Caller must free the returned ranges.  */
+static TSRange *
+treesit_make_ts_ranges (Lisp_Object ranges, Lisp_Object parser, uint32_t *len)
+{
+  ptrdiff_t ranges_len = list_length (ranges);
+  if (ranges_len > UINT32_MAX)
+    xsignal (Qargs_out_of_range, list2 (ranges, Flength (ranges)));
+
+  *len = (uint32_t) ranges_len;
+  TSRange *treesit_ranges = xmalloc (sizeof (TSRange) * ranges_len);
+
+  struct buffer *buffer = XBUFFER (XTS_PARSER (parser)->buffer);
+
+  for (int idx = 0; idx < ranges_len; idx++, ranges = XCDR (ranges))
+  {
+    Lisp_Object range = XCAR (ranges);
+    ptrdiff_t beg_byte = buf_charpos_to_bytepos (buffer,
+						 XFIXNUM (XCAR (range)));
+    ptrdiff_t end_byte = buf_charpos_to_bytepos (buffer,
+						 XFIXNUM (XCDR (range)));
+
+    /* Shouldn't violate assertion since we just checked for
+       buffer size at the beginning of this function.  */
+    eassert (beg_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX);
+    eassert (end_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX);
+
+    /* We don't care about points, put in dummy values.  */
+    TSRange rg =
+      {
+	{0, 0}, {0, 0},
+	(uint32_t) beg_byte - XTS_PARSER (parser)->visible_beg,
+	(uint32_t) end_byte - XTS_PARSER (parser)->visible_beg
+      };
+    treesit_ranges[idx] = rg;
+  }
+
+  return treesit_ranges;
+}
+
 DEFUN ("treesit-parser-set-included-ranges",
        Ftreesit_parser_set_included_ranges,
        Streesit_parser_set_included_ranges,
@@ -1778,33 +1865,8 @@ buffer.  */)
     }
   else
     {
-      /* Set ranges for PARSER.  */
-      if (list_length (ranges) > UINT32_MAX)
-	xsignal (Qargs_out_of_range, list2 (ranges, Flength (ranges)));
-      uint32_t len = (uint32_t) list_length (ranges);
-      TSRange *treesit_ranges = xmalloc (sizeof (TSRange) * len);
-      struct buffer *buffer = XBUFFER (XTS_PARSER (parser)->buffer);
-
-      /* We can use XFIXNUM, XCAR, XCDR freely because we have checked
-	 the input by treesit_check_range_argument.  */
-      for (int idx = 0; !NILP (ranges); idx++, ranges = XCDR (ranges))
-	{
-	  Lisp_Object range = XCAR (ranges);
-	  ptrdiff_t beg_byte = buf_charpos_to_bytepos (buffer,
-						       XFIXNUM (XCAR (range)));
-	  ptrdiff_t end_byte = buf_charpos_to_bytepos (buffer,
-						       XFIXNUM (XCDR (range)));
-	  /* Shouldn't violate assertion since we just checked for
-	     buffer size at the beginning of this function.  */
-	  eassert (beg_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX);
-	  eassert (end_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX);
-	  /* We don't care about start and end points, put in dummy
-	     values.  */
-	  TSRange rg = {{0, 0}, {0, 0},
-			(uint32_t) beg_byte - BUF_BEGV_BYTE (buffer),
-			(uint32_t) end_byte - BUF_BEGV_BYTE (buffer)};
-	  treesit_ranges[idx] = rg;
-	}
+      uint32_t len = 0;
+      TSRange *treesit_ranges = treesit_make_ts_ranges (ranges, parser, &len);
       success = ts_parser_set_included_ranges (XTS_PARSER (parser)->parser,
 					       treesit_ranges, len);
       xfree (treesit_ranges);
@@ -1831,26 +1893,9 @@ See also `treesit-parser-set-included-ranges'.  */)
   treesit_check_parser (parser);
   treesit_initialize ();
 
-  /* Our return value depends on the buffer state (BUF_BEGV_BYTE,
-     etc), so we need to sync up.  */
-  treesit_check_buffer_size (XBUFFER (XTS_PARSER (parser)->buffer));
   treesit_sync_visible_region (parser);
-  /* When the parser doesn't have a range set and we call
-     ts_parser_included_ranges on it, it doesn't return an empty list,
-     but rather return DEFAULT_RANGE.  (A single range where start_byte
-     = 0, end_byte = UINT32_MAX).  So we need to track whether the
-     parser is ranged ourselves.  */
-  if (NILP (XTS_PARSER (parser)->last_set_ranges))
-    return Qnil;
 
-  uint32_t len;
-  const TSRange *ranges
-    = ts_parser_included_ranges (XTS_PARSER (parser)->parser, &len);
-
-  struct buffer *buffer = XBUFFER (XTS_PARSER (parser)->buffer);
-
-
-  return treesit_make_ranges (ranges, len, parser, buffer);
+  return XTS_PARSER (parser)->last_set_ranges;
 }
 
 DEFUN ("treesit-parser-notifiers", Ftreesit_parser_notifiers,
diff --git a/src/treesit.h b/src/treesit.h
index f8227a64b0a..3fc42a4ba24 100644
--- a/src/treesit.h
+++ b/src/treesit.h
@@ -45,12 +45,23 @@ struct Lisp_TS_Parser
      same tag.  A tag is primarily used to differentiate between
      parsers for the same language.  */
   Lisp_Object tag;
-  /* The Lisp ranges last set.  This is use to compare to the new ranges
-     the users wants to set, and avoid reparse if the new ranges is the
-     same as the last set one.  This might go out of sync with the
-     ranges we return from Ftreesit_parser_included_ranges, if we did a
-     ranges fix in treesit_sync_visible_region, but I don't think
-     that'll cause any harm.  */
+  /* The Lisp ranges last set.  One purpose for it is to compare to the
+     new ranges the users wants to set, and avoid reparse if the new
+     ranges is the same as the current one.  Another purpose is to store
+     the ranges in charpos (ts api returns ranges in bytepos).  We need
+     to use charpos so we don't end up having a range cut into a
+     multibyte character.  (See (ref:bytepos-range-pitfall) in treesit.c
+     for more detail.)
+
+     treesit-parser-set-included-ranges sets this field;
+     treesit-parser-included-ranges directly returns this field, and
+     before each reparse, treesit_sync_visible_region uses this to
+     calculate a range for the parser that fits in the visible region.
+
+     Trivia: when the parser doesn't have a range set and we call
+     ts_parser_included_ranges on it, it doesn't return an empty list,
+     but rather return DEFAULT_RANGE.  (A single range where start_byte
+     = 0, end_byte = UINT32_MAX).  */
   Lisp_Object last_set_ranges;
   /* The buffer associated with this parser.  */
   Lisp_Object buffer;
diff --git a/test/src/treesit-tests.el b/test/src/treesit-tests.el
index 3431ba5f4dd..98aaeb62781 100644
--- a/test/src/treesit-tests.el
+++ b/test/src/treesit-tests.el
@@ -684,6 +684,33 @@ visible_end.)"
       (should (equal '((16 . 28)) (treesit-query-range
                                    'javascript query nil nil '(1 . -1)))))))
 
+(ert-deftest treesit-range-fixup-after-edit ()
+  "Tests if Emacs can fix OOB ranges after deleting text or narrowing."
+  (skip-unless (treesit-language-available-p 'json))
+  (with-temp-buffer
+    (let ((parser (treesit-parser-create 'json)))
+      (insert "11111111111111111111")
+      (treesit-parser-set-included-ranges parser '((1 . 20)))
+      (treesit-parser-root-node parser)
+      (should (equal (treesit-parser-included-ranges parser)
+                     '((1 . 20))))
+
+      (narrow-to-region 5 15)
+      (should (equal (treesit-parser-included-ranges parser)
+                     '((5 . 15))))
+
+      (widen)
+      ;; Trickier ranges
+      ;; 11111111111111111111
+      ;; [    ]      [      ]
+      ;;    {  narrow   }
+      (treesit-parser-set-included-ranges parser '((1 . 7) (10 . 15)))
+      (should (equal (treesit-parser-included-ranges parser)
+                     '((1 . 7) (10 . 15))))
+      (narrow-to-region 5 13)
+      (should (equal (treesit-parser-included-ranges parser)
+                     '((5 . 7) (10 . 13)))))))
+
 ;;; Multiple language
 
 (ert-deftest treesit-multi-lang ()