Handle unicode.

Turns out JavaScript iterates over strings by code point, but the string functions like slicing, lastIndexOf, and indexing with [] are all based on UTF-16 code units, without taking into account surrogate pairs like the orange heart emoji. It would have been nice if that was mentioned in the documentation...
This commit is contained in:
Tom Alexander 2023-08-18 23:17:27 -04:00
parent 372542d914
commit c4762510f4
Signed by: talexander
GPG Key ID: D3A179C9A53C0EDE

View File

@ -59,7 +59,7 @@ function renderAstNode(originalSource, depth, astNode) {
const nodeElem = document.createElement("div");
nodeElem.classList.add("ast_node");
let sourceForNode = originalSource.slice(astNode.position.start_character - 1, astNode.position.end_character - 1);
let sourceForNode = unicodeAwareSlice(originalSource, astNode.position.start_character - 1, astNode.position.end_character - 1);
// Since sourceForList is a string, JSON.stringify will escape with backslashes and wrap the text in quotation marks, ensuring that the string ends up on a single line. Coincidentally, this is the behavior we want.
let escapedSource = JSON.stringify(sourceForNode);
@ -137,14 +137,14 @@ function highlightLine(htmlName, lineOffset) {
}
function highlightCharacters(htmlName, originalSource, startCharacter, endCharacter) {
let sourceBefore = originalSource.slice(0, startCharacter - 1);
let precedingLineBreak = sourceBefore.lastIndexOf("\n");
let sourceBefore = unicodeAwareSlice(originalSource, 0, startCharacter - 1);
let precedingLineBreak = unicodeAwareLastIndexOfCharacter(sourceBefore, "\n");
let characterIndexOnLine = precedingLineBreak !== -1 ? startCharacter - precedingLineBreak - 1 : startCharacter;
let lineNumber = (sourceBefore.match(/\r?\n/g) || '').length + 1;
for (let characterIndex = startCharacter; characterIndex < endCharacter; ++characterIndex) {
document.querySelector(`#${htmlName} > code:nth-child(${lineNumber}) > span:nth-child(${characterIndexOnLine})`)?.classList.add("highlighted");
if (originalSource[characterIndex - 1] == "\n") {
if (unicodeAwareCharAtOffset(originalSource, characterIndex - 1) == "\n") {
++lineNumber;
characterIndexOnLine = 1;
} else {
@ -153,3 +153,43 @@ function highlightCharacters(htmlName, originalSource, startCharacter, endCharac
}
}
/**
 * Slice a string by Unicode code point offsets instead of UTF-16 code units.
 *
 * String.prototype.slice counts UTF-16 code units, so a character stored as a
 * surrogate pair (for example an emoji) occupies two positions there. This
 * function counts each such character as a single position.
 *
 * @param {string} text - The string to slice.
 * @param {number} start - Zero-based code point offset of the first character to include.
 * @param {number} end - Zero-based code point offset at which to stop (exclusive).
 * @returns {string} The code points of `text` in the range [start, end).
 */
function unicodeAwareSlice(text, start, end) {
  let output = "";
  let i = 0;
  // for...of walks the string by code point, keeping surrogate pairs intact.
  // `chr` is declared with const so it does not leak into the global scope
  // (and does not throw a ReferenceError in strict mode / ES modules).
  for (const chr of text) {
    if (i >= end) {
      break;
    }
    if (i >= start) {
      output += chr;
    }
    ++i;
  }
  return output;
}
/**
 * Find the last occurrence of a single character in a string, reporting its
 * position as a Unicode code point offset rather than a UTF-16 code unit
 * offset (which is what String.prototype.lastIndexOf returns).
 *
 * @param {string} haystack - The string to search.
 * @param {string} needle - The character to look for. It is compared against
 *   one code point at a time, so a needle longer than one code point never matches.
 * @returns {number} Zero-based code point offset of the last match, or -1 if
 *   `needle` does not occur in `haystack`.
 */
function unicodeAwareLastIndexOfCharacter(haystack, needle) {
  let found = -1;
  let i = 0;
  // for...of walks the string by code point, so `i` counts a surrogate pair
  // as one position. `chr` is declared with const to avoid an implicit global.
  for (const chr of haystack) {
    if (chr === needle) {
      found = i;
    }
    ++i;
  }
  return found;
}
/**
 * Return the character at a Unicode code point offset in a string.
 *
 * Indexing with `text[offset]` counts UTF-16 code units and can return half of
 * a surrogate pair; this function counts whole code points instead.
 *
 * @param {string} text - The string to read from.
 * @param {number} offset - Zero-based code point offset of the wanted character.
 * @returns {string|undefined} The character at `offset`, or undefined when
 *   `offset` is outside the string.
 */
function unicodeAwareCharAtOffset(text, offset) {
  let remaining = offset;
  // for...of walks the string by code point. `chr` is declared with const to
  // avoid an implicit global (a ReferenceError in strict mode / ES modules).
  for (const chr of text) {
    if (remaining === 0) {
      return chr;
    }
    --remaining;
  }
  // Offset was negative or past the end of the string.
  return undefined;
}