Fix handling of Unicode in compare tests.

2023-08-31 20:19:28 -04:00 · 2023-08-31 20:19:28 -04:00 · 80d77ff5d6
commit 80d77ff5d6
parent ee92049e5d
4 changed files with 8 additions and 5 deletions
--- a/build.rs
+++ b/build.rs
@ -87,7 +87,6 @@ fn is_expect_fail(name: &str) -> Option<&str> {
        "autogen_greater_element_drawer_drawer_with_headline_inside" => Some("Apparently lines with :end: become their own paragraph. This odd behavior needs to be investigated more."),
        "autogen_element_container_priority_footnote_definition_dynamic_block" => Some("Apparently broken begin lines become their own paragraph."),
        "autogen_lesser_element_paragraphs_paragraph_with_backslash_line_breaks" => Some("The text we're getting out of the parse tree is already processed to remove line breaks, so our comparison needs to take that into account."),
-        "autogen_unicode_hearts" => Some("Unicode is coming out of emacs strange."),
        _ => None,
    }
 }
--- a/docker/organic_test/Dockerfile
+++ b/docker/organic_test/Dockerfile
@ -26,6 +26,7 @@ RUN make DESTDIR="/root/dist" install


 FROM rustlang/rust:nightly-alpine3.17
+ENV LANG=en_US.UTF-8
 RUN apk add --no-cache musl-dev ncurses gnutls
 RUN cargo install --locked --no-default-features --features ci-autoclean cargo-cache
 COPY --from=build-emacs /root/dist/ /
--- a/src/compare/diff.rs
+++ b/src/compare/diff.rs
@ -1457,7 +1457,7 @@ fn compare_plain_text<'s>(
        .as_atom()?
        .parse()?;
    let emacs_text_length = end_ind - start_ind;
-    if rust_source.len() != emacs_text_length {
+    if rust_source.chars().count() != emacs_text_length {
        this_status = DiffStatus::Bad;
        message = Some(format!(
            "(emacs len != rust len) {:?} != {:?}",
--- a/src/compare/util.rs
+++ b/src/compare/util.rs
@ -13,7 +13,7 @@ fn is_slice_of(parent: &str, child: &str) -> bool {
 /// Get the offset into source that the rust object exists at.
 ///
 /// These offsets are zero-based unlike the elisp ones.
-pub fn get_offsets<'s, S: Source<'s>>(source: &'s str, rust_object: &'s S) -> (usize, usize) {
+fn get_offsets<'s, S: Source<'s>>(source: &'s str, rust_object: &'s S) -> (usize, usize) {
    let rust_object_source = rust_object.get_source();
    assert!(is_slice_of(source, rust_object_source));
    let offset = rust_object_source.as_ptr() as usize - source.as_ptr() as usize;
@ -50,8 +50,11 @@ pub fn assert_bounds<'s, S: Source<'s>>(
        standard_properties.end.ok_or("Token should have an end.")?,
    );
    let (rust_begin, rust_end) = get_offsets(source, rust);
-    if (rust_begin + 1) != begin || (rust_end + 1) != end {
-        Err(format!("Rust bounds (in bytes) ({rust_begin}, {rust_end}) do not match emacs bounds ({emacs_begin}, {emacs_end})", rust_begin = rust_begin + 1, rust_end = rust_end + 1, emacs_begin=begin, emacs_end=end))?;
+    let rust_begin_char_offset = (&source[..rust_begin]).chars().count();
+    let rust_end_char_offset =
+        rust_begin_char_offset + (&source[rust_begin..rust_end]).chars().count();
+    if (rust_begin_char_offset + 1) != begin || (rust_end_char_offset + 1) != end {
+        Err(format!("Rust bounds (in chars) ({rust_begin}, {rust_end}) do not match emacs bounds ({emacs_begin}, {emacs_end})", rust_begin = rust_begin_char_offset + 1, rust_end = rust_end_char_offset + 1, emacs_begin=begin, emacs_end=end))?;
    }

    Ok(())