From 42dbda494a47c14fab34ff21ecaa2499dc7d2400 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Sun, 8 Oct 2023 14:11:46 -0400 Subject: [PATCH] Switch to using a similar optimized Cow function for regular link. --- src/types/object.rs | 67 ++++-------------------- src/types/util.rs | 122 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+), 58 deletions(-) diff --git a/src/types/object.rs b/src/types/object.rs index 3d5b0b5..270ecdb 100644 --- a/src/types/object.rs +++ b/src/types/object.rs @@ -1,6 +1,7 @@ use std::borrow::Borrow; use std::borrow::Cow; +use super::util::coalesce_whitespace_if_line_break; use super::util::remove_line_break; use super::util::remove_whitespace_if_line_break; use super::GetStandardProperties; @@ -665,67 +666,22 @@ pub enum LinkType<'s> { Fuzzy, } -#[derive(Debug)] -enum ParserState { - Normal, - InWhitespace, -} - -/// Org-mode treats multiple consecutive whitespace characters as a single space. This function performs that transformation. -/// -/// Example: `orgify_text("foo \t\n bar") == "foo bar"` -pub(crate) fn orgify_text<T: AsRef<str>>(raw_text: T) -> String { - let raw_text = raw_text.as_ref(); - let mut ret = String::with_capacity(raw_text.len()); - let mut state = ParserState::Normal; - for c in raw_text.chars() { - state = match (&state, c) { - (ParserState::Normal, _) if " \t\r\n".contains(c) => { - ret.push(' '); - ParserState::InWhitespace - } - (ParserState::InWhitespace, _) if " \t\r\n".contains(c) => ParserState::InWhitespace, - (ParserState::Normal, _) => { - ret.push(c); - ParserState::Normal - } - (ParserState::InWhitespace, _) => { - ret.push(c); - ParserState::Normal - } - }; - } - ret -} - impl<'s> RegularLink<'s> { /// Orgify the raw_link if it contains line breaks. 
- pub fn get_raw_link(&self) -> String { - if self.raw_link.contains('\n') { - orgify_text(Borrow::<str>::borrow(&self.raw_link)) - } else { - self.raw_link.clone().into_owned() - } + pub fn get_raw_link(&'s self) -> Cow<'s, str> { + coalesce_whitespace_if_line_break(&self.raw_link) } /// Orgify the path if it contains line breaks. - pub fn get_path(&self) -> String { - if self.path.contains('\n') { - orgify_text(Borrow::<str>::borrow(&self.path)) - } else { - self.path.clone().into_owned() - } + pub fn get_path(&'s self) -> Cow<'s, str> { + coalesce_whitespace_if_line_break(&self.path) } /// Orgify the search_option if it contains line breaks. - pub fn get_search_option(&self) -> Option<String> { - self.search_option.as_ref().map(|search_option| { - if search_option.contains('\n') { - orgify_text(search_option) - } else { - search_option.clone().into_owned() - } - }) + pub fn get_search_option(&'s self) -> Option<Cow<'s, str>> { + self.search_option + .as_ref() + .map(|search_option| coalesce_whitespace_if_line_break(search_option.borrow())) } } @@ -735,11 +691,6 @@ impl<'s> RadioLink<'s> { } } -enum PathState { - Normal, - HasLineBreak(String), -} - impl<'s> AngleLink<'s> { /// Remove line breaks but preserve multiple consecutive spaces. pub fn get_path(&self) -> Cow<'s, str> { diff --git a/src/types/util.rs b/src/types/util.rs index 31a06b8..c336474 100644 --- a/src/types/util.rs +++ b/src/types/util.rs @@ -75,3 +75,125 @@ enum RemoveLineBreakState { Normal, HasLineBreak(String), } + +/// Coalesces each run of whitespace in a string into a single space if any line breaks are present. +/// +/// Example: "foo bar" => "foo bar" but "foo \n bar" => "foo bar". 
/// Collapses every run of whitespace into a single space, but only when the
/// input contains at least one line break.
///
/// Whitespace means space, tab, carriage return and line feed. A run of
/// whitespace is replaced by exactly one space; it is never removed outright.
/// Input without a `\n` is returned borrowed and unmodified, so the
/// single-line case does not allocate.
///
/// Example: "foo \t bar" => "foo \t bar" (no line break, returned as-is) but
/// "foo \r\n  bar" => "foo bar".
pub(crate) fn coalesce_whitespace_if_line_break<'s>(input: &'s str) -> Cow<'s, str> {
    // Fast path: nothing to rewrite, hand the caller's slice straight back.
    if !input.contains('\n') {
        return Cow::Borrowed(input);
    }

    // The output can only shrink, so the input length is a safe upper bound.
    let mut ret = String::with_capacity(input.len());
    // True while the most recently consumed character belonged to a whitespace run.
    let mut in_whitespace = false;
    for c in input.chars() {
        // One whitespace set is used for the whole string. Treating '\r'
        // differently before and after the first '\n' would turn "foo\r\nbar"
        // into "foo\r bar" instead of "foo bar".
        if matches!(c, ' ' | '\t' | '\r' | '\n') {
            if !in_whitespace {
                // Only the first character of a run contributes a space.
                ret.push(' ');
            }
            in_whitespace = true;
        } else {
            ret.push(c);
            in_whitespace = false;
        }
    }
    Cow::Owned(ret)
}