From acfc5e5e68923b2313e736fbf8602d475aac8df8 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Fri, 20 Oct 2023 12:44:24 -0400 Subject: [PATCH] Only allocate memory when unquoting sexp string that contains escapes. If the quoted string contains no escape sequences, then unquoting the string can be done by simply shaving off the leading and trailing quotation marks which can be a slice operation. By returning Cow, we can return either a borrowed slice or an owned String. --- src/compare/compare_field.rs | 6 +- src/compare/sexp.rs | 161 ++++++++++++++++++++++++++++------- src/compare/util.rs | 7 +- 3 files changed, 140 insertions(+), 34 deletions(-) diff --git a/src/compare/compare_field.rs b/src/compare/compare_field.rs index b5aa2417..1632da5b 100644 --- a/src/compare/compare_field.rs +++ b/src/compare/compare_field.rs @@ -1,3 +1,5 @@ +use std::borrow::Borrow; +use std::borrow::Cow; use std::collections::BTreeSet; use std::fmt::Debug; use std::str::FromStr; @@ -262,11 +264,11 @@ pub(crate) fn compare_property_set_of_quoted_string< .iter() .map(|e| e.as_atom()) .collect::, _>>()?; - let value: Vec = value + let value: Vec> = value .into_iter() .map(unquote) .collect::, _>>()?; - let value: BTreeSet<&str> = value.iter().map(|e| e.as_str()).collect(); + let value: BTreeSet<&str> = value.iter().map(|e| e.borrow()).collect(); let mismatched: Vec<_> = value.symmetric_difference(&rust_value).copied().collect(); if !mismatched.is_empty() { let this_status = DiffStatus::Bad; diff --git a/src/compare/sexp.rs b/src/compare/sexp.rs index 85cab3f1..a45816a4 100644 --- a/src/compare/sexp.rs +++ b/src/compare/sexp.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::collections::HashMap; use nom::branch::alt; @@ -36,12 +37,6 @@ pub struct TextWithProperties<'s> { pub(crate) properties: Vec>, } -enum ParseState { - Normal, - Escape, - Octal(Vec), -} - impl<'s> Token<'s> { pub(crate) fn as_vector<'p>( &'p self, @@ -117,8 +112,27 @@ fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str { &input[..offset] } -pub(crate) fn unquote(text: &str) -> Result> { - let mut out: Vec = Vec::with_capacity(text.len()); +#[derive(Debug)] +enum UnquoteState { + Normal, + Escape, + HasEscape { + out: Vec, + }, + HasEscapeEscape { + out: Vec, + }, + Octal { + octal_begin_offset: usize, + octal: Vec, + }, + HasEscapeOctal { + out: Vec, + octal: Vec, + }, +} + +pub(crate) fn unquote(text: &str) -> Result, Box> { if !text.starts_with('"') { return Err("Quoted text does not start with quote.".into()); } @@ -126,54 +140,143 @@ pub(crate) fn unquote(text: &str) -> Result> return Err("Quoted text does not end with quote.".into()); } let interior_text = &text[1..(text.len() - 1)]; - let mut state = ParseState::Normal; - for current_char in interior_text.bytes() { + let mut state = UnquoteState::Normal; + for (offset, current_char) in interior_text.bytes().enumerate() { // Check to see if octal finished state = match (state, current_char) { - (ParseState::Octal(octal), b'0'..=b'7') if octal.len() < MAX_OCTAL_LENGTH => { - ParseState::Octal(octal) + ( + UnquoteState::Octal { + octal_begin_offset, + octal, + }, + b'0'..=b'7', + ) if octal.len() < MAX_OCTAL_LENGTH => UnquoteState::Octal { + octal_begin_offset, + octal, + }, + ( + UnquoteState::Octal { + octal_begin_offset, + octal, + }, + _, + ) => { + let octal_number_string = String::from_utf8(octal)?; + let decoded_byte = u8::from_str_radix(&octal_number_string, 8)?; + let mut out: Vec = Vec::with_capacity(interior_text.len()); + out.extend_from_slice(&interior_text.as_bytes()[..octal_begin_offset]); + out.push(decoded_byte); + UnquoteState::HasEscape { out } } - (ParseState::Octal(octal), _) => { + (UnquoteState::HasEscapeOctal { out, octal }, b'0'..=b'7') + if octal.len() < MAX_OCTAL_LENGTH => + { + UnquoteState::HasEscapeOctal { out, octal } + } + (UnquoteState::HasEscapeOctal { mut out, octal }, _) => { let octal_number_string = String::from_utf8(octal)?; let decoded_byte = u8::from_str_radix(&octal_number_string, 8)?; out.push(decoded_byte); - ParseState::Normal + UnquoteState::HasEscape { out } } (state, _) => state, }; state = match (state, current_char) { - (ParseState::Normal, b'\\') => ParseState::Escape, - (ParseState::Normal, _) => { + (UnquoteState::Normal, b'\\') => UnquoteState::Escape, + (UnquoteState::Normal, _) => UnquoteState::Normal, + (UnquoteState::HasEscape { out }, b'\\') => UnquoteState::HasEscapeEscape { out }, + (UnquoteState::HasEscape { mut out }, _) => { out.push(current_char); - ParseState::Normal + UnquoteState::HasEscape { out } } - (ParseState::Escape, b'n') => { + (UnquoteState::Escape, b'n') => { + let mut out: Vec = Vec::with_capacity(interior_text.len()); + // Subtract 1 from offset to account for backslash. + out.extend_from_slice(&interior_text.as_bytes()[..(offset - 1)]); out.push(b'\n'); - ParseState::Normal + UnquoteState::HasEscape { out } } - (ParseState::Escape, b'\\') => { + (UnquoteState::HasEscapeEscape { mut out }, b'n') => { + out.push(b'\n'); + UnquoteState::HasEscape { out } + } + (UnquoteState::Escape, b'\\') => { + let mut out: Vec = Vec::with_capacity(interior_text.len()); + // Subtract 1 from offset to account for backslash. + out.extend_from_slice(&interior_text.as_bytes()[..(offset - 1)]); out.push(b'\\'); - ParseState::Normal + UnquoteState::HasEscape { out } } - (ParseState::Escape, b'"') => { + (UnquoteState::HasEscapeEscape { mut out }, b'\\') => { + out.push(b'\\'); + UnquoteState::HasEscape { out } + } + (UnquoteState::Escape, b'"') => { + let mut out: Vec = Vec::with_capacity(interior_text.len()); + // Subtract 1 from offset to account for backslash. + out.extend_from_slice(&interior_text.as_bytes()[..(offset - 1)]); out.push(b'"'); - ParseState::Normal + UnquoteState::HasEscape { out } } - (ParseState::Escape, b'0'..=b'7') => { + (UnquoteState::HasEscapeEscape { mut out }, b'"') => { + out.push(b'"'); + UnquoteState::HasEscape { out } + } + (UnquoteState::Escape, b'0'..=b'7') => { let mut octal = Vec::with_capacity(MAX_OCTAL_LENGTH); octal.push(current_char); - ParseState::Octal(octal) + // Substract 1 from offset to account for backslash + UnquoteState::Octal { + octal_begin_offset: offset - 1, + octal, + } } - (ParseState::Octal(mut octal), b'0'..=b'7') => { + (UnquoteState::HasEscapeEscape { out }, b'0'..=b'7') => { + let mut octal = Vec::with_capacity(MAX_OCTAL_LENGTH); octal.push(current_char); - ParseState::Octal(octal) + // Substract 1 from offset to account for backslash + UnquoteState::HasEscapeOctal { out, octal } } - _ => panic!("Invalid state unquoting string."), + ( + UnquoteState::Octal { + octal_begin_offset, + mut octal, + }, + b'0'..=b'7', + ) => { + octal.push(current_char); + UnquoteState::Octal { + octal_begin_offset, + octal, + } + } + (UnquoteState::HasEscapeOctal { out, mut octal }, b'0'..=b'7') => { + octal.push(current_char); + UnquoteState::HasEscapeOctal { out, octal } + } + (state, _) => panic!( + "Invalid state unquoting string: {:?} | {} | {:?}", + state, offset, interior_text + ), }; } - Ok(String::from_utf8(out)?) + match state { + UnquoteState::Normal | UnquoteState::Escape | UnquoteState::Octal { .. } => { + Ok(Cow::Borrowed(interior_text)) + } + UnquoteState::HasEscape { out } => Ok(Cow::Owned(String::from_utf8(out)?)), + UnquoteState::HasEscapeEscape { mut out } => { + out.push(b'\\'); + Ok(Cow::Owned(String::from_utf8(out)?)) + } + UnquoteState::HasEscapeOctal { mut out, octal } => { + out.push(b'\\'); + out.extend(octal); + Ok(Cow::Owned(String::from_utf8(out)?)) + } + } } #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] diff --git a/src/compare/util.rs b/src/compare/util.rs index fd85938c..b7c09fbe 100644 --- a/src/compare/util.rs +++ b/src/compare/util.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::str::FromStr; use super::compare_field::compare_property_list_of_quoted_string; @@ -206,10 +207,10 @@ pub(crate) fn get_property_unquoted_atom<'s>( /// Get a named property containing an quoted string from the emacs token. /// /// Returns None if key is not found. -pub(crate) fn get_property_quoted_string( - emacs: &Token<'_>, +pub(crate) fn get_property_quoted_string<'s>( + emacs: &Token<'s>, key: &str, -) -> Result, Box> { +) -> Result>, Box> { get_property(emacs, key)? .map(Token::as_atom) .map_or(Ok(None), |r| r.map(Some))?