From 896250836b6c195128859593612057c7a62d5cf9 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Fri, 29 Sep 2023 22:54:50 -0400 Subject: [PATCH] Add support for parsing quoted strings containing escaped octals. --- src/compare/sexp.rs | 85 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 21 deletions(-) diff --git a/src/compare/sexp.rs b/src/compare/sexp.rs index 0cd558e..069863a 100644 --- a/src/compare/sexp.rs +++ b/src/compare/sexp.rs @@ -1,9 +1,10 @@ use std::collections::HashMap; use nom::branch::alt; -use nom::bytes::complete::escaped; use nom::bytes::complete::tag; use nom::bytes::complete::take_till1; +use nom::character::complete::anychar; +use nom::character::complete::digit1; use nom::character::complete::multispace0; use nom::character::complete::multispace1; use nom::character::complete::one_of; @@ -11,6 +12,7 @@ use nom::combinator::map; use nom::combinator::not; use nom::combinator::opt; use nom::combinator::peek; +use nom::combinator::recognize; use nom::multi::separated_list1; use nom::sequence::delimited; use nom::sequence::preceded; @@ -18,6 +20,8 @@ use nom::sequence::tuple; use crate::error::Res; +const MAX_OCTAL_LENGTH: usize = 3; + #[derive(Debug)] pub enum Token<'s> { Atom(&'s str), @@ -35,6 +39,7 @@ pub struct TextWithProperties<'s> { enum ParseState { Normal, Escape, + Octal(Vec), } impl<'s> Token<'s> { @@ -116,7 +121,7 @@ fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str { } pub(crate) fn unquote(text: &str) -> Result> { - let mut out = String::with_capacity(text.len()); + let mut out: Vec = Vec::with_capacity(text.len()); if !text.starts_with(r#"""#) { return Err("Quoted text does not start with quote.".into()); } @@ -125,30 +130,53 @@ pub(crate) fn unquote(text: &str) -> Result> } let interior_text = &text[1..(text.len() - 1)]; let mut state = ParseState::Normal; - for current_char in interior_text.chars().into_iter() { + for current_char in interior_text.bytes().into_iter() { + // Check to see if octal finished state = match (state, current_char) { - (ParseState::Normal, '\\') => ParseState::Escape, + (ParseState::Octal(octal), b'0'..=b'7') if octal.len() < MAX_OCTAL_LENGTH => { + ParseState::Octal(octal) + } + (ParseState::Octal(octal), _) => { + let octal_number_string = String::from_utf8(octal)?; + let decoded_byte = u8::from_str_radix(&octal_number_string, 8)?; + out.push(decoded_byte); + ParseState::Normal + } + (state, _) => state, + }; + + state = match (state, current_char) { + (ParseState::Normal, b'\\') => ParseState::Escape, (ParseState::Normal, _) => { out.push(current_char); ParseState::Normal } - (ParseState::Escape, 'n') => { - out.push('\n'); + (ParseState::Escape, b'n') => { + out.push(b'\n'); ParseState::Normal } - (ParseState::Escape, '\\') => { - out.push('\\'); + (ParseState::Escape, b'\\') => { + out.push(b'\\'); ParseState::Normal } - (ParseState::Escape, '"') => { - out.push('"'); + (ParseState::Escape, b'"') => { + out.push(b'"'); ParseState::Normal } - _ => todo!(), + (ParseState::Escape, b'0'..=b'7') => { + let mut octal = Vec::with_capacity(MAX_OCTAL_LENGTH); + octal.push(current_char); + ParseState::Octal(octal) + } + (ParseState::Octal(mut octal), b'0'..=b'7') => { + octal.push(current_char); + ParseState::Octal(octal) + } + _ => panic!("Invalid state unquoting string."), }; } - Ok(out) + Ok(String::from_utf8(out)?) } #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] @@ -210,15 +238,30 @@ fn unquoted_atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> { #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] fn quoted_atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> { - let (remaining, _) = tag(r#"""#)(input)?; - let (remaining, _) = escaped( - take_till1(|c| match c { - '\\' | '"' => true, - _ => false, - }), - '\\', - one_of(r#""n\\"#), - )(remaining)?; + let (mut remaining, _) = tag(r#"""#)(input)?; + let mut in_escape = false; + loop { + if in_escape { + let (remain, _) = alt((recognize(one_of(r#""n\\"#)), digit1))(remaining)?; + remaining = remain; + in_escape = false; + } else { + let end_quote = tag::<_, _, nom::error::Error<_>>(r#"""#)(remaining); + if end_quote.is_ok() { + break; + } + + let escape_backslash = tag::<_, _, nom::error::Error<_>>("\\")(remaining); + if let Ok((remain, _)) = escape_backslash { + remaining = remain; + in_escape = true; + continue; + } + + let (remain, _) = anychar(remaining)?; + remaining = remain; + } + } let (remaining, _) = tag(r#"""#)(remaining)?; let source = get_consumed(input, remaining); Ok((remaining, Token::Atom(source.into())))