Add support for parsing quoted strings containing escaped octals.

This commit is contained in:
Tom Alexander 2023-09-29 22:54:50 -04:00
parent 6c77586960
commit 896250836b
Signed by: talexander
GPG Key ID: D3A179C9A53C0EDE

View File

@ -1,9 +1,10 @@
use std::collections::HashMap; use std::collections::HashMap;
use nom::branch::alt; use nom::branch::alt;
use nom::bytes::complete::escaped;
use nom::bytes::complete::tag; use nom::bytes::complete::tag;
use nom::bytes::complete::take_till1; use nom::bytes::complete::take_till1;
use nom::character::complete::anychar;
use nom::character::complete::digit1;
use nom::character::complete::multispace0; use nom::character::complete::multispace0;
use nom::character::complete::multispace1; use nom::character::complete::multispace1;
use nom::character::complete::one_of; use nom::character::complete::one_of;
@ -11,6 +12,7 @@ use nom::combinator::map;
use nom::combinator::not; use nom::combinator::not;
use nom::combinator::opt; use nom::combinator::opt;
use nom::combinator::peek; use nom::combinator::peek;
use nom::combinator::recognize;
use nom::multi::separated_list1; use nom::multi::separated_list1;
use nom::sequence::delimited; use nom::sequence::delimited;
use nom::sequence::preceded; use nom::sequence::preceded;
@ -18,6 +20,8 @@ use nom::sequence::tuple;
use crate::error::Res; use crate::error::Res;
const MAX_OCTAL_LENGTH: usize = 3;
#[derive(Debug)] #[derive(Debug)]
pub enum Token<'s> { pub enum Token<'s> {
Atom(&'s str), Atom(&'s str),
@ -35,6 +39,7 @@ pub struct TextWithProperties<'s> {
/// State of the byte-by-byte scanner used by `unquote` while decoding the
/// interior of a quoted string.
enum ParseState { enum ParseState {
/// Not inside an escape sequence: a backslash switches to `Escape`, any
/// other byte is copied to the output unchanged.
Normal, Normal,
/// The previous byte was a backslash; the current byte selects which
/// escape is being decoded.
Escape, Escape,
/// Inside a backslash-octal escape. Holds the ASCII octal digits
/// (`b'0'..=b'7'`) collected so far; `unquote` stops collecting after
/// `MAX_OCTAL_LENGTH` digits and decodes them into a single byte.
Octal(Vec<u8>),
} }
impl<'s> Token<'s> { impl<'s> Token<'s> {
@ -116,7 +121,7 @@ fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str {
} }
pub(crate) fn unquote(text: &str) -> Result<String, Box<dyn std::error::Error>> { pub(crate) fn unquote(text: &str) -> Result<String, Box<dyn std::error::Error>> {
let mut out = String::with_capacity(text.len()); let mut out: Vec<u8> = Vec::with_capacity(text.len());
if !text.starts_with(r#"""#) { if !text.starts_with(r#"""#) {
return Err("Quoted text does not start with quote.".into()); return Err("Quoted text does not start with quote.".into());
} }
@ -125,30 +130,53 @@ pub(crate) fn unquote(text: &str) -> Result<String, Box<dyn std::error::Error>>
} }
let interior_text = &text[1..(text.len() - 1)]; let interior_text = &text[1..(text.len() - 1)];
let mut state = ParseState::Normal; let mut state = ParseState::Normal;
for current_char in interior_text.chars().into_iter() { for current_char in interior_text.bytes().into_iter() {
// Check to see if octal finished
state = match (state, current_char) { state = match (state, current_char) {
(ParseState::Normal, '\\') => ParseState::Escape, (ParseState::Octal(octal), b'0'..=b'7') if octal.len() < MAX_OCTAL_LENGTH => {
ParseState::Octal(octal)
}
(ParseState::Octal(octal), _) => {
let octal_number_string = String::from_utf8(octal)?;
let decoded_byte = u8::from_str_radix(&octal_number_string, 8)?;
out.push(decoded_byte);
ParseState::Normal
}
(state, _) => state,
};
state = match (state, current_char) {
(ParseState::Normal, b'\\') => ParseState::Escape,
(ParseState::Normal, _) => { (ParseState::Normal, _) => {
out.push(current_char); out.push(current_char);
ParseState::Normal ParseState::Normal
} }
(ParseState::Escape, 'n') => { (ParseState::Escape, b'n') => {
out.push('\n'); out.push(b'\n');
ParseState::Normal ParseState::Normal
} }
(ParseState::Escape, '\\') => { (ParseState::Escape, b'\\') => {
out.push('\\'); out.push(b'\\');
ParseState::Normal ParseState::Normal
} }
(ParseState::Escape, '"') => { (ParseState::Escape, b'"') => {
out.push('"'); out.push(b'"');
ParseState::Normal ParseState::Normal
} }
_ => todo!(), (ParseState::Escape, b'0'..=b'7') => {
let mut octal = Vec::with_capacity(MAX_OCTAL_LENGTH);
octal.push(current_char);
ParseState::Octal(octal)
}
(ParseState::Octal(mut octal), b'0'..=b'7') => {
octal.push(current_char);
ParseState::Octal(octal)
}
_ => panic!("Invalid state unquoting string."),
}; };
} }
Ok(out) Ok(String::from_utf8(out)?)
} }
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
@ -210,15 +238,30 @@ fn unquoted_atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn quoted_atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> { fn quoted_atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
let (remaining, _) = tag(r#"""#)(input)?; let (mut remaining, _) = tag(r#"""#)(input)?;
let (remaining, _) = escaped( let mut in_escape = false;
take_till1(|c| match c { loop {
'\\' | '"' => true, if in_escape {
_ => false, let (remain, _) = alt((recognize(one_of(r#""n\\"#)), digit1))(remaining)?;
}), remaining = remain;
'\\', in_escape = false;
one_of(r#""n\\"#), } else {
)(remaining)?; let end_quote = tag::<_, _, nom::error::Error<_>>(r#"""#)(remaining);
if end_quote.is_ok() {
break;
}
let escape_backslash = tag::<_, _, nom::error::Error<_>>("\\")(remaining);
if let Ok((remain, _)) = escape_backslash {
remaining = remain;
in_escape = true;
continue;
}
let (remain, _) = anychar(remaining)?;
remaining = remain;
}
}
let (remaining, _) = tag(r#"""#)(remaining)?; let (remaining, _) = tag(r#"""#)(remaining)?;
let source = get_consumed(input, remaining); let source = get_consumed(input, remaining);
Ok((remaining, Token::Atom(source.into()))) Ok((remaining, Token::Atom(source.into())))