Add support for parsing quoted strings containing escaped octals.

This commit is contained in:
Tom Alexander 2023-09-29 22:54:50 -04:00
parent 6c77586960
commit 896250836b
Signed by: talexander
GPG Key ID: D3A179C9A53C0EDE

View File

@ -1,9 +1,10 @@
use std::collections::HashMap;
use nom::branch::alt;
use nom::bytes::complete::escaped;
use nom::bytes::complete::tag;
use nom::bytes::complete::take_till1;
use nom::character::complete::anychar;
use nom::character::complete::digit1;
use nom::character::complete::multispace0;
use nom::character::complete::multispace1;
use nom::character::complete::one_of;
@ -11,6 +12,7 @@ use nom::combinator::map;
use nom::combinator::not;
use nom::combinator::opt;
use nom::combinator::peek;
use nom::combinator::recognize;
use nom::multi::separated_list1;
use nom::sequence::delimited;
use nom::sequence::preceded;
@ -18,6 +20,8 @@ use nom::sequence::tuple;
use crate::error::Res;
/// Maximum number of octal digits collected for a single backslash escape
/// while unquoting; once this many digits are buffered the escape is decoded
/// and the next byte is treated as ordinary text.
const MAX_OCTAL_LENGTH: usize = 3;
#[derive(Debug)]
pub enum Token<'s> {
Atom(&'s str),
@ -35,6 +39,7 @@ pub struct TextWithProperties<'s> {
/// State of the byte-by-byte scanner used by `unquote` to decode the interior
/// of a quoted string.
enum ParseState {
/// Not inside an escape sequence: bytes are copied to the output as-is, and
/// a backslash switches the scanner to `Escape`.
Normal,
/// The previous byte was a backslash; the current byte selects the escape
/// (`n`, `\`, `"`, or the first digit of an octal escape).
Escape,
/// Inside an octal escape. Holds the ASCII octal digits (`0`-`7`) seen so
/// far; they are parsed as a base-8 number into one output byte once a
/// non-octal byte arrives or `MAX_OCTAL_LENGTH` digits have been collected.
// NOTE(review): the visible `unquote` loop only decodes these digits when a
// following byte is processed — confirm an octal escape at the very end of
// the string is flushed.
Octal(Vec<u8>),
}
impl<'s> Token<'s> {
@ -116,7 +121,7 @@ fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str {
}
pub(crate) fn unquote(text: &str) -> Result<String, Box<dyn std::error::Error>> {
let mut out = String::with_capacity(text.len());
let mut out: Vec<u8> = Vec::with_capacity(text.len());
if !text.starts_with(r#"""#) {
return Err("Quoted text does not start with quote.".into());
}
@ -125,30 +130,53 @@ pub(crate) fn unquote(text: &str) -> Result<String, Box<dyn std::error::Error>>
}
let interior_text = &text[1..(text.len() - 1)];
let mut state = ParseState::Normal;
for current_char in interior_text.chars().into_iter() {
for current_char in interior_text.bytes().into_iter() {
// Check to see if octal finished
state = match (state, current_char) {
(ParseState::Normal, '\\') => ParseState::Escape,
(ParseState::Octal(octal), b'0'..=b'7') if octal.len() < MAX_OCTAL_LENGTH => {
ParseState::Octal(octal)
}
(ParseState::Octal(octal), _) => {
let octal_number_string = String::from_utf8(octal)?;
let decoded_byte = u8::from_str_radix(&octal_number_string, 8)?;
out.push(decoded_byte);
ParseState::Normal
}
(state, _) => state,
};
state = match (state, current_char) {
(ParseState::Normal, b'\\') => ParseState::Escape,
(ParseState::Normal, _) => {
out.push(current_char);
ParseState::Normal
}
(ParseState::Escape, 'n') => {
out.push('\n');
(ParseState::Escape, b'n') => {
out.push(b'\n');
ParseState::Normal
}
(ParseState::Escape, '\\') => {
out.push('\\');
(ParseState::Escape, b'\\') => {
out.push(b'\\');
ParseState::Normal
}
(ParseState::Escape, '"') => {
out.push('"');
(ParseState::Escape, b'"') => {
out.push(b'"');
ParseState::Normal
}
_ => todo!(),
(ParseState::Escape, b'0'..=b'7') => {
let mut octal = Vec::with_capacity(MAX_OCTAL_LENGTH);
octal.push(current_char);
ParseState::Octal(octal)
}
(ParseState::Octal(mut octal), b'0'..=b'7') => {
octal.push(current_char);
ParseState::Octal(octal)
}
_ => panic!("Invalid state unquoting string."),
};
}
Ok(out)
Ok(String::from_utf8(out)?)
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
@ -210,15 +238,30 @@ fn unquoted_atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn quoted_atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
let (remaining, _) = tag(r#"""#)(input)?;
let (remaining, _) = escaped(
take_till1(|c| match c {
'\\' | '"' => true,
_ => false,
}),
'\\',
one_of(r#""n\\"#),
)(remaining)?;
let (mut remaining, _) = tag(r#"""#)(input)?;
let mut in_escape = false;
loop {
if in_escape {
let (remain, _) = alt((recognize(one_of(r#""n\\"#)), digit1))(remaining)?;
remaining = remain;
in_escape = false;
} else {
let end_quote = tag::<_, _, nom::error::Error<_>>(r#"""#)(remaining);
if end_quote.is_ok() {
break;
}
let escape_backslash = tag::<_, _, nom::error::Error<_>>("\\")(remaining);
if let Ok((remain, _)) = escape_backslash {
remaining = remain;
in_escape = true;
continue;
}
let (remain, _) = anychar(remaining)?;
remaining = remain;
}
}
let (remaining, _) = tag(r#"""#)(remaining)?;
let source = get_consumed(input, remaining);
Ok((remaining, Token::Atom(source.into())))