data:image/s3,"s3://crabby-images/f2560/f2560a3f9d6525e5deaccb1a32431d186733536e" alt="Tom Alexander"
If the quoted string contains no escape sequences, then unquoting the string can be done by simply shaving off the leading and trailing quotation marks which can be a slice operation. By returning Cow, we can return either a borrowed slice or an owned String.
498 lines
17 KiB
Rust
498 lines
17 KiB
Rust
use std::borrow::Cow;
|
|
use std::collections::HashMap;
|
|
|
|
use nom::branch::alt;
|
|
use nom::bytes::complete::tag;
|
|
use nom::bytes::complete::take_till1;
|
|
use nom::character::complete::anychar;
|
|
use nom::character::complete::digit1;
|
|
use nom::character::complete::multispace0;
|
|
use nom::character::complete::multispace1;
|
|
use nom::character::complete::one_of;
|
|
use nom::combinator::map;
|
|
use nom::combinator::not;
|
|
use nom::combinator::opt;
|
|
use nom::combinator::peek;
|
|
use nom::combinator::recognize;
|
|
use nom::multi::separated_list1;
|
|
use nom::sequence::delimited;
|
|
use nom::sequence::preceded;
|
|
use nom::sequence::tuple;
|
|
|
|
use crate::error::Res;
|
|
|
|
/// Maximum number of digits `unquote` collects for a single octal escape sequence (e.g. `\101`).
const MAX_OCTAL_LENGTH: usize = 3;
|
|
|
|
/// A node of a parsed s-expression.
///
/// Every variant borrows from the source text (`'s`); parsing never copies string data.
#[derive(Debug)]
pub enum Token<'s> {
    /// An atom, stored as the exact source slice. Quoted atoms keep their surrounding quotes
    /// and any escape sequences undecoded.
    Atom(&'s str),
    /// The children of a parenthesised `( ... )` form.
    List(Vec<Token<'s>>),
    /// A `#("text" properties...)` form.
    TextWithProperties(TextWithProperties<'s>),
    /// The children of a bracketed `[ ... ]` form.
    Vector(Vec<Token<'s>>),
}

/// The payload of a `#("text" properties...)` form.
#[derive(Debug)]
pub struct TextWithProperties<'s> {
    /// The quoted text exactly as written in the source, quotes included.
    pub(crate) text: &'s str,
    /// The tokens following the text; empty when none were present.
    pub(crate) properties: Vec<Token<'s>>,
}

impl<'s> Token<'s> {
    /// Borrow the children of a [`Token::Vector`].
    ///
    /// # Errors
    ///
    /// Returns an error containing the debug representation of `self` for any other variant.
    pub(crate) fn as_vector<'p>(
        &'p self,
    ) -> Result<&'p Vec<Token<'s>>, Box<dyn std::error::Error>> {
        if let Token::Vector(children) = self {
            return Ok(children);
        }
        Err(format!("wrong token type, expected vector: {:?}", self).into())
    }

    /// Borrow the children of a [`Token::List`].
    ///
    /// # Errors
    ///
    /// Returns an error containing the debug representation of `self` for any other variant.
    pub(crate) fn as_list<'p>(&'p self) -> Result<&'p Vec<Token<'s>>, Box<dyn std::error::Error>> {
        if let Token::List(children) = self {
            return Ok(children);
        }
        Err(format!("wrong token type, expected list: {:?}", self).into())
    }

    /// Return the source slice held by a [`Token::Atom`].
    ///
    /// # Errors
    ///
    /// Returns an error containing the debug representation of `self` for any other variant.
    pub(crate) fn as_atom<'p>(&'p self) -> Result<&'s str, Box<dyn std::error::Error>> {
        if let Token::Atom(body) = self {
            return Ok(*body);
        }
        Err(format!("wrong token type, expected atom: {:?}", self).into())
    }

    /// Borrow the payload of a [`Token::TextWithProperties`].
    ///
    /// # Errors
    ///
    /// Returns an error containing the debug representation of `self` for any other variant.
    pub(crate) fn as_text<'p>(
        &'p self,
    ) -> Result<&'p TextWithProperties<'s>, Box<dyn std::error::Error>> {
        if let Token::TextWithProperties(body) = self {
            return Ok(body);
        }
        Err(format!("wrong token type, expected text: {:?}", self).into())
    }

    /// Interpret a list as alternating key/value children and collect them into a map.
    ///
    /// Keys are the source slices of the atoms at even positions; each value is the token that
    /// follows its key. When a key occurs more than once, the last value wins.
    ///
    /// # Errors
    ///
    /// Fails when `self` is not a list, when the list has an odd number of children, or when
    /// a child in key position is not an atom.
    pub(crate) fn as_map<'p>(
        &'p self,
    ) -> Result<HashMap<&'s str, &'p Token<'s>>, Box<dyn std::error::Error>> {
        let children = self.as_list()?;
        if children.len() % 2 != 0 {
            return Err("Expecting an even number of children".into());
        }

        let mut entries = HashMap::new();
        // The length is even, so every chunk holds exactly one key followed by one value.
        for pair in children.chunks_exact(2) {
            entries.insert(pair[0].as_atom()?, &pair[1]);
        }
        Ok(entries)
    }
}
|
|
|
|
/// Report whether `child` lies entirely within the memory occupied by `parent`.
///
/// The comparison is made on addresses only; the contents of the strings are never inspected.
fn is_slice_of(parent: &str, child: &str) -> bool {
    let outer = parent.as_bytes().as_ptr_range();
    let inner = child.as_bytes().as_ptr_range();
    outer.start <= inner.start && inner.end <= outer.end
}
|
|
|
|
/// Get a slice of the string that was consumed in a parser using the original input to the parser and the remaining input after the parser.
|
|
fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str {
|
|
debug_assert!(is_slice_of(input, remaining));
|
|
let offset = remaining.as_ptr() as usize - input.as_ptr() as usize;
|
|
&input[..offset]
|
|
}
|
|
|
|
/// State of the byte-wise scanner in `unquote`.
///
/// The states come in pairs: the first of each pair is used while no escape sequence has been
/// decoded yet (so the input could still be returned as-is), the `HasEscape*` counterpart is
/// used once decoded output is being accumulated in `out`.
#[derive(Debug)]
enum UnquoteState {
    /// Plain text; no escape sequence has been decoded so far.
    Normal,
    /// A backslash was just read; no escape sequence has been decoded so far.
    Escape,
    /// Plain text after at least one decoded escape sequence.
    HasEscape {
        /// Decoded bytes accumulated so far.
        out: Vec<u8>,
    },
    /// A backslash was just read after at least one decoded escape sequence.
    HasEscapeEscape {
        /// Decoded bytes accumulated so far.
        out: Vec<u8>,
    },
    /// Collecting the digits of an octal escape that is the first escape in the text.
    Octal {
        /// Byte offset, within the unquoted text, of the backslash that started the escape.
        octal_begin_offset: usize,
        /// The ASCII octal digits collected so far.
        octal: Vec<u8>,
    },
    /// Collecting the digits of an octal escape after at least one decoded escape sequence.
    HasEscapeOctal {
        /// Decoded bytes accumulated so far.
        out: Vec<u8>,
        /// The ASCII octal digits collected so far.
        octal: Vec<u8>,
    },
}
|
|
|
|
pub(crate) fn unquote(text: &str) -> Result<Cow<'_, str>, Box<dyn std::error::Error>> {
|
|
if !text.starts_with('"') {
|
|
return Err("Quoted text does not start with quote.".into());
|
|
}
|
|
if !text.ends_with('"') {
|
|
return Err("Quoted text does not end with quote.".into());
|
|
}
|
|
let interior_text = &text[1..(text.len() - 1)];
|
|
let mut state = UnquoteState::Normal;
|
|
for (offset, current_char) in interior_text.bytes().enumerate() {
|
|
// Check to see if octal finished
|
|
state = match (state, current_char) {
|
|
(
|
|
UnquoteState::Octal {
|
|
octal_begin_offset,
|
|
octal,
|
|
},
|
|
b'0'..=b'7',
|
|
) if octal.len() < MAX_OCTAL_LENGTH => UnquoteState::Octal {
|
|
octal_begin_offset,
|
|
octal,
|
|
},
|
|
(
|
|
UnquoteState::Octal {
|
|
octal_begin_offset,
|
|
octal,
|
|
},
|
|
_,
|
|
) => {
|
|
let octal_number_string = String::from_utf8(octal)?;
|
|
let decoded_byte = u8::from_str_radix(&octal_number_string, 8)?;
|
|
let mut out: Vec<u8> = Vec::with_capacity(interior_text.len());
|
|
out.extend_from_slice(&interior_text.as_bytes()[..octal_begin_offset]);
|
|
out.push(decoded_byte);
|
|
UnquoteState::HasEscape { out }
|
|
}
|
|
(UnquoteState::HasEscapeOctal { out, octal }, b'0'..=b'7')
|
|
if octal.len() < MAX_OCTAL_LENGTH =>
|
|
{
|
|
UnquoteState::HasEscapeOctal { out, octal }
|
|
}
|
|
(UnquoteState::HasEscapeOctal { mut out, octal }, _) => {
|
|
let octal_number_string = String::from_utf8(octal)?;
|
|
let decoded_byte = u8::from_str_radix(&octal_number_string, 8)?;
|
|
out.push(decoded_byte);
|
|
UnquoteState::HasEscape { out }
|
|
}
|
|
(state, _) => state,
|
|
};
|
|
|
|
state = match (state, current_char) {
|
|
(UnquoteState::Normal, b'\\') => UnquoteState::Escape,
|
|
(UnquoteState::Normal, _) => UnquoteState::Normal,
|
|
(UnquoteState::HasEscape { out }, b'\\') => UnquoteState::HasEscapeEscape { out },
|
|
(UnquoteState::HasEscape { mut out }, _) => {
|
|
out.push(current_char);
|
|
UnquoteState::HasEscape { out }
|
|
}
|
|
(UnquoteState::Escape, b'n') => {
|
|
let mut out: Vec<u8> = Vec::with_capacity(interior_text.len());
|
|
// Subtract 1 from offset to account for backslash.
|
|
out.extend_from_slice(&interior_text.as_bytes()[..(offset - 1)]);
|
|
out.push(b'\n');
|
|
UnquoteState::HasEscape { out }
|
|
}
|
|
(UnquoteState::HasEscapeEscape { mut out }, b'n') => {
|
|
out.push(b'\n');
|
|
UnquoteState::HasEscape { out }
|
|
}
|
|
(UnquoteState::Escape, b'\\') => {
|
|
let mut out: Vec<u8> = Vec::with_capacity(interior_text.len());
|
|
// Subtract 1 from offset to account for backslash.
|
|
out.extend_from_slice(&interior_text.as_bytes()[..(offset - 1)]);
|
|
out.push(b'\\');
|
|
UnquoteState::HasEscape { out }
|
|
}
|
|
(UnquoteState::HasEscapeEscape { mut out }, b'\\') => {
|
|
out.push(b'\\');
|
|
UnquoteState::HasEscape { out }
|
|
}
|
|
(UnquoteState::Escape, b'"') => {
|
|
let mut out: Vec<u8> = Vec::with_capacity(interior_text.len());
|
|
// Subtract 1 from offset to account for backslash.
|
|
out.extend_from_slice(&interior_text.as_bytes()[..(offset - 1)]);
|
|
out.push(b'"');
|
|
UnquoteState::HasEscape { out }
|
|
}
|
|
(UnquoteState::HasEscapeEscape { mut out }, b'"') => {
|
|
out.push(b'"');
|
|
UnquoteState::HasEscape { out }
|
|
}
|
|
(UnquoteState::Escape, b'0'..=b'7') => {
|
|
let mut octal = Vec::with_capacity(MAX_OCTAL_LENGTH);
|
|
octal.push(current_char);
|
|
// Substract 1 from offset to account for backslash
|
|
UnquoteState::Octal {
|
|
octal_begin_offset: offset - 1,
|
|
octal,
|
|
}
|
|
}
|
|
(UnquoteState::HasEscapeEscape { out }, b'0'..=b'7') => {
|
|
let mut octal = Vec::with_capacity(MAX_OCTAL_LENGTH);
|
|
octal.push(current_char);
|
|
// Substract 1 from offset to account for backslash
|
|
UnquoteState::HasEscapeOctal { out, octal }
|
|
}
|
|
(
|
|
UnquoteState::Octal {
|
|
octal_begin_offset,
|
|
mut octal,
|
|
},
|
|
b'0'..=b'7',
|
|
) => {
|
|
octal.push(current_char);
|
|
UnquoteState::Octal {
|
|
octal_begin_offset,
|
|
octal,
|
|
}
|
|
}
|
|
(UnquoteState::HasEscapeOctal { out, mut octal }, b'0'..=b'7') => {
|
|
octal.push(current_char);
|
|
UnquoteState::HasEscapeOctal { out, octal }
|
|
}
|
|
(state, _) => panic!(
|
|
"Invalid state unquoting string: {:?} | {} | {:?}",
|
|
state, offset, interior_text
|
|
),
|
|
};
|
|
}
|
|
|
|
match state {
|
|
UnquoteState::Normal | UnquoteState::Escape | UnquoteState::Octal { .. } => {
|
|
Ok(Cow::Borrowed(interior_text))
|
|
}
|
|
UnquoteState::HasEscape { out } => Ok(Cow::Owned(String::from_utf8(out)?)),
|
|
UnquoteState::HasEscapeEscape { mut out } => {
|
|
out.push(b'\\');
|
|
Ok(Cow::Owned(String::from_utf8(out)?))
|
|
}
|
|
UnquoteState::HasEscapeOctal { mut out, octal } => {
|
|
out.push(b'\\');
|
|
out.extend(octal);
|
|
Ok(Cow::Owned(String::from_utf8(out)?))
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
|
|
pub fn sexp<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
|
|
let (remaining, _) = multispace0(input)?;
|
|
let (remaining, tkn) = token(remaining).map(|(rem, out)| (Into::<&str>::into(rem), out))?;
|
|
let (remaining, _) = multispace0(remaining)?;
|
|
Ok((remaining, tkn))
|
|
}
|
|
|
|
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
|
|
fn token<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
|
|
alt((list, vector, atom))(input)
|
|
}
|
|
|
|
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
|
|
fn list<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
|
|
let (remaining, _) = tag("(")(input)?;
|
|
let (remaining, children) = delimited(
|
|
multispace0,
|
|
separated_list1(multispace1, token),
|
|
multispace0,
|
|
)(remaining)?;
|
|
let (remaining, _) = tag(")")(remaining)?;
|
|
Ok((remaining, Token::List(children)))
|
|
}
|
|
|
|
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
|
|
fn vector<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
|
|
let (remaining, _) = tag("[")(input)?;
|
|
let (remaining, children) = delimited(
|
|
multispace0,
|
|
separated_list1(multispace1, token),
|
|
multispace0,
|
|
)(remaining)?;
|
|
let (remaining, _) = tag("]")(remaining)?;
|
|
Ok((remaining, Token::Vector(children)))
|
|
}
|
|
|
|
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
|
|
fn atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
|
|
not(peek(one_of(")]")))(input)?;
|
|
alt((
|
|
text_with_properties,
|
|
hash_notation,
|
|
quoted_atom,
|
|
unquoted_atom,
|
|
))(input)
|
|
}
|
|
|
|
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
|
|
fn unquoted_atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
|
|
let (remaining, body) =
|
|
take_till1(|c| matches!(c, ' ' | '\t' | '\r' | '\n' | ')' | ']'))(input)?;
|
|
Ok((remaining, Token::Atom(body)))
|
|
}
|
|
|
|
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
|
|
fn quoted_atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
|
|
let (mut remaining, _) = tag(r#"""#)(input)?;
|
|
let mut in_escape = false;
|
|
loop {
|
|
if in_escape {
|
|
let (remain, _) = alt((recognize(one_of(r#""n\\"#)), digit1))(remaining)?;
|
|
remaining = remain;
|
|
in_escape = false;
|
|
} else {
|
|
let end_quote = tag::<_, _, nom::error::Error<_>>(r#"""#)(remaining);
|
|
if end_quote.is_ok() {
|
|
break;
|
|
}
|
|
|
|
let escape_backslash = tag::<_, _, nom::error::Error<_>>("\\")(remaining);
|
|
if let Ok((remain, _)) = escape_backslash {
|
|
remaining = remain;
|
|
in_escape = true;
|
|
continue;
|
|
}
|
|
|
|
let (remain, _) = anychar(remaining)?;
|
|
remaining = remain;
|
|
}
|
|
}
|
|
let (remaining, _) = tag(r#"""#)(remaining)?;
|
|
let source = get_consumed(input, remaining);
|
|
Ok((remaining, Token::Atom(source)))
|
|
}
|
|
|
|
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
|
|
fn hash_notation<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
|
|
let (remaining, _) = tag("#<")(input)?;
|
|
let (remaining, _body) = take_till1(|c| matches!(c, '>'))(remaining)?;
|
|
let (remaining, _) = tag(">")(remaining)?;
|
|
let source = get_consumed(input, remaining);
|
|
Ok((remaining, Token::Atom(source)))
|
|
}
|
|
|
|
fn text_with_properties(input: &str) -> Res<&str, Token<'_>> {
|
|
let (remaining, _) = tag("#(")(input)?;
|
|
let (remaining, (text, props)) = delimited(
|
|
multispace0,
|
|
tuple((
|
|
map(quoted_atom, |atom| match atom {
|
|
Token::Atom(body) => body,
|
|
_ => unreachable!(),
|
|
}),
|
|
preceded(multispace1, opt(separated_list1(multispace1, token))),
|
|
)),
|
|
multispace0,
|
|
)(remaining)?;
|
|
let (remaining, _) = tag(")")(remaining)?;
|
|
Ok((
|
|
remaining,
|
|
Token::TextWithProperties(TextWithProperties {
|
|
text,
|
|
properties: props.unwrap_or(Vec::new()),
|
|
}),
|
|
))
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `input` with `sexp`, require that the whole input was consumed and that the
    /// result is a list, and return the list's children.
    fn parse_list(input: &str) -> Vec<Token<'_>> {
        let (remaining, parsed) = sexp(input).expect("Parse the input");
        assert_eq!(remaining, "");
        assert!(matches!(parsed, Token::List(_)));
        match parsed {
            Token::List(children) => children,
            _ => panic!("Should be a list."),
        }
    }

    /// Return the source text of the atom at `index`, failing the test with `message` when
    /// there is no such child or it is not an atom.
    fn atom_at<'s>(children: &[Token<'s>], index: usize, message: &str) -> &'s str {
        match children.get(index) {
            Some(Token::Atom(body)) => *body,
            _ => panic!("{}", message),
        }
    }

    #[test]
    fn simple() {
        parse_list(" (foo bar baz ) ");
    }

    #[test]
    fn quoted() {
        let children = parse_list(r#" ("foo" bar baz ) "#);
        // Quoted atoms keep their quotes.
        assert_eq!(
            atom_at(&children, 0, "First child should be an atom."),
            r#""foo""#
        );
    }

    #[test]
    fn quoted_containing_paren() {
        let children = parse_list(r#" (foo "b(a)r" baz ) "#);
        assert_eq!(
            atom_at(&children, 0, "First child should be an atom."),
            r#"foo"#
        );
        assert_eq!(
            atom_at(&children, 1, "Second child should be an atom."),
            r#""b(a)r""#
        );
        assert_eq!(
            atom_at(&children, 2, "Third child should be an atom."),
            r#"baz"#
        );
    }

    #[test]
    fn string_containing_escaped_characters() {
        let children = parse_list(r#" (foo "\\( x=2 \\)" bar) "#);
        // The parser leaves escape sequences undecoded.
        assert_eq!(
            atom_at(&children, 1, "Second child should be an atom."),
            r#""\\( x=2 \\)""#
        );
    }
}
|