organic/src/compare/sexp.rs
Tom Alexander acfc5e5e68
All checks were successful
rust-build Build rust-build has succeeded
rust-foreign-document-test Build rust-foreign-document-test has succeeded
rust-test Build rust-test has succeeded
clippy Build clippy has succeeded
rustfmt Build rustfmt has succeeded
Only allocate memory when unquoting sexp string that contains escapes.
If the quoted string contains no escape sequences, then unquoting the string can be done by simply shaving off the leading and trailing quotation marks which can be a slice operation. By returning Cow, we can return either a borrowed slice or an owned String.
2023-10-20 12:53:27 -04:00

498 lines
17 KiB
Rust

use std::borrow::Cow;
use std::collections::HashMap;
use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::bytes::complete::take_till1;
use nom::character::complete::anychar;
use nom::character::complete::digit1;
use nom::character::complete::multispace0;
use nom::character::complete::multispace1;
use nom::character::complete::one_of;
use nom::combinator::map;
use nom::combinator::not;
use nom::combinator::opt;
use nom::combinator::peek;
use nom::combinator::recognize;
use nom::multi::separated_list1;
use nom::sequence::delimited;
use nom::sequence::preceded;
use nom::sequence::tuple;
use crate::error::Res;
const MAX_OCTAL_LENGTH: usize = 3;
#[derive(Debug)]
pub enum Token<'s> {
Atom(&'s str),
List(Vec<Token<'s>>),
TextWithProperties(TextWithProperties<'s>),
Vector(Vec<Token<'s>>),
}
#[derive(Debug)]
pub struct TextWithProperties<'s> {
pub(crate) text: &'s str,
pub(crate) properties: Vec<Token<'s>>,
}
impl<'s> Token<'s> {
pub(crate) fn as_vector<'p>(
&'p self,
) -> Result<&'p Vec<Token<'s>>, Box<dyn std::error::Error>> {
Ok(match self {
Token::Vector(children) => Ok(children),
_ => Err(format!("wrong token type, expected vector: {:?}", self)),
}?)
}
pub(crate) fn as_list<'p>(&'p self) -> Result<&'p Vec<Token<'s>>, Box<dyn std::error::Error>> {
Ok(match self {
Token::List(children) => Ok(children),
_ => Err(format!("wrong token type, expected list: {:?}", self)),
}?)
}
pub(crate) fn as_atom<'p>(&'p self) -> Result<&'s str, Box<dyn std::error::Error>> {
Ok(match self {
Token::Atom(body) => Ok(*body),
_ => Err(format!("wrong token type, expected atom: {:?}", self)),
}?)
}
pub(crate) fn as_text<'p>(
&'p self,
) -> Result<&'p TextWithProperties<'s>, Box<dyn std::error::Error>> {
Ok(match self {
Token::TextWithProperties(body) => Ok(body),
_ => Err(format!("wrong token type, expected text: {:?}", self)),
}?)
}
pub(crate) fn as_map<'p>(
&'p self,
) -> Result<HashMap<&'s str, &'p Token<'s>>, Box<dyn std::error::Error>> {
let mut hashmap = HashMap::new();
let children = self.as_list()?;
if children.len() % 2 != 0 {
return Err("Expecting an even number of children".into());
}
let mut key: Option<&str> = None;
for child in children.iter() {
match key {
None => {
key = Some(child.as_atom()?);
}
Some(key_val) => {
key = None;
hashmap.insert(key_val, child);
}
};
}
Ok(hashmap)
}
}
/// Check if the child string slice is a slice of the parent string slice.
fn is_slice_of(parent: &str, child: &str) -> bool {
let parent_start = parent.as_ptr() as usize;
let parent_end = parent_start + parent.len();
let child_start = child.as_ptr() as usize;
let child_end = child_start + child.len();
child_start >= parent_start && child_end <= parent_end
}
/// Get a slice of the string that was consumed in a parser using the original input to the parser and the remaining input after the parser.
fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str {
debug_assert!(is_slice_of(input, remaining));
let offset = remaining.as_ptr() as usize - input.as_ptr() as usize;
&input[..offset]
}
#[derive(Debug)]
enum UnquoteState {
Normal,
Escape,
HasEscape {
out: Vec<u8>,
},
HasEscapeEscape {
out: Vec<u8>,
},
Octal {
octal_begin_offset: usize,
octal: Vec<u8>,
},
HasEscapeOctal {
out: Vec<u8>,
octal: Vec<u8>,
},
}
pub(crate) fn unquote(text: &str) -> Result<Cow<'_, str>, Box<dyn std::error::Error>> {
if !text.starts_with('"') {
return Err("Quoted text does not start with quote.".into());
}
if !text.ends_with('"') {
return Err("Quoted text does not end with quote.".into());
}
let interior_text = &text[1..(text.len() - 1)];
let mut state = UnquoteState::Normal;
for (offset, current_char) in interior_text.bytes().enumerate() {
// Check to see if octal finished
state = match (state, current_char) {
(
UnquoteState::Octal {
octal_begin_offset,
octal,
},
b'0'..=b'7',
) if octal.len() < MAX_OCTAL_LENGTH => UnquoteState::Octal {
octal_begin_offset,
octal,
},
(
UnquoteState::Octal {
octal_begin_offset,
octal,
},
_,
) => {
let octal_number_string = String::from_utf8(octal)?;
let decoded_byte = u8::from_str_radix(&octal_number_string, 8)?;
let mut out: Vec<u8> = Vec::with_capacity(interior_text.len());
out.extend_from_slice(&interior_text.as_bytes()[..octal_begin_offset]);
out.push(decoded_byte);
UnquoteState::HasEscape { out }
}
(UnquoteState::HasEscapeOctal { out, octal }, b'0'..=b'7')
if octal.len() < MAX_OCTAL_LENGTH =>
{
UnquoteState::HasEscapeOctal { out, octal }
}
(UnquoteState::HasEscapeOctal { mut out, octal }, _) => {
let octal_number_string = String::from_utf8(octal)?;
let decoded_byte = u8::from_str_radix(&octal_number_string, 8)?;
out.push(decoded_byte);
UnquoteState::HasEscape { out }
}
(state, _) => state,
};
state = match (state, current_char) {
(UnquoteState::Normal, b'\\') => UnquoteState::Escape,
(UnquoteState::Normal, _) => UnquoteState::Normal,
(UnquoteState::HasEscape { out }, b'\\') => UnquoteState::HasEscapeEscape { out },
(UnquoteState::HasEscape { mut out }, _) => {
out.push(current_char);
UnquoteState::HasEscape { out }
}
(UnquoteState::Escape, b'n') => {
let mut out: Vec<u8> = Vec::with_capacity(interior_text.len());
// Subtract 1 from offset to account for backslash.
out.extend_from_slice(&interior_text.as_bytes()[..(offset - 1)]);
out.push(b'\n');
UnquoteState::HasEscape { out }
}
(UnquoteState::HasEscapeEscape { mut out }, b'n') => {
out.push(b'\n');
UnquoteState::HasEscape { out }
}
(UnquoteState::Escape, b'\\') => {
let mut out: Vec<u8> = Vec::with_capacity(interior_text.len());
// Subtract 1 from offset to account for backslash.
out.extend_from_slice(&interior_text.as_bytes()[..(offset - 1)]);
out.push(b'\\');
UnquoteState::HasEscape { out }
}
(UnquoteState::HasEscapeEscape { mut out }, b'\\') => {
out.push(b'\\');
UnquoteState::HasEscape { out }
}
(UnquoteState::Escape, b'"') => {
let mut out: Vec<u8> = Vec::with_capacity(interior_text.len());
// Subtract 1 from offset to account for backslash.
out.extend_from_slice(&interior_text.as_bytes()[..(offset - 1)]);
out.push(b'"');
UnquoteState::HasEscape { out }
}
(UnquoteState::HasEscapeEscape { mut out }, b'"') => {
out.push(b'"');
UnquoteState::HasEscape { out }
}
(UnquoteState::Escape, b'0'..=b'7') => {
let mut octal = Vec::with_capacity(MAX_OCTAL_LENGTH);
octal.push(current_char);
// Substract 1 from offset to account for backslash
UnquoteState::Octal {
octal_begin_offset: offset - 1,
octal,
}
}
(UnquoteState::HasEscapeEscape { out }, b'0'..=b'7') => {
let mut octal = Vec::with_capacity(MAX_OCTAL_LENGTH);
octal.push(current_char);
// Substract 1 from offset to account for backslash
UnquoteState::HasEscapeOctal { out, octal }
}
(
UnquoteState::Octal {
octal_begin_offset,
mut octal,
},
b'0'..=b'7',
) => {
octal.push(current_char);
UnquoteState::Octal {
octal_begin_offset,
octal,
}
}
(UnquoteState::HasEscapeOctal { out, mut octal }, b'0'..=b'7') => {
octal.push(current_char);
UnquoteState::HasEscapeOctal { out, octal }
}
(state, _) => panic!(
"Invalid state unquoting string: {:?} | {} | {:?}",
state, offset, interior_text
),
};
}
match state {
UnquoteState::Normal | UnquoteState::Escape | UnquoteState::Octal { .. } => {
Ok(Cow::Borrowed(interior_text))
}
UnquoteState::HasEscape { out } => Ok(Cow::Owned(String::from_utf8(out)?)),
UnquoteState::HasEscapeEscape { mut out } => {
out.push(b'\\');
Ok(Cow::Owned(String::from_utf8(out)?))
}
UnquoteState::HasEscapeOctal { mut out, octal } => {
out.push(b'\\');
out.extend(octal);
Ok(Cow::Owned(String::from_utf8(out)?))
}
}
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
pub fn sexp<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
let (remaining, _) = multispace0(input)?;
let (remaining, tkn) = token(remaining).map(|(rem, out)| (Into::<&str>::into(rem), out))?;
let (remaining, _) = multispace0(remaining)?;
Ok((remaining, tkn))
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn token<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
alt((list, vector, atom))(input)
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn list<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
let (remaining, _) = tag("(")(input)?;
let (remaining, children) = delimited(
multispace0,
separated_list1(multispace1, token),
multispace0,
)(remaining)?;
let (remaining, _) = tag(")")(remaining)?;
Ok((remaining, Token::List(children)))
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn vector<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
let (remaining, _) = tag("[")(input)?;
let (remaining, children) = delimited(
multispace0,
separated_list1(multispace1, token),
multispace0,
)(remaining)?;
let (remaining, _) = tag("]")(remaining)?;
Ok((remaining, Token::Vector(children)))
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
not(peek(one_of(")]")))(input)?;
alt((
text_with_properties,
hash_notation,
quoted_atom,
unquoted_atom,
))(input)
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn unquoted_atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
let (remaining, body) =
take_till1(|c| matches!(c, ' ' | '\t' | '\r' | '\n' | ')' | ']'))(input)?;
Ok((remaining, Token::Atom(body)))
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn quoted_atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
let (mut remaining, _) = tag(r#"""#)(input)?;
let mut in_escape = false;
loop {
if in_escape {
let (remain, _) = alt((recognize(one_of(r#""n\\"#)), digit1))(remaining)?;
remaining = remain;
in_escape = false;
} else {
let end_quote = tag::<_, _, nom::error::Error<_>>(r#"""#)(remaining);
if end_quote.is_ok() {
break;
}
let escape_backslash = tag::<_, _, nom::error::Error<_>>("\\")(remaining);
if let Ok((remain, _)) = escape_backslash {
remaining = remain;
in_escape = true;
continue;
}
let (remain, _) = anychar(remaining)?;
remaining = remain;
}
}
let (remaining, _) = tag(r#"""#)(remaining)?;
let source = get_consumed(input, remaining);
Ok((remaining, Token::Atom(source)))
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn hash_notation<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
let (remaining, _) = tag("#<")(input)?;
let (remaining, _body) = take_till1(|c| matches!(c, '>'))(remaining)?;
let (remaining, _) = tag(">")(remaining)?;
let source = get_consumed(input, remaining);
Ok((remaining, Token::Atom(source)))
}
fn text_with_properties(input: &str) -> Res<&str, Token<'_>> {
let (remaining, _) = tag("#(")(input)?;
let (remaining, (text, props)) = delimited(
multispace0,
tuple((
map(quoted_atom, |atom| match atom {
Token::Atom(body) => body,
_ => unreachable!(),
}),
preceded(multispace1, opt(separated_list1(multispace1, token))),
)),
multispace0,
)(remaining)?;
let (remaining, _) = tag(")")(remaining)?;
Ok((
remaining,
Token::TextWithProperties(TextWithProperties {
text,
properties: props.unwrap_or(Vec::new()),
}),
))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn simple() {
let input = " (foo bar baz ) ";
let (remaining, parsed) = sexp(input).expect("Parse the input");
assert_eq!(remaining, "");
assert!(match parsed {
Token::Atom(_) => false,
Token::List(_) => true,
Token::TextWithProperties(_) => false,
Token::Vector(_) => false,
});
}
#[test]
fn quoted() {
let input = r#" ("foo" bar baz ) "#;
let (remaining, parsed) = sexp(input).expect("Parse the input");
assert_eq!(remaining, "");
assert!(match parsed {
Token::Atom(_) => false,
Token::List(_) => true,
Token::TextWithProperties(_) => false,
Token::Vector(_) => false,
});
let children = match parsed {
Token::List(children) => children,
_ => panic!("Should be a list."),
};
assert_eq!(
match children.first() {
Some(Token::Atom(body)) => *body,
_ => panic!("First child should be an atom."),
},
r#""foo""#
)
}
#[test]
fn quoted_containing_paren() {
let input = r#" (foo "b(a)r" baz ) "#;
let (remaining, parsed) = sexp(input).expect("Parse the input");
assert_eq!(remaining, "");
assert!(matches!(parsed, Token::List(_)));
let children = match parsed {
Token::List(children) => children,
_ => panic!("Should be a list."),
};
assert_eq!(
match children.first() {
Some(Token::Atom(body)) => *body,
_ => panic!("First child should be an atom."),
},
r#"foo"#
);
assert_eq!(
match children.get(1) {
Some(Token::Atom(body)) => *body,
_ => panic!("Second child should be an atom."),
},
r#""b(a)r""#
);
assert_eq!(
match children.get(2) {
Some(Token::Atom(body)) => *body,
_ => panic!("Third child should be an atom."),
},
r#"baz"#
);
}
#[test]
fn string_containing_escaped_characters() {
let input = r#" (foo "\\( x=2 \\)" bar) "#;
let (remaining, parsed) = sexp(input).expect("Parse the input");
assert_eq!(remaining, "");
assert!(match parsed {
Token::Atom(_) => false,
Token::List(_) => true,
Token::TextWithProperties(_) => false,
Token::Vector(_) => false,
});
let children = match parsed {
Token::List(children) => children,
_ => panic!("Should be a list."),
};
assert_eq!(
match children.get(1) {
Some(Token::Atom(body)) => *body,
_ => panic!("First child should be an atom."),
},
r#""\\( x=2 \\)""#
)
}
}