use std::borrow::Cow; use std::collections::HashMap; use nom::branch::alt; use nom::bytes::complete::tag; use nom::bytes::complete::take_till1; use nom::character::complete::anychar; use nom::character::complete::digit1; use nom::character::complete::multispace0; use nom::character::complete::multispace1; use nom::character::complete::one_of; use nom::combinator::map; use nom::combinator::not; use nom::combinator::opt; use nom::combinator::peek; use nom::combinator::recognize; use nom::multi::separated_list1; use nom::sequence::delimited; use nom::sequence::preceded; use nom::sequence::tuple; use crate::error::Res; const MAX_OCTAL_LENGTH: usize = 3; #[derive(Debug)] pub enum Token<'s> { Atom(&'s str), List(Vec>), TextWithProperties(TextWithProperties<'s>), Vector(Vec>), } #[derive(Debug)] pub struct TextWithProperties<'s> { pub(crate) text: &'s str, pub(crate) properties: Vec>, } impl<'s> Token<'s> { pub(crate) fn as_vector<'p>( &'p self, ) -> Result<&'p Vec>, Box> { Ok(match self { Token::Vector(children) => Ok(children), _ => Err(format!("wrong token type, expected vector: {:?}", self)), }?) } pub(crate) fn as_list<'p>(&'p self) -> Result<&'p Vec>, Box> { Ok(match self { Token::List(children) => Ok(children), _ => Err(format!("wrong token type, expected list: {:?}", self)), }?) } pub(crate) fn as_atom<'p>(&'p self) -> Result<&'s str, Box> { Ok(match self { Token::Atom(body) => Ok(*body), _ => Err(format!("wrong token type, expected atom: {:?}", self)), }?) } pub(crate) fn as_text<'p>( &'p self, ) -> Result<&'p TextWithProperties<'s>, Box> { Ok(match self { Token::TextWithProperties(body) => Ok(body), _ => Err(format!("wrong token type, expected text: {:?}", self)), }?) } pub(crate) fn as_map<'p>( &'p self, ) -> Result>, Box> { let mut hashmap = HashMap::new(); let children = self.as_list()?; if children.len() % 2 != 0 { return Err("Expecting an even number of children".into()); } let mut key: Option<&str> = None; for child in children.iter() { match key { None => { key = Some(child.as_atom()?); } Some(key_val) => { key = None; hashmap.insert(key_val, child); } }; } Ok(hashmap) } } /// Check if the child string slice is a slice of the parent string slice. fn is_slice_of(parent: &str, child: &str) -> bool { let parent_start = parent.as_ptr() as usize; let parent_end = parent_start + parent.len(); let child_start = child.as_ptr() as usize; let child_end = child_start + child.len(); child_start >= parent_start && child_end <= parent_end } /// Get a slice of the string that was consumed in a parser using the original input to the parser and the remaining input after the parser. fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str { debug_assert!(is_slice_of(input, remaining)); let offset = remaining.as_ptr() as usize - input.as_ptr() as usize; &input[..offset] } #[derive(Debug)] enum UnquoteState { Normal, Escape, HasEscape { out: Vec, }, HasEscapeEscape { out: Vec, }, Octal { octal_begin_offset: usize, octal: Vec, }, HasEscapeOctal { out: Vec, octal: Vec, }, } pub(crate) fn unquote(text: &str) -> Result, Box> { if !text.starts_with('"') { return Err("Quoted text does not start with quote.".into()); } if !text.ends_with('"') { return Err("Quoted text does not end with quote.".into()); } let interior_text = &text[1..(text.len() - 1)]; let mut state = UnquoteState::Normal; for (offset, current_char) in interior_text.bytes().enumerate() { // Check to see if octal finished state = match (state, current_char) { ( UnquoteState::Octal { octal_begin_offset, octal, }, b'0'..=b'7', ) if octal.len() < MAX_OCTAL_LENGTH => UnquoteState::Octal { octal_begin_offset, octal, }, ( UnquoteState::Octal { octal_begin_offset, octal, }, _, ) => { let octal_number_string = String::from_utf8(octal)?; let decoded_byte = u8::from_str_radix(&octal_number_string, 8)?; let mut out: Vec = Vec::with_capacity(interior_text.len()); out.extend_from_slice(&interior_text.as_bytes()[..octal_begin_offset]); out.push(decoded_byte); UnquoteState::HasEscape { out } } (UnquoteState::HasEscapeOctal { out, octal }, b'0'..=b'7') if octal.len() < MAX_OCTAL_LENGTH => { UnquoteState::HasEscapeOctal { out, octal } } (UnquoteState::HasEscapeOctal { mut out, octal }, _) => { let octal_number_string = String::from_utf8(octal)?; let decoded_byte = u8::from_str_radix(&octal_number_string, 8)?; out.push(decoded_byte); UnquoteState::HasEscape { out } } (state, _) => state, }; state = match (state, current_char) { (UnquoteState::Normal, b'\\') => UnquoteState::Escape, (UnquoteState::Normal, _) => UnquoteState::Normal, (UnquoteState::HasEscape { out }, b'\\') => UnquoteState::HasEscapeEscape { out }, (UnquoteState::HasEscape { mut out }, _) => { out.push(current_char); UnquoteState::HasEscape { out } } (UnquoteState::Escape, b'n') => { let mut out: Vec = Vec::with_capacity(interior_text.len()); // Subtract 1 from offset to account for backslash. out.extend_from_slice(&interior_text.as_bytes()[..(offset - 1)]); out.push(b'\n'); UnquoteState::HasEscape { out } } (UnquoteState::HasEscapeEscape { mut out }, b'n') => { out.push(b'\n'); UnquoteState::HasEscape { out } } (UnquoteState::Escape, b'\\') => { let mut out: Vec = Vec::with_capacity(interior_text.len()); // Subtract 1 from offset to account for backslash. out.extend_from_slice(&interior_text.as_bytes()[..(offset - 1)]); out.push(b'\\'); UnquoteState::HasEscape { out } } (UnquoteState::HasEscapeEscape { mut out }, b'\\') => { out.push(b'\\'); UnquoteState::HasEscape { out } } (UnquoteState::Escape, b'"') => { let mut out: Vec = Vec::with_capacity(interior_text.len()); // Subtract 1 from offset to account for backslash. out.extend_from_slice(&interior_text.as_bytes()[..(offset - 1)]); out.push(b'"'); UnquoteState::HasEscape { out } } (UnquoteState::HasEscapeEscape { mut out }, b'"') => { out.push(b'"'); UnquoteState::HasEscape { out } } (UnquoteState::Escape, b'0'..=b'7') => { let mut octal = Vec::with_capacity(MAX_OCTAL_LENGTH); octal.push(current_char); // Substract 1 from offset to account for backslash UnquoteState::Octal { octal_begin_offset: offset - 1, octal, } } (UnquoteState::HasEscapeEscape { out }, b'0'..=b'7') => { let mut octal = Vec::with_capacity(MAX_OCTAL_LENGTH); octal.push(current_char); // Substract 1 from offset to account for backslash UnquoteState::HasEscapeOctal { out, octal } } ( UnquoteState::Octal { octal_begin_offset, mut octal, }, b'0'..=b'7', ) => { octal.push(current_char); UnquoteState::Octal { octal_begin_offset, octal, } } (UnquoteState::HasEscapeOctal { out, mut octal }, b'0'..=b'7') => { octal.push(current_char); UnquoteState::HasEscapeOctal { out, octal } } (state, _) => panic!( "Invalid state unquoting string: {:?} | {} | {:?}", state, offset, interior_text ), }; } match state { UnquoteState::Normal | UnquoteState::Escape | UnquoteState::Octal { .. } => { Ok(Cow::Borrowed(interior_text)) } UnquoteState::HasEscape { out } => Ok(Cow::Owned(String::from_utf8(out)?)), UnquoteState::HasEscapeEscape { mut out } => { out.push(b'\\'); Ok(Cow::Owned(String::from_utf8(out)?)) } UnquoteState::HasEscapeOctal { mut out, octal } => { out.push(b'\\'); out.extend(octal); Ok(Cow::Owned(String::from_utf8(out)?)) } } } #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] pub fn sexp<'s>(input: &'s str) -> Res<&'s str, Token<'s>> { let (remaining, _) = multispace0(input)?; let (remaining, tkn) = token(remaining).map(|(rem, out)| (Into::<&str>::into(rem), out))?; let (remaining, _) = multispace0(remaining)?; Ok((remaining, tkn)) } #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] fn token<'s>(input: &'s str) -> Res<&'s str, Token<'s>> { alt((list, vector, atom))(input) } #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] fn list<'s>(input: &'s str) -> Res<&'s str, Token<'s>> { let (remaining, _) = tag("(")(input)?; let (remaining, children) = delimited( multispace0, separated_list1(multispace1, token), multispace0, )(remaining)?; let (remaining, _) = tag(")")(remaining)?; Ok((remaining, Token::List(children))) } #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] fn vector<'s>(input: &'s str) -> Res<&'s str, Token<'s>> { let (remaining, _) = tag("[")(input)?; let (remaining, children) = delimited( multispace0, separated_list1(multispace1, token), multispace0, )(remaining)?; let (remaining, _) = tag("]")(remaining)?; Ok((remaining, Token::Vector(children))) } #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] fn atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> { not(peek(one_of(")]")))(input)?; alt(( text_with_properties, hash_notation, quoted_atom, unquoted_atom, ))(input) } #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] fn unquoted_atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> { let (remaining, body) = take_till1(|c| matches!(c, ' ' | '\t' | '\r' | '\n' | ')' | ']'))(input)?; Ok((remaining, Token::Atom(body))) } #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] fn quoted_atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> { let (mut remaining, _) = tag(r#"""#)(input)?; let mut in_escape = false; loop { if in_escape { let (remain, _) = alt((recognize(one_of(r#""n\\"#)), digit1))(remaining)?; remaining = remain; in_escape = false; } else { let end_quote = tag::<_, _, nom::error::Error<_>>(r#"""#)(remaining); if end_quote.is_ok() { break; } let escape_backslash = tag::<_, _, nom::error::Error<_>>("\\")(remaining); if let Ok((remain, _)) = escape_backslash { remaining = remain; in_escape = true; continue; } let (remain, _) = anychar(remaining)?; remaining = remain; } } let (remaining, _) = tag(r#"""#)(remaining)?; let source = get_consumed(input, remaining); Ok((remaining, Token::Atom(source))) } #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] fn hash_notation<'s>(input: &'s str) -> Res<&'s str, Token<'s>> { let (remaining, _) = tag("#<")(input)?; let (remaining, _body) = take_till1(|c| matches!(c, '>'))(remaining)?; let (remaining, _) = tag(">")(remaining)?; let source = get_consumed(input, remaining); Ok((remaining, Token::Atom(source))) } fn text_with_properties(input: &str) -> Res<&str, Token<'_>> { let (remaining, _) = tag("#(")(input)?; let (remaining, (text, props)) = delimited( multispace0, tuple(( map(quoted_atom, |atom| match atom { Token::Atom(body) => body, _ => unreachable!(), }), preceded(multispace1, opt(separated_list1(multispace1, token))), )), multispace0, )(remaining)?; let (remaining, _) = tag(")")(remaining)?; Ok(( remaining, Token::TextWithProperties(TextWithProperties { text, properties: props.unwrap_or(Vec::new()), }), )) } #[cfg(test)] mod tests { use super::*; #[test] fn simple() { let input = " (foo bar baz ) "; let (remaining, parsed) = sexp(input).expect("Parse the input"); assert_eq!(remaining, ""); assert!(match parsed { Token::Atom(_) => false, Token::List(_) => true, Token::TextWithProperties(_) => false, Token::Vector(_) => false, }); } #[test] fn quoted() { let input = r#" ("foo" bar baz ) "#; let (remaining, parsed) = sexp(input).expect("Parse the input"); assert_eq!(remaining, ""); assert!(match parsed { Token::Atom(_) => false, Token::List(_) => true, Token::TextWithProperties(_) => false, Token::Vector(_) => false, }); let children = match parsed { Token::List(children) => children, _ => panic!("Should be a list."), }; assert_eq!( match children.first() { Some(Token::Atom(body)) => *body, _ => panic!("First child should be an atom."), }, r#""foo""# ) } #[test] fn quoted_containing_paren() { let input = r#" (foo "b(a)r" baz ) "#; let (remaining, parsed) = sexp(input).expect("Parse the input"); assert_eq!(remaining, ""); assert!(matches!(parsed, Token::List(_))); let children = match parsed { Token::List(children) => children, _ => panic!("Should be a list."), }; assert_eq!( match children.first() { Some(Token::Atom(body)) => *body, _ => panic!("First child should be an atom."), }, r#"foo"# ); assert_eq!( match children.get(1) { Some(Token::Atom(body)) => *body, _ => panic!("Second child should be an atom."), }, r#""b(a)r""# ); assert_eq!( match children.get(2) { Some(Token::Atom(body)) => *body, _ => panic!("Third child should be an atom."), }, r#"baz"# ); } #[test] fn string_containing_escaped_characters() { let input = r#" (foo "\\( x=2 \\)" bar) "#; let (remaining, parsed) = sexp(input).expect("Parse the input"); assert_eq!(remaining, ""); assert!(match parsed { Token::Atom(_) => false, Token::List(_) => true, Token::TextWithProperties(_) => false, Token::Vector(_) => false, }); let children = match parsed { Token::List(children) => children, _ => panic!("Should be a list."), }; assert_eq!( match children.get(1) { Some(Token::Atom(body)) => *body, _ => panic!("First child should be an atom."), }, r#""\\( x=2 \\)""# ) } }