// organic/src/compare/sexp.rs

use std::borrow::Cow;
use std::collections::HashMap;

use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::bytes::complete::take_till1;
use nom::character::complete::anychar;
use nom::character::complete::digit1;
use nom::character::complete::multispace0;
use nom::character::complete::multispace1;
use nom::character::complete::one_of;
use nom::combinator::map;
use nom::combinator::not;
use nom::combinator::opt;
use nom::combinator::peek;
use nom::combinator::recognize;
use nom::multi::separated_list1;
use nom::sequence::delimited;
use nom::sequence::preceded;
use nom::sequence::tuple;

use crate::error::Res;

/// Maximum number of digits `unquote` collects for a single octal escape (for example `\303`).
const MAX_OCTAL_LENGTH: usize = 3;

/// A node of a parsed s-expression.
///
/// All text is borrowed from the string that was handed to the parser.
#[derive(Debug)]
pub enum Token<'s> {
    /// A bare atom, a quoted string (stored with its surrounding quotes and escapes intact), or a
    /// `#<...>` object.
    Atom(&'s str),
    /// The tokens found between `(` and `)`.
    List(Vec<Token<'s>>),
    /// A `#("text" ...)` form: a quoted string followed by further tokens.
    TextWithProperties(TextWithProperties<'s>),
    /// The tokens found between `[` and `]`.
    Vector(Vec<Token<'s>>),
}

/// Payload of a `#("text" ...)` form.
#[derive(Debug)]
pub struct TextWithProperties<'s> {
    /// The quoted string exactly as it appears in the source, including the surrounding quotes.
    pub(crate) text: &'s str,
    /// The tokens that follow the text inside the form (empty when there are none).
    pub(crate) properties: Vec<Token<'s>>,
}

impl<'s> Token<'s> {
    /// Borrow the children of a [`Token::Vector`].
    ///
    /// # Errors
    ///
    /// Returns an error describing the token when it is any other variant.
    pub(crate) fn as_vector<'p>(
        &'p self,
    ) -> Result<&'p Vec<Token<'s>>, Box<dyn std::error::Error>> {
        if let Token::Vector(children) = self {
            return Ok(children);
        }
        Err(format!("wrong token type, expected vector: {:?}", self).into())
    }

    /// Borrow the children of a [`Token::List`].
    ///
    /// # Errors
    ///
    /// Returns an error describing the token when it is any other variant.
    pub(crate) fn as_list<'p>(&'p self) -> Result<&'p Vec<Token<'s>>, Box<dyn std::error::Error>> {
        if let Token::List(children) = self {
            return Ok(children);
        }
        Err(format!("wrong token type, expected list: {:?}", self).into())
    }

    /// Return the source text of a [`Token::Atom`].
    ///
    /// # Errors
    ///
    /// Returns an error describing the token when it is any other variant.
    pub(crate) fn as_atom<'p>(&'p self) -> Result<&'s str, Box<dyn std::error::Error>> {
        if let Token::Atom(body) = self {
            return Ok(body);
        }
        Err(format!("wrong token type, expected atom: {:?}", self).into())
    }

    /// Borrow the payload of a [`Token::TextWithProperties`].
    ///
    /// # Errors
    ///
    /// Returns an error describing the token when it is any other variant.
    pub(crate) fn as_text<'p>(
        &'p self,
    ) -> Result<&'p TextWithProperties<'s>, Box<dyn std::error::Error>> {
        if let Token::TextWithProperties(body) = self {
            return Ok(body);
        }
        Err(format!("wrong token type, expected text: {:?}", self).into())
    }

    /// Read a list whose children alternate between atom keys and arbitrary values into a map.
    ///
    /// When a key occurs more than once, the value of its last occurrence is kept.
    ///
    /// # Errors
    ///
    /// Returns an error when this token is not a list, when the list has an odd number of
    /// children, or when a child in key position is not an atom.
    pub(crate) fn as_map<'p>(
        &'p self,
    ) -> Result<HashMap<&'s str, &'p Token<'s>>, Box<dyn std::error::Error>> {
        let children = self.as_list()?;
        if children.len() % 2 != 0 {
            return Err("Expecting an even number of children".into());
        }

        let mut entries = HashMap::with_capacity(children.len() / 2);
        // The length check above guarantees every chunk holds exactly a key and a value.
        for pair in children.chunks_exact(2) {
            let key = pair[0].as_atom()?;
            entries.insert(key, &pair[1]);
        }
        Ok(entries)
    }
}

/// Report whether `child` lies entirely within the memory occupied by `parent`.
///
/// Only the address ranges of the two slices are compared; their contents are not inspected.
fn is_slice_of(parent: &str, child: &str) -> bool {
    let parent_span = parent.as_bytes().as_ptr_range();
    let child_span = child.as_bytes().as_ptr_range();
    parent_span.start <= child_span.start && child_span.end <= parent_span.end
}

/// Return the prefix of `input` that a parser consumed, given the `remaining` input it handed back.
///
/// `remaining` must be a sub-slice of `input`; this is checked in debug builds only.
fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str {
    debug_assert!(is_slice_of(input, remaining));
    let input_address = input.as_ptr() as usize;
    let remaining_address = remaining.as_ptr() as usize;
    // The distance between the two start addresses is the number of bytes consumed.
    let consumed_len = remaining_address - input_address;
    &input[..consumed_len]
}

/// State of the byte-wise scanner in `unquote`.
///
/// The states without an `out` buffer are used while no escape has been decoded yet, so no
/// output buffer has been allocated. The `HasEscape*` states carry the output built so far.
#[derive(Debug)]
enum UnquoteState {
    /// No escape has been decoded yet and the previous byte did not start one.
    Normal,
    /// The previous byte was a backslash and no output buffer exists yet.
    Escape,
    /// At least one escape has been decoded; `out` holds the unescaped bytes so far.
    HasEscape {
        out: Vec<u8>,
    },
    /// Like `Escape`, but an output buffer already exists.
    HasEscapeEscape {
        out: Vec<u8>,
    },
    /// Collecting the digits of an octal escape while no output buffer exists yet.
    ///
    /// `octal_begin_offset` is the byte offset of the escape's backslash within the interior
    /// text and `octal` holds the digits read so far.
    Octal {
        octal_begin_offset: usize,
        octal: Vec<u8>,
    },
    /// Like `Octal`, but an output buffer already exists.
    HasEscapeOctal {
        out: Vec<u8>,
        octal: Vec<u8>,
    },
}

pub(crate) fn unquote(text: &str) -> Result<Cow<'_, str>, Box<dyn std::error::Error>> {
    if !text.starts_with('"') {
        return Err("Quoted text does not start with quote.".into());
    }
    if !text.ends_with('"') {
        return Err("Quoted text does not end with quote.".into());
    }
    let interior_text = &text[1..(text.len() - 1)];
    let mut state = UnquoteState::Normal;
    for (offset, current_char) in interior_text.bytes().enumerate() {
        // Check to see if octal finished
        state = match (state, current_char) {
            (
                UnquoteState::Octal {
                    octal_begin_offset,
                    octal,
                },
                b'0'..=b'7',
            ) if octal.len() < MAX_OCTAL_LENGTH => UnquoteState::Octal {
                octal_begin_offset,
                octal,
            },
            (
                UnquoteState::Octal {
                    octal_begin_offset,
                    octal,
                },
                _,
            ) => {
                let octal_number_string = String::from_utf8(octal)?;
                let decoded_byte = u8::from_str_radix(&octal_number_string, 8)?;
                let mut out: Vec<u8> = Vec::with_capacity(interior_text.len());
                out.extend_from_slice(&interior_text.as_bytes()[..octal_begin_offset]);
                out.push(decoded_byte);
                UnquoteState::HasEscape { out }
            }
            (UnquoteState::HasEscapeOctal { out, octal }, b'0'..=b'7')
                if octal.len() < MAX_OCTAL_LENGTH =>
            {
                UnquoteState::HasEscapeOctal { out, octal }
            }
            (UnquoteState::HasEscapeOctal { mut out, octal }, _) => {
                let octal_number_string = String::from_utf8(octal)?;
                let decoded_byte = u8::from_str_radix(&octal_number_string, 8)?;
                out.push(decoded_byte);
                UnquoteState::HasEscape { out }
            }
            (state, _) => state,
        };

        state = match (state, current_char) {
            (UnquoteState::Normal, b'\\') => UnquoteState::Escape,
            (UnquoteState::Normal, _) => UnquoteState::Normal,
            (UnquoteState::HasEscape { out }, b'\\') => UnquoteState::HasEscapeEscape { out },
            (UnquoteState::HasEscape { mut out }, _) => {
                out.push(current_char);
                UnquoteState::HasEscape { out }
            }
            (UnquoteState::Escape, b'n') => {
                let mut out: Vec<u8> = Vec::with_capacity(interior_text.len());
                // Subtract 1 from offset to account for backslash.
                out.extend_from_slice(&interior_text.as_bytes()[..(offset - 1)]);
                out.push(b'\n');
                UnquoteState::HasEscape { out }
            }
            (UnquoteState::HasEscapeEscape { mut out }, b'n') => {
                out.push(b'\n');
                UnquoteState::HasEscape { out }
            }
            (UnquoteState::Escape, b'\\') => {
                let mut out: Vec<u8> = Vec::with_capacity(interior_text.len());
                // Subtract 1 from offset to account for backslash.
                out.extend_from_slice(&interior_text.as_bytes()[..(offset - 1)]);
                out.push(b'\\');
                UnquoteState::HasEscape { out }
            }
            (UnquoteState::HasEscapeEscape { mut out }, b'\\') => {
                out.push(b'\\');
                UnquoteState::HasEscape { out }
            }
            (UnquoteState::Escape, b'"') => {
                let mut out: Vec<u8> = Vec::with_capacity(interior_text.len());
                // Subtract 1 from offset to account for backslash.
                out.extend_from_slice(&interior_text.as_bytes()[..(offset - 1)]);
                out.push(b'"');
                UnquoteState::HasEscape { out }
            }
            (UnquoteState::HasEscapeEscape { mut out }, b'"') => {
                out.push(b'"');
                UnquoteState::HasEscape { out }
            }
            (UnquoteState::Escape, b'0'..=b'7') => {
                let mut octal = Vec::with_capacity(MAX_OCTAL_LENGTH);
                octal.push(current_char);
                // Substract 1 from offset to account for backslash
                UnquoteState::Octal {
                    octal_begin_offset: offset - 1,
                    octal,
                }
            }
            (UnquoteState::HasEscapeEscape { out }, b'0'..=b'7') => {
                let mut octal = Vec::with_capacity(MAX_OCTAL_LENGTH);
                octal.push(current_char);
                // Substract 1 from offset to account for backslash
                UnquoteState::HasEscapeOctal { out, octal }
            }
            (
                UnquoteState::Octal {
                    octal_begin_offset,
                    mut octal,
                },
                b'0'..=b'7',
            ) => {
                octal.push(current_char);
                UnquoteState::Octal {
                    octal_begin_offset,
                    octal,
                }
            }
            (UnquoteState::HasEscapeOctal { out, mut octal }, b'0'..=b'7') => {
                octal.push(current_char);
                UnquoteState::HasEscapeOctal { out, octal }
            }
            (state, _) => panic!(
                "Invalid state unquoting string: {:?} | {} | {:?}",
                state, offset, interior_text
            ),
        };
    }

    match state {
        UnquoteState::Normal | UnquoteState::Escape | UnquoteState::Octal { .. } => {
            Ok(Cow::Borrowed(interior_text))
        }
        UnquoteState::HasEscape { out } => Ok(Cow::Owned(String::from_utf8(out)?)),
        UnquoteState::HasEscapeEscape { mut out } => {
            out.push(b'\\');
            Ok(Cow::Owned(String::from_utf8(out)?))
        }
        UnquoteState::HasEscapeOctal { mut out, octal } => {
            out.push(b'\\');
            out.extend(octal);
            Ok(Cow::Owned(String::from_utf8(out)?))
        }
    }
}

/// Parse one s-expression token from `input`, skipping any whitespace before and after it.
///
/// On success, yields the input left over after the trailing whitespace together with the token.
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
pub fn sexp<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
    let (after_leading_space, _) = multispace0(input)?;
    let (after_token, parsed) = token(after_leading_space)?;
    let (rest, _) = multispace0(after_token)?;
    Ok((rest, parsed))
}

/// Parse a single token: a list, a vector, or an atom.
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn token<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
    // Lists and vectors are tried first: the bare-atom parser only stops at whitespace, `)` and
    // `]`, so it would otherwise accept an opening `(` or `[` as the start of an atom.
    let mut any_token = alt((list, vector, atom));
    any_token(input)
}

/// Parse a parenthesized list: `(`, one or more whitespace-separated tokens, then `)`.
///
/// Whitespace directly inside the parentheses is allowed. At least one token is required.
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn list<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
    let (rest, children) = delimited(
        tag("("),
        delimited(
            multispace0,
            separated_list1(multispace1, token),
            multispace0,
        ),
        tag(")"),
    )(input)?;
    Ok((rest, Token::List(children)))
}

/// Parse a bracketed vector: `[`, one or more whitespace-separated tokens, then `]`.
///
/// Whitespace directly inside the brackets is allowed. At least one token is required.
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn vector<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
    let (rest, children) = delimited(
        tag("["),
        delimited(
            multispace0,
            separated_list1(multispace1, token),
            multispace0,
        ),
        tag("]"),
    )(input)?;
    Ok((rest, Token::Vector(children)))
}

/// Parse any non-list, non-vector token.
///
/// Fails without consuming input when the next character is `)` or `]`.
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
    // A closing delimiter never starts an atom; rejecting it here leaves it for the enclosing
    // list or vector parser.
    let mut reject_closing_delimiter = not(peek(one_of(")]")));
    reject_closing_delimiter(input)?;

    // The `#(`, `#<` and `"` forms go first because the bare-atom parser would accept their
    // leading characters as ordinary atom text.
    let mut any_atom = alt((
        text_with_properties,
        hash_notation,
        quoted_atom,
        unquoted_atom,
    ));
    any_atom(input)
}

/// Parse a bare atom: one or more characters up to (not including) the next space, tab, carriage
/// return, newline, `)` or `]`.
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn unquoted_atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
    let ends_atom = |c: char| matches!(c, ' ' | '\t' | '\r' | '\n' | ')' | ']');
    take_till1(ends_atom)(input).map(|(rest, body)| (rest, Token::Atom(body)))
}

#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn quoted_atom<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
    let (mut remaining, _) = tag(r#"""#)(input)?;
    let mut in_escape = false;
    loop {
        if in_escape {
            let (remain, _) = alt((recognize(one_of(r#""n\\"#)), digit1))(remaining)?;
            remaining = remain;
            in_escape = false;
        } else {
            let end_quote = tag::<_, _, nom::error::Error<_>>(r#"""#)(remaining);
            if end_quote.is_ok() {
                break;
            }

            let escape_backslash = tag::<_, _, nom::error::Error<_>>("\\")(remaining);
            if let Ok((remain, _)) = escape_backslash {
                remaining = remain;
                in_escape = true;
                continue;
            }

            let (remain, _) = anychar(remaining)?;
            remaining = remain;
        }
    }
    let (remaining, _) = tag(r#"""#)(remaining)?;
    let source = get_consumed(input, remaining);
    Ok((remaining, Token::Atom(source)))
}

/// Parse a `#<...>` object and return the whole form, delimiters included, as an atom.
///
/// The body must contain at least one character and ends at the first `>`.
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn hash_notation<'s>(input: &'s str) -> Res<&'s str, Token<'s>> {
    let (after_open, _) = tag("#<")(input)?;
    let (after_body, _) = take_till1(|c: char| c == '>')(after_open)?;
    let (rest, _) = tag(">")(after_body)?;
    Ok((rest, Token::Atom(get_consumed(input, rest))))
}

/// Parse a `#("text" ...)` form: a quoted string followed by whitespace and optional tokens.
///
/// The quoted string is stored with its quotes. The whitespace after the string is mandatory
/// even when no tokens follow it.
fn text_with_properties(input: &str) -> Res<&str, Token<'_>> {
    let (after_open, _) = tag("#(")(input)?;

    // `quoted_atom` only ever produces `Token::Atom`, so unwrap it to the raw source text.
    let quoted_text = map(quoted_atom, |atom| match atom {
        Token::Atom(body) => body,
        _ => unreachable!(),
    });
    let trailing_tokens = preceded(multispace1, opt(separated_list1(multispace1, token)));

    let (after_body, (text, props)) = delimited(
        multispace0,
        tuple((quoted_text, trailing_tokens)),
        multispace0,
    )(after_open)?;
    let (rest, _) = tag(")")(after_body)?;

    let properties = props.unwrap_or(Vec::new());
    Ok((
        rest,
        Token::TextWithProperties(TextWithProperties { text, properties }),
    ))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `input` with `sexp`, requiring that all input is consumed and the result is a list,
    /// and return the children of that list.
    fn parse_list(input: &str) -> Vec<Token<'_>> {
        let (remaining, parsed) = sexp(input).expect("Parse the input");
        assert_eq!(remaining, "");
        match parsed {
            Token::List(children) => children,
            other => panic!("Should be a list, got: {:?}", other),
        }
    }

    /// Return the source text of the atom at `index`, failing the test if that child is missing
    /// or is not an atom.
    fn atom_at<'s>(children: &[Token<'s>], index: usize) -> &'s str {
        match children.get(index) {
            Some(Token::Atom(body)) => *body,
            other => panic!("Child {} should be an atom, got: {:?}", index, other),
        }
    }

    #[test]
    fn simple() {
        // Succeeds only if the whole input parses as a single list.
        parse_list("  (foo bar baz )  ");
    }

    #[test]
    fn quoted() {
        let children = parse_list(r#"  ("foo" bar baz )  "#);
        // Quoted atoms keep their surrounding quotes.
        assert_eq!(atom_at(&children, 0), r#""foo""#);
    }

    #[test]
    fn quoted_containing_paren() {
        let children = parse_list(r#"  (foo "b(a)r" baz )  "#);
        assert_eq!(atom_at(&children, 0), r#"foo"#);
        assert_eq!(atom_at(&children, 1), r#""b(a)r""#);
        assert_eq!(atom_at(&children, 2), r#"baz"#);
    }

    #[test]
    fn string_containing_escaped_characters() {
        let children = parse_list(r#"  (foo "\\( x=2 \\)" bar)  "#);
        // Escapes are left untouched by the parser.
        assert_eq!(atom_at(&children, 1), r#""\\( x=2 \\)""#);
    }
}