organic/src/parser/util.rs

use nom::branch::alt;
use nom::character::complete::line_ending;
use nom::character::complete::multispace0;
use nom::character::complete::none_of;
use nom::character::complete::space0;
use nom::combinator::eof;
use nom::combinator::not;
use nom::combinator::opt;
use nom::combinator::peek;
use nom::combinator::recognize;
use nom::multi::many0;
use nom::sequence::tuple;

use super::parser_context::ContextElement;
use super::Context;
use crate::error::CustomError;
use crate::error::MyError;
use crate::error::Res;
use crate::parser::parser_with_context::parser_with_context;

/// The ASCII lowercase letters, uppercase letters, and digits, i.e. the characters treated as
/// word constituents.
pub const WORD_CONSTITUENT_CHARACTERS: &str =
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";

/// Report whether any `ContextElement::Context` entry in the context carries the given name.
///
/// Every element yielded by `context.iter()` is inspected, so the section may be at any depth.
/// Returns `false` when no entry matches.
#[allow(dead_code)]
pub fn in_section<'r, 's, 'x>(context: Context<'r, 's>, section_name: &'x str) -> bool {
    for node in context.iter() {
        if let ContextElement::Context(name) = node.get_data() {
            if *name == section_name {
                return true;
            }
        }
    }
    false
}

/// Report whether the first `ContextElement::Context` entry in the context has the given name.
///
/// Elements of any other kind are skipped. The first `Context` entry yielded by
/// `context.iter()` decides the answer: `true` if its name equals `section_name`, `false`
/// otherwise. Returns `false` when the context holds no `Context` entry at all.
///
/// NOTE(review): "immediate child" presumes `context.iter()` yields the innermost element
/// first — confirm against the context tree implementation.
pub fn immediate_in_section<'r, 's, 'x>(context: Context<'r, 's>, section_name: &'x str) -> bool {
    for node in context.iter() {
        if let ContextElement::Context(name) = node.get_data() {
            // Only the first named context matters; never look past it.
            return *name == section_name;
        }
    }
    false
}

/// Get one character from before the current position.
pub fn get_one_before<'s>(document: &'s str, current_position: &'s str) -> Option<&'s str> {
    assert!(is_slice_of(document, current_position));
    if document.as_ptr() as usize == current_position.as_ptr() as usize {
        return None;
    }
    let offset = current_position.as_ptr() as usize - document.as_ptr() as usize;
    let previous_character_offset = document.floor_char_boundary(offset - 1);
    Some(&document[previous_character_offset..offset])
}

/// Check whether `child` occupies memory that lies entirely inside `parent`.
///
/// This compares addresses only, not contents: two equal strings in different allocations are
/// not slices of one another.
fn is_slice_of(parent: &str, child: &str) -> bool {
    let parent_span = parent.as_bytes().as_ptr_range();
    let child_span = child.as_bytes().as_ptr_range();
    parent_span.start <= child_span.start && child_span.end <= parent_span.end
}

/// Return the prefix of `input` that a parser consumed.
///
/// `input` is what the parser was given and `remaining` is what it left over; the result is
/// everything in `input` that precedes `remaining`.
///
/// # Panics
///
/// Panics if `remaining` does not lie within `input`.
pub fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str {
    assert!(is_slice_of(input, remaining));
    let consumed_len = remaining.as_ptr() as usize - input.as_ptr() as usize;
    &input[..consumed_len]
}

/// Match one blank line: optional spaces/tabs terminated by a line ending or by the end of input.
///
/// The whole matched line, terminator included, is returned. Empty input is rejected, so a
/// successful match always consumes at least one character.
///
/// It is up to the caller to ensure this is called at the start of a line.
#[tracing::instrument(ret, level = "debug")]
pub fn blank_line(input: &str) -> Res<&str, &str> {
    // Reject end-of-input first; otherwise the `eof` terminator below would accept empty input.
    not(eof)(input)?;
    let line_terminator = alt((line_ending, eof));
    recognize(tuple((space0, line_terminator)))(input)
}

/// Match the blank lines that trail an element.
///
/// Fails unless `input` is at the start of a line (see `start_of_line`). After that it matches
/// either the end of input or zero or more blank lines, returning the matched text. Both
/// alternatives accept an empty match, so the result may be empty.
#[tracing::instrument(ret, level = "debug")]
pub fn element_trailing_whitespace<'r, 's>(
    context: Context<'r, 's>,
    input: &'s str,
) -> Res<&'s str, &'s str> {
    start_of_line(context, input)?;
    let blank_lines = recognize(many0(blank_line));
    alt((eof, blank_lines))(input)
}

/// Optionally consume an element's trailing whitespace, unless the exit matcher matches here.
///
/// Nothing is consumed and `None` is returned when the context says trailing whitespace should
/// not be consumed, or when `exit_matcher_parser` succeeds at `input`. The exit matcher is only
/// consulted if the context allows consuming. Otherwise `element_trailing_whitespace` is
/// attempted; because it is wrapped in `opt`, a recoverable failure also yields `None`.
#[tracing::instrument(ret, level = "debug")]
pub fn maybe_consume_trailing_whitespace_if_not_exiting<'r, 's>(
    context: Context<'r, 's>,
    input: &'s str,
) -> Res<&'s str, Option<&'s str>> {
    let may_consume = context.should_consume_trailing_whitespace()
        && exit_matcher_parser(context, input).is_err();
    if !may_consume {
        return Ok((input, None));
    }
    let whitespace_parser = parser_with_context!(element_trailing_whitespace)(context);
    opt(whitespace_parser)(input)
}

/// Optionally consume an element's trailing whitespace when the context allows it.
///
/// Nothing is consumed and `None` is returned when the context says trailing whitespace should
/// not be consumed. Otherwise `element_trailing_whitespace` is attempted; because it is wrapped
/// in `opt`, a recoverable failure also yields `None`.
#[tracing::instrument(ret, level = "debug")]
pub fn maybe_consume_trailing_whitespace<'r, 's>(
    context: Context<'r, 's>,
    input: &'s str,
) -> Res<&'s str, Option<&'s str>> {
    if !context.should_consume_trailing_whitespace() {
        return Ok((input, None));
    }
    let whitespace_parser = parser_with_context!(element_trailing_whitespace)(context);
    opt(whitespace_parser)(input)
}

/// Match the end of input, or a line ending followed by zero or more blank lines.
///
/// Returns the matched text, which is empty when the end-of-input alternative matched.
#[tracing::instrument(ret, level = "debug")]
pub fn trailing_whitespace(input: &str) -> Res<&str, &str> {
    let line_break_then_blank_lines = recognize(tuple((line_ending, many0(blank_line))));
    alt((eof, line_break_then_blank_lines))(input)
}

/// Check that we are at the start of a line
#[tracing::instrument(ret, level = "debug")]
pub fn start_of_line<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, ()> {
    let document_root = context.get_document_root().unwrap();
    let preceding_character = get_one_before(document_root, input)
        .map(|slice| slice.chars().next())
        .flatten();
    match preceding_character {
        Some('\n') => {}
        Some(_) => {
            // Not at start of line, cannot be a heading
            return Err(nom::Err::Error(CustomError::MyError(MyError(
                "Not at start of line",
            ))));
        }
        // If None, we are at the start of the file which allows for headings
        None => {}
    };
    Ok((input, ()))
}

/// Consume and return a single character that is not whitespace.
///
/// Only space, tab, carriage return, and line feed count as whitespace here; other Unicode
/// whitespace characters are accepted like any other character.
#[tracing::instrument(ret, level = "debug")]
pub fn non_whitespace_character(input: &str) -> Res<&str, char> {
    // The only characters this parser refuses to return.
    const EXCLUDED_WHITESPACE: &str = " \t\r\n";
    none_of(EXCLUDED_WHITESPACE)(input)
}

/// Test whether the context's exit matcher matches at `input`, without consuming anything.
///
/// The check is delegated to `context.check_exit_matcher` and wrapped in `peek`, so on success
/// the remaining input is still `input`. Fails when the exit matcher fails.
#[tracing::instrument(ret, level = "debug")]
pub fn exit_matcher_parser<'r, 's>(
    context: Context<'r, 's>,
    input: &'s str,
) -> Res<&'s str, &'s str> {
    let exit_check = |i| context.check_exit_matcher(i);
    peek(exit_check)(input)
}

#[tracing::instrument(ret, level = "debug")]
pub fn always_fail<'r, 's>(_context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> {
    Err(nom::Err::Error(CustomError::MyError(MyError(
        "Always fail",
    ))))
}

/// Walk backwards over the consumed text, giving back trailing blank lines and line endings.
///
/// List items are a special case where the trailing blank lines do not belong to them, unlike
/// all other elements. Rather than writing that special logic into each child parser, this
/// walks backwards through the consumed input to unconsume trailing blank lines and line breaks.
///
/// `input` is what the parser was given and `remaining` is what it left over. The result is the
/// new remaining input: it starts at the earliest line feed of the trailing run of spaces, tabs,
/// and line feeds in the consumed text. If that run contains no line feed, `remaining` is
/// returned unchanged. If the consumed text is nothing but such whitespace, the whole of
/// `input` is returned.
///
/// # Panics
///
/// Panics if `remaining` does not lie within `input`.
#[tracing::instrument(ret, level = "debug")]
pub fn regurgitate<'s>(input: &'s str, remaining: &'s str) -> &'s str {
    assert!(is_slice_of(input, remaining));
    let consumed_len = remaining.as_ptr() as usize - input.as_ptr() as usize;
    // Where the returned slice will start; pulled back each time a line feed is passed.
    let mut cut = consumed_len;
    for (index, character) in input[..consumed_len].char_indices().rev() {
        match character {
            '\n' => cut = index,
            ' ' | '\t' => {}
            // First character of real content: stop giving text back.
            _ => return &input[cut..],
        }
    }
    // It was all whitespace, so return the full input string
    input
}

/// Match zero or more whitespace characters (including line breaks) running to the end of input.
///
/// Returns the matched whitespace. Fails if anything other than whitespace remains.
#[tracing::instrument(ret, level = "debug")]
pub fn whitespace_eof(input: &str) -> Res<&str, &str> {
    let whitespace_then_end = tuple((multispace0, eof));
    recognize(whitespace_then_end)(input)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Looking behind a multi-byte character must return the whole preceding character.
    #[test]
    fn get_one_before_unicode() {
        let document = "🧡💛💚💙💜";
        // Byte offset of the third character (the green heart).
        let (green_heart_offset, _) = document.char_indices().nth(2).unwrap();
        let from_green_heart = &document[green_heart_offset..];
        let preceding = get_one_before(document, from_green_heart).unwrap();
        assert!(is_slice_of(document, preceding));
        assert_eq!(preceding, "💛");
    }

    /// Trailing blank lines after multi-byte content must be handed back in full.
    #[test]
    fn regurgitate_unicode() {
        let document = "🧡💛\n\t   \t  \n\n💚💙💜";
        // Byte offset of the thirteenth character (the green heart).
        let (green_heart_offset, _) = document.char_indices().nth(12).unwrap();
        let from_green_heart = &document[green_heart_offset..];
        let unconsumed = regurgitate(document, from_green_heart);
        assert!(is_slice_of(document, unconsumed));
        assert_eq!(unconsumed, "\n\t   \t  \n\n💚💙💜");
    }
}
Consume the trailing whitespace after a headline. 2023-03-25 11:59:19 -04:00			`use nom::branch::alt;`
			`use nom::character::complete::line_ending;`
Add initial implementation of footnote definition. 2023-04-07 17:14:44 -04:00			`use nom::character::complete::multispace0;`
Building the plain list item context. 2023-03-25 14:10:22 -04:00			`use nom::character::complete::none_of;`
Consume the trailing whitespace after a headline. 2023-03-25 11:59:19 -04:00			`use nom::character::complete::space0;`
			`use nom::combinator::eof;`
			`use nom::combinator::not;`
Footnote definitions are parsing on their own. 2023-04-10 11:50:43 -04:00			`use nom::combinator::opt;`
I seem to have solved the infinite loop issue by moving the exit check into the plain list parser. 2023-03-27 12:52:49 -04:00			`use nom::combinator::peek;`
Consume the trailing whitespace after a headline. 2023-03-25 11:59:19 -04:00			`use nom::combinator::recognize;`
			`use nom::multi::many0;`
			`use nom::sequence::tuple;`

Simulate trailing whitespace in empty greater blocks just like drawers. 2023-04-22 21:45:18 -04:00			`use super::parser_context::ContextElement;`
			`use super::Context;`
			`use crate::error::CustomError;`
			`use crate::error::MyError;`
			`use crate::error::Res;`
			`use crate::parser::parser_with_context::parser_with_context;`

Add initial implementation of footnote definition. 2023-04-07 17:14:44 -04:00			`pub const WORD_CONSTITUENT_CHARACTERS: &str =`
			`"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";`
Move in_section to a util module. 2022-12-18 03:18:43 -05:00
Add a DocumentRoot context element storing the original full document. This might be used for look-behind instead of storing previous element nodes in the context tree. 2023-03-23 16:40:39 -04:00			`/// Check if we are below a section of the given section type regardless of depth`
Cleaning up compiler warnings. 2023-04-21 18:22:17 -04:00			`#[allow(dead_code)]`
Standardize order of lifetimes for r and s. 2022-12-18 03:30:28 -05:00			`pub fn in_section<'r, 's, 'x>(context: Context<'r, 's>, section_name: &'x str) -> bool {`
Move in_section to a util module. 2022-12-18 03:18:43 -05:00			`for thing in context.iter() {`
			`match thing.get_data() {`
			`ContextElement::Context(name) if *name == section_name => return true,`
Add a DocumentRoot context element storing the original full document. This might be used for look-behind instead of storing previous element nodes in the context tree. 2023-03-23 16:40:39 -04:00			`_ => {}`
			`}`
			`}`
			`false`
			`}`

			`/// Checks if we are currently an immediate child of the given section type`
			`pub fn immediate_in_section<'r, 's, 'x>(context: Context<'r, 's>, section_name: &'x str) -> bool {`
			`for thing in context.iter() {`
			`match thing.get_data() {`
			`ContextElement::Context(name) if *name == section_name => return true,`
			`ContextElement::Context(name) if *name != section_name => return false,`
			`_ => {}`
Move in_section to a util module. 2022-12-18 03:18:43 -05:00			`}`
			`}`
			`false`
			`}`
Move some functions into util. 2023-03-25 11:25:10 -04:00
			`/// Get one character from before the current position.`
			`pub fn get_one_before<'s>(document: &'s str, current_position: &'s str) -> Option<&'s str> {`
			`assert!(is_slice_of(document, current_position));`
			`if document.as_ptr() as usize == current_position.as_ptr() as usize {`
			`return None;`
			`}`
			`let offset = current_position.as_ptr() as usize - document.as_ptr() as usize;`
			`let previous_character_offset = document.floor_char_boundary(offset - 1);`
			`Some(&document[previous_character_offset..offset])`
			`}`

			`/// Check if the child string slice is a slice of the parent string slice.`
			`fn is_slice_of(parent: &str, child: &str) -> bool {`
			`let parent_start = parent.as_ptr() as usize;`
			`let parent_end = parent_start + parent.len();`
			`let child_start = child.as_ptr() as usize;`
			`let child_end = child_start + child.len();`
			`child_start >= parent_start && child_end <= parent_end`
			`}`

			`/// Get a slice of the string that was consumed in a parser using the original input to the parser and the remaining input after the parser.`
			`pub fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str {`
			`assert!(is_slice_of(input, remaining));`
			`let source = {`
			`let offset = remaining.as_ptr() as usize - input.as_ptr() as usize;`
			`&input[..offset]`
			`};`
			`source`
			`}`

Consume the trailing whitespace after a headline. 2023-03-25 11:59:19 -04:00			`/// A line containing only whitespace and then a line break`
			`///`
			`/// It is up to the caller to ensure this is called at the start of a line.`
Identified the problem. The issue is plain text is eating the line break so paragraph is failing since it expects a line break at the end. 2023-03-27 18:08:17 -04:00			`#[tracing::instrument(ret, level = "debug")]`
Consume the trailing whitespace after a headline. 2023-03-25 11:59:19 -04:00			`pub fn blank_line(input: &str) -> Res<&str, &str> {`
			`not(eof)(input)?;`
			`recognize(tuple((space0, alt((line_ending, eof)))))(input)`
			`}`

Move trailing whitespace parsing to a separate element. I still need to parse the line break at the end of elements. 2023-03-31 11:16:37 -04:00			`#[tracing::instrument(ret, level = "debug")]`
			`pub fn element_trailing_whitespace<'r, 's>(`
			`context: Context<'r, 's>,`
			`input: &'s str,`
			`) -> Res<&'s str, &'s str> {`
			`start_of_line(context, input)?;`
			`alt((eof, recognize(many0(blank_line))))(input)`
			`}`

Do not consume trailing whitespace if the parent exit matcher is matching. 2023-04-10 13:13:11 -04:00			`#[tracing::instrument(ret, level = "debug")]`
			`pub fn maybe_consume_trailing_whitespace_if_not_exiting<'r, 's>(`
			`context: Context<'r, 's>,`
			`input: &'s str,`
			`) -> Res<&'s str, Option<&'s str>> {`
			`if context.should_consume_trailing_whitespace() && exit_matcher_parser(context, input).is_err()`
			`{`
			`Ok(opt(parser_with_context!(element_trailing_whitespace)(`
			`context,`
			`))(input)?)`
			`} else {`
			`Ok((input, None))`
			`}`
			`}`

Footnote definitions are parsing on their own. 2023-04-10 11:50:43 -04:00			`#[tracing::instrument(ret, level = "debug")]`
			`pub fn maybe_consume_trailing_whitespace<'r, 's>(`
			`context: Context<'r, 's>,`
			`input: &'s str,`
			`) -> Res<&'s str, Option<&'s str>> {`
			`if context.should_consume_trailing_whitespace() {`
			`Ok(opt(parser_with_context!(element_trailing_whitespace)(`
			`context,`
			`))(input)?)`
			`} else {`
			`Ok((input, None))`
			`}`
			`}`

Identified the problem. The issue is plain text is eating the line break so paragraph is failing since it expects a line break at the end. 2023-03-27 18:08:17 -04:00			`#[tracing::instrument(ret, level = "debug")]`
Consume the trailing whitespace after a headline. 2023-03-25 11:59:19 -04:00			`pub fn trailing_whitespace(input: &str) -> Res<&str, &str> {`
			`alt((eof, recognize(tuple((line_ending, many0(blank_line))))))(input)`
			`}`

Building the plain list item context. 2023-03-25 14:10:22 -04:00			`/// Check that we are at the start of a line`
Identified the problem. The issue is plain text is eating the line break so paragraph is failing since it expects a line break at the end. 2023-03-27 18:08:17 -04:00			`#[tracing::instrument(ret, level = "debug")]`
Building the plain list item context. 2023-03-25 14:10:22 -04:00			`pub fn start_of_line<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, ()> {`
			`let document_root = context.get_document_root().unwrap();`
			`let preceding_character = get_one_before(document_root, input)`
			`.map(\|slice\| slice.chars().next())`
			`.flatten();`
			`match preceding_character {`
			`Some('\n') => {}`
			`Some(_) => {`
			`// Not at start of line, cannot be a heading`
			`return Err(nom::Err::Error(CustomError::MyError(MyError(`
			`"Not at start of line",`
			`))));`
			`}`
			`// If None, we are at the start of the file which allows for headings`
			`None => {}`
			`};`
			`Ok((input, ()))`
			`}`

			`/// Pull one non-whitespace character.`
			`///`
			`/// This function only operates on spaces, tabs, carriage returns, and line feeds. It does not handle fancy unicode whitespace.`
Identified the problem. The issue is plain text is eating the line break so paragraph is failing since it expects a line break at the end. 2023-03-27 18:08:17 -04:00			`#[tracing::instrument(ret, level = "debug")]`
Building the plain list item context. 2023-03-25 14:10:22 -04:00			`pub fn non_whitespace_character(input: &str) -> Res<&str, char> {`
			`none_of(" \t\r\n")(input)`
			`}`

I seem to have solved the infinite loop issue by moving the exit check into the plain list parser. 2023-03-27 12:52:49 -04:00			`/// Check that we are at the start of a line`
Instrument the code. 2023-03-27 15:08:29 -04:00			`#[tracing::instrument(ret, level = "debug")]`
I seem to have solved the infinite loop issue by moving the exit check into the plain list parser. 2023-03-27 12:52:49 -04:00			`pub fn exit_matcher_parser<'r, 's>(`
			`context: Context<'r, 's>,`
			`input: &'s str,`
			`) -> Res<&'s str, &'s str> {`
			`peek(\|i\| context.check_exit_matcher(i))(input)`
			`}`

Identified the problem. The issue is plain text is eating the line break so paragraph is failing since it expects a line break at the end. 2023-03-27 18:08:17 -04:00			`#[tracing::instrument(ret, level = "debug")]`
			`pub fn always_fail<'r, 's>(_context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> {`
			`Err(nom::Err::Error(CustomError::MyError(MyError(`
			`"Always fail",`
			`))))`
			`}`

Regurgitate seems to have made all text a paragraph. 2023-03-27 19:12:20 -04:00			`/// Walk backwards unconsuming blank lines and line endings.`
			`///`
			`/// List items are a special case where the trailing blank lines do not belong to it, unlike all other elements. Rather than write that special logic into each child parser, this just walks backwards through the consumed input to unconsume trailing blank lines and line breaks.`
Not sure whats going on. 2023-03-27 19:19:51 -04:00			`#[tracing::instrument(ret, level = "debug")]`
Regurgitate seems to have made all text a paragraph. 2023-03-27 19:12:20 -04:00			`pub fn regurgitate<'s>(input: &'s str, remaining: &'s str) -> &'s str {`
			`assert!(is_slice_of(input, remaining));`
			`let mut offset = remaining.as_ptr() as usize - input.as_ptr() as usize;`
			`let source = &input[..offset];`
			`let mut char_indices = source.char_indices().rev();`
			`loop {`
			`match char_indices.next() {`
			`Some((off, chr)) => {`
			`if chr == '\n' {`
			`offset = off;`
			`} else if chr != ' ' && chr != '\t' {`
			`return &input[offset..];`
			`}`
			`}`
			`None => {`
			`// It was all whitespace, so return the full input string`
			`return input;`
			`}`
			`};`
			`}`
			`}`

Add initial implementation of footnote definition. 2023-04-07 17:14:44 -04:00			`#[tracing::instrument(ret, level = "debug")]`
			`pub fn whitespace_eof(input: &str) -> Res<&str, &str> {`
			`recognize(tuple((multispace0, eof)))(input)`
			`}`

Move some functions into util. 2023-03-25 11:25:10 -04:00			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`

			`#[test]`
			`fn get_one_before_unicode() {`
			`let input = "🧡💛💚💙💜";`
			`let (green_heart_index, _) = input.char_indices().skip(2).next().unwrap();`
			`let starting_with_green_heart = &input[green_heart_index..];`
			`let yellow_heart = get_one_before(input, starting_with_green_heart).unwrap();`
			`assert!(is_slice_of(input, yellow_heart));`
			`assert_eq!(yellow_heart, "💛");`
			`}`
Regurgitate seems to have made all text a paragraph. 2023-03-27 19:12:20 -04:00
			`#[test]`
			`fn regurgitate_unicode() {`
			`let input = "🧡💛\n\t \t \n\n💚💙💜";`
			`let (green_heart_index, _) = input.char_indices().skip(12).next().unwrap();`
			`let starting_with_green_heart = &input[green_heart_index..];`
			`let after_yellow = regurgitate(input, starting_with_green_heart);`
			`assert!(is_slice_of(input, after_yellow));`
			`assert_eq!(after_yellow, "\n\t \t \n\n💚💙💜");`
			`}`
Move some functions into util. 2023-03-25 11:25:10 -04:00			`}`