organic/src/parser/util.rs

use nom::branch::alt;
use nom::character::complete::anychar;
use nom::character::complete::line_ending;
use nom::character::complete::multispace0;
use nom::character::complete::none_of;
use nom::character::complete::space0;
use nom::combinator::eof;
use nom::combinator::not;
use nom::combinator::opt;
use nom::combinator::peek;
use nom::combinator::recognize;
use nom::combinator::verify;
use nom::multi::many0;
use nom::multi::many_till;
use nom::sequence::tuple;

use super::parser_context::ContextElement;
use super::Context;
use crate::error::CustomError;
use crate::error::MyError;
use crate::error::Res;
use crate::parser::parser_with_context::parser_with_context;

/// The ASCII letters (lower and upper case) followed by the ASCII digits.
///
/// NOTE(review): the name suggests these are the characters treated as part of a "word" by
/// the parsers; the consumers are not in this file, so confirm against the call sites.
pub const WORD_CONSTITUENT_CHARACTERS: &str =
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";

/// Report whether any element of the context is a `ContextElement::Context` named
/// `section_name`.
///
/// Every element yielded by `context.iter()` is considered, so the match may be at any depth.
/// Elements of other kinds are ignored.
#[allow(dead_code)]
pub fn in_section<'r, 's, 'x>(context: Context<'r, 's>, section_name: &'x str) -> bool {
    context.iter().any(|node| {
        matches!(
            node.get_data(),
            ContextElement::Context(name) if *name == section_name
        )
    })
}

/// Report whether the first `ContextElement::Context` found in the context is named
/// `section_name`.
///
/// Elements of other kinds are skipped. The first `Context` element yielded by
/// `context.iter()` decides the answer; if there is none, the result is `false`.
/// NOTE(review): this presumes `context.iter()` walks from the innermost element outwards —
/// confirm against the context tree implementation.
pub fn immediate_in_section<'r, 's, 'x>(context: Context<'r, 's>, section_name: &'x str) -> bool {
    context
        .iter()
        .find_map(|node| match node.get_data() {
            ContextElement::Context(name) => Some(*name == section_name),
            _ => None,
        })
        .unwrap_or(false)
}

/// Get one character from before the current position.
pub fn get_one_before<'s>(document: &'s str, current_position: &'s str) -> Option<&'s str> {
    assert!(is_slice_of(document, current_position));
    if document.as_ptr() as usize == current_position.as_ptr() as usize {
        return None;
    }
    let offset = current_position.as_ptr() as usize - document.as_ptr() as usize;
    let previous_character_offset = document.floor_char_boundary(offset - 1);
    Some(&document[previous_character_offset..offset])
}

/// Get the line current_position is on up until current_position
pub fn get_current_line_before_position<'s>(
    document: &'s str,
    current_position: &'s str,
) -> Option<&'s str> {
    assert!(is_slice_of(document, current_position));
    if document.as_ptr() as usize == current_position.as_ptr() as usize {
        return None;
    }
    let offset = current_position.as_ptr() as usize - document.as_ptr() as usize;
    let mut previous_character_offset = offset;
    loop {
        let new_offset = document.floor_char_boundary(previous_character_offset - 1);
        let new_line = &document[new_offset..offset];
        let leading_char = new_line
            .chars()
            .next()
            .expect("Impossible to not have at least 1 character to read.");
        if "\r\n".contains(leading_char) || new_offset == 0 {
            break;
        }
        previous_character_offset = new_offset;
    }
    Some(&document[previous_character_offset..offset])
}

/// Report whether `child` occupies memory that lies entirely inside `parent`.
///
/// Only addresses are compared, never contents: two equal strings stored in different
/// locations are not slices of one another.
fn is_slice_of(parent: &str, child: &str) -> bool {
    let parent_span = parent.as_bytes().as_ptr_range();
    let child_span = child.as_bytes().as_ptr_range();
    parent_span.start <= child_span.start && child_span.end <= parent_span.end
}

/// Return the prefix of `input` that a parser consumed, given what it left over.
///
/// `remaining` must be a sub-slice of `input`; the result is everything in `input` that
/// comes before the start of `remaining`.
///
/// # Panics
///
/// Panics if `remaining` does not lie within `input`.
pub fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str {
    assert!(is_slice_of(input, remaining));
    let consumed_len = remaining.as_ptr() as usize - input.as_ptr() as usize;
    &input[..consumed_len]
}

/// Match one blank line: optional spaces/tabs terminated by a line ending or the end of input.
///
/// The whole matched line, terminator included, is returned. The parser does not check where
/// it starts, so the caller must make sure it is invoked at the start of a line.
#[tracing::instrument(ret, level = "debug")]
pub fn blank_line(input: &str) -> Res<&str, &str> {
    // Refuse to run at end of input: `space0` followed by `eof` would otherwise succeed there
    // without consuming anything.
    not(eof)(input)?;
    let line_terminator = alt((line_ending, eof));
    let whole_line = tuple((space0, line_terminator));
    recognize(whole_line)(input)
}

/// Match the blank lines that trail an element.
///
/// Fails unless `input` is at the start of a line. Otherwise matches either the end of input
/// or a run of zero or more blank lines, returning the matched text (which may be empty).
#[tracing::instrument(ret, level = "debug")]
pub fn element_trailing_whitespace<'r, 's>(
    context: Context<'r, 's>,
    input: &'s str,
) -> Res<&'s str, &'s str> {
    start_of_line(context, input)?;
    let blank_lines = recognize(many0(blank_line));
    alt((eof, blank_lines))(input)
}

/// Optionally consume an element's trailing whitespace, unless the exit matcher matches here.
///
/// Nothing is consumed (and `None` is returned) when the context reports that trailing
/// whitespace should not be consumed, or when the exit matcher succeeds at `input`. Otherwise
/// the trailing-whitespace parser is attempted; a recoverable failure of that parser also
/// yields `None` without consuming anything.
#[tracing::instrument(ret, level = "debug")]
pub fn maybe_consume_trailing_whitespace_if_not_exiting<'r, 's>(
    context: Context<'r, 's>,
    input: &'s str,
) -> Res<&'s str, Option<&'s str>> {
    if !context.should_consume_trailing_whitespace() {
        return Ok((input, None));
    }
    // The exit matcher is only consulted once the context has asked for consumption.
    if exit_matcher_parser(context, input).is_ok() {
        return Ok((input, None));
    }
    opt(parser_with_context!(element_trailing_whitespace)(
        context,
    ))(input)
}

/// Optionally consume an element's trailing whitespace when the context asks for it.
///
/// Returns `None` without consuming anything when the context reports that trailing
/// whitespace should not be consumed, or when the trailing-whitespace parser fails with a
/// recoverable error. Otherwise returns the consumed whitespace.
#[tracing::instrument(ret, level = "debug")]
pub fn maybe_consume_trailing_whitespace<'r, 's>(
    context: Context<'r, 's>,
    input: &'s str,
) -> Res<&'s str, Option<&'s str>> {
    if !context.should_consume_trailing_whitespace() {
        return Ok((input, None));
    }
    opt(parser_with_context!(element_trailing_whitespace)(
        context,
    ))(input)
}

/// Match either the end of input, or a line ending followed by any number of blank lines.
///
/// Returns the matched text.
#[tracing::instrument(ret, level = "debug")]
pub fn trailing_whitespace(input: &str) -> Res<&str, &str> {
    let line_break_then_blanks = recognize(tuple((line_ending, many0(blank_line))));
    alt((eof, line_break_then_blanks))(input)
}

/// Check that we are at the start of a line
#[tracing::instrument(ret, level = "debug")]
pub fn start_of_line<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, ()> {
    let document_root = context.get_document_root().unwrap();
    let preceding_character = get_one_before(document_root, input)
        .map(|slice| slice.chars().next())
        .flatten();
    match preceding_character {
        Some('\n') => {}
        Some(_) => {
            // Not at start of line, cannot be a heading
            return Err(nom::Err::Error(CustomError::MyError(MyError(
                "Not at start of line",
            ))));
        }
        // If None, we are at the start of the file which allows for headings
        None => {}
    };
    Ok((input, ()))
}

/// Check that we are at the start of a line
#[tracing::instrument(ret, level = "debug")]
pub fn preceded_by_whitespace<'r, 's>(
    context: Context<'r, 's>,
    input: &'s str,
) -> Res<&'s str, ()> {
    let document_root = context.get_document_root().unwrap();
    let preceding_character = get_one_before(document_root, input)
        .map(|slice| slice.chars().next())
        .flatten();
    match preceding_character {
        Some('\n') | Some('\r') | Some(' ') | Some('\t') => {}
        // If None, we are at the start of the file which is not allowed
        None | Some(_) => {
            return Err(nom::Err::Error(CustomError::MyError(MyError(
                "Not preceded by whitespace.",
            ))));
        }
    };
    Ok((input, ()))
}

/// Consume and return a single character that is not whitespace.
///
/// Only space, tab, carriage return and line feed count as whitespace; other Unicode
/// whitespace characters are accepted like any ordinary character.
#[tracing::instrument(ret, level = "debug")]
pub fn non_whitespace_character(input: &str) -> Res<&str, char> {
    const WHITESPACE: &str = " \t\r\n";
    none_of(WHITESPACE)(input)
}

/// Run the context's exit matcher at `input` without consuming any input.
///
/// The result of `context.check_exit_matcher` is passed through on success, but the remaining
/// input is left at `input` because the matcher is wrapped in `peek`.
#[tracing::instrument(ret, level = "debug")]
pub fn exit_matcher_parser<'r, 's>(
    context: Context<'r, 's>,
    input: &'s str,
) -> Res<&'s str, &'s str> {
    let run_exit_matcher = |remaining: &'s str| context.check_exit_matcher(remaining);
    peek(run_exit_matcher)(input)
}

#[tracing::instrument(ret, level = "debug")]
pub fn always_fail<'r, 's>(_context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> {
    Err(nom::Err::Error(CustomError::MyError(MyError(
        "Always fail",
    ))))
}

/// Match any amount of whitespace (including none) that runs up to the end of input.
///
/// Returns the matched whitespace; fails if anything other than whitespace remains.
#[tracing::instrument(ret, level = "debug")]
pub fn whitespace_eof(input: &str) -> Res<&str, &str> {
    let whitespace_then_end = tuple((multispace0, eof));
    recognize(whitespace_then_end)(input)
}

/// Consume characters one at a time until the context's exit matcher matches.
///
/// Returns the text recognized up to that point. The parse fails when the exit matcher
/// already matches at `input` (no characters would be consumed), or when the input runs out
/// before the exit matcher ever matches.
#[tracing::instrument(ret, level = "debug")]
pub fn text_until_exit<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> {
    let characters_until_exit =
        many_till(anychar, parser_with_context!(exit_matcher_parser)(context));
    recognize(verify(
        characters_until_exit,
        // Require at least one character before the exit point.
        |(consumed, _exit_contents)| !consumed.is_empty(),
    ))(input)
}

#[allow(dead_code)]
pub fn not_yet_implemented() -> Res<&'static str, ()> {
    return Err(nom::Err::Error(CustomError::MyError(MyError(
        "Not implemented yet.",
    ))));
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Looking back from the third heart must yield the whole multi-byte second heart,
    /// borrowed from the original input.
    #[test]
    fn get_one_before_unicode() {
        let input = "🧡💛💚💙💜";
        let green_heart_index = input
            .char_indices()
            .nth(2)
            .map(|(index, _)| index)
            .unwrap();
        let (_, starting_with_green_heart) = input.split_at(green_heart_index);
        let yellow_heart = get_one_before(input, starting_with_green_heart).unwrap();
        assert!(is_slice_of(input, yellow_heart));
        assert_eq!(yellow_heart, "💛");
    }
}
Consume the trailing whitespace after a headline. 2023-03-25 11:59:19 -04:00			`use nom::branch::alt;`
Switch to using plain text with no additional exit matcher added. 2023-04-22 22:06:34 -04:00			`use nom::character::complete::anychar;`
Consume the trailing whitespace after a headline. 2023-03-25 11:59:19 -04:00			`use nom::character::complete::line_ending;`
Add initial implementation of footnote definition. 2023-04-07 17:14:44 -04:00			`use nom::character::complete::multispace0;`
Building the plain list item context. 2023-03-25 14:10:22 -04:00			`use nom::character::complete::none_of;`
Consume the trailing whitespace after a headline. 2023-03-25 11:59:19 -04:00			`use nom::character::complete::space0;`
			`use nom::combinator::eof;`
			`use nom::combinator::not;`
Footnote definitions are parsing on their own. 2023-04-10 11:50:43 -04:00			`use nom::combinator::opt;`
I seem to have solved the infinite loop issue by moving the exit check into the plain list parser. 2023-03-27 12:52:49 -04:00			`use nom::combinator::peek;`
Consume the trailing whitespace after a headline. 2023-03-25 11:59:19 -04:00			`use nom::combinator::recognize;`
Switch to using plain text with no additional exit matcher added. 2023-04-22 22:06:34 -04:00			`use nom::combinator::verify;`
Consume the trailing whitespace after a headline. 2023-03-25 11:59:19 -04:00			`use nom::multi::many0;`
Switch to using plain text with no additional exit matcher added. 2023-04-22 22:06:34 -04:00			`use nom::multi::many_till;`
Consume the trailing whitespace after a headline. 2023-03-25 11:59:19 -04:00			`use nom::sequence::tuple;`

Simulate trailing whitespace in empty greater blocks just like drawers. 2023-04-22 21:45:18 -04:00			`use super::parser_context::ContextElement;`
			`use super::Context;`
			`use crate::error::CustomError;`
			`use crate::error::MyError;`
			`use crate::error::Res;`
			`use crate::parser::parser_with_context::parser_with_context;`

Add initial implementation of footnote definition. 2023-04-07 17:14:44 -04:00			`pub const WORD_CONSTITUENT_CHARACTERS: &str =`
			`"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";`
Move in_section to a util module. 2022-12-18 03:18:43 -05:00
Add a DocumentRoot context element storing the original full document. This might be used for look-behind instead of storing previous element nodes in the context tree. 2023-03-23 16:40:39 -04:00			`/// Check if we are below a section of the given section type regardless of depth`
Cleaning up compiler warnings. 2023-04-21 18:22:17 -04:00			`#[allow(dead_code)]`
Standardize order of lifetimes for r and s. 2022-12-18 03:30:28 -05:00			`pub fn in_section<'r, 's, 'x>(context: Context<'r, 's>, section_name: &'x str) -> bool {`
Move in_section to a util module. 2022-12-18 03:18:43 -05:00			`for thing in context.iter() {`
			`match thing.get_data() {`
			`ContextElement::Context(name) if *name == section_name => return true,`
Add a DocumentRoot context element storing the original full document. This might be used for look-behind instead of storing previous element nodes in the context tree. 2023-03-23 16:40:39 -04:00			`_ => {}`
			`}`
			`}`
			`false`
			`}`

			`/// Checks if we are currently an immediate child of the given section type`
			`pub fn immediate_in_section<'r, 's, 'x>(context: Context<'r, 's>, section_name: &'x str) -> bool {`
			`for thing in context.iter() {`
			`match thing.get_data() {`
			`ContextElement::Context(name) if *name == section_name => return true,`
			`ContextElement::Context(name) if *name != section_name => return false,`
			`_ => {}`
Move in_section to a util module. 2022-12-18 03:18:43 -05:00			`}`
			`}`
			`false`
			`}`
Move some functions into util. 2023-03-25 11:25:10 -04:00
			`/// Get one character from before the current position.`
			`pub fn get_one_before<'s>(document: &'s str, current_position: &'s str) -> Option<&'s str> {`
			`assert!(is_slice_of(document, current_position));`
			`if document.as_ptr() as usize == current_position.as_ptr() as usize {`
			`return None;`
			`}`
			`let offset = current_position.as_ptr() as usize - document.as_ptr() as usize;`
			`let previous_character_offset = document.floor_char_boundary(offset - 1);`
			`Some(&document[previous_character_offset..offset])`
			`}`

Check that the preceding line for a line break is non-empty. 2023-07-22 00:26:54 -04:00			`/// Get the line current_position is on up until current_position`
			`pub fn get_current_line_before_position<'s>(`
			`document: &'s str,`
			`current_position: &'s str,`
			`) -> Option<&'s str> {`
			`assert!(is_slice_of(document, current_position));`
			`if document.as_ptr() as usize == current_position.as_ptr() as usize {`
			`return None;`
			`}`
			`let offset = current_position.as_ptr() as usize - document.as_ptr() as usize;`
			`let mut previous_character_offset = offset;`
			`loop {`
			`let new_offset = document.floor_char_boundary(previous_character_offset - 1);`
			`let new_line = &document[new_offset..offset];`
			`let leading_char = new_line`
			`.chars()`
			`.next()`
			`.expect("Impossible to not have at least 1 character to read.");`
			`if "\r\n".contains(leading_char) \|\| new_offset == 0 {`
			`break;`
			`}`
			`previous_character_offset = new_offset;`
			`}`
			`Some(&document[previous_character_offset..offset])`
			`}`

Move some functions into util. 2023-03-25 11:25:10 -04:00			`/// Check if the child string slice is a slice of the parent string slice.`
			`fn is_slice_of(parent: &str, child: &str) -> bool {`
			`let parent_start = parent.as_ptr() as usize;`
			`let parent_end = parent_start + parent.len();`
			`let child_start = child.as_ptr() as usize;`
			`let child_end = child_start + child.len();`
			`child_start >= parent_start && child_end <= parent_end`
			`}`

			`/// Get a slice of the string that was consumed in a parser using the original input to the parser and the remaining input after the parser.`
			`pub fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str {`
			`assert!(is_slice_of(input, remaining));`
			`let source = {`
			`let offset = remaining.as_ptr() as usize - input.as_ptr() as usize;`
			`&input[..offset]`
			`};`
			`source`
			`}`

Consume the trailing whitespace after a headline. 2023-03-25 11:59:19 -04:00			`/// A line containing only whitespace and then a line break`
			`///`
			`/// It is up to the caller to ensure this is called at the start of a line.`
Identified the problem. The issue is plain text is eating the line break so paragraph is failing since it expects a line break at the end. 2023-03-27 18:08:17 -04:00			`#[tracing::instrument(ret, level = "debug")]`
Consume the trailing whitespace after a headline. 2023-03-25 11:59:19 -04:00			`pub fn blank_line(input: &str) -> Res<&str, &str> {`
			`not(eof)(input)?;`
			`recognize(tuple((space0, alt((line_ending, eof)))))(input)`
			`}`

Move trailing whitespace parsing to a separate element. I still need to parse the line break at the end of elements. 2023-03-31 11:16:37 -04:00			`#[tracing::instrument(ret, level = "debug")]`
			`pub fn element_trailing_whitespace<'r, 's>(`
			`context: Context<'r, 's>,`
			`input: &'s str,`
			`) -> Res<&'s str, &'s str> {`
			`start_of_line(context, input)?;`
			`alt((eof, recognize(many0(blank_line))))(input)`
			`}`

Do not consume trailing whitespace if the parent exit matcher is matching. 2023-04-10 13:13:11 -04:00			`#[tracing::instrument(ret, level = "debug")]`
			`pub fn maybe_consume_trailing_whitespace_if_not_exiting<'r, 's>(`
			`context: Context<'r, 's>,`
			`input: &'s str,`
			`) -> Res<&'s str, Option<&'s str>> {`
			`if context.should_consume_trailing_whitespace() && exit_matcher_parser(context, input).is_err()`
			`{`
			`Ok(opt(parser_with_context!(element_trailing_whitespace)(`
			`context,`
			`))(input)?)`
			`} else {`
			`Ok((input, None))`
			`}`
			`}`

Footnote definitions are parsing on their own. 2023-04-10 11:50:43 -04:00			`#[tracing::instrument(ret, level = "debug")]`
			`pub fn maybe_consume_trailing_whitespace<'r, 's>(`
			`context: Context<'r, 's>,`
			`input: &'s str,`
			`) -> Res<&'s str, Option<&'s str>> {`
			`if context.should_consume_trailing_whitespace() {`
			`Ok(opt(parser_with_context!(element_trailing_whitespace)(`
			`context,`
			`))(input)?)`
			`} else {`
			`Ok((input, None))`
			`}`
			`}`

Identified the problem. The issue is plain text is eating the line break so paragraph is failing since it expects a line break at the end. 2023-03-27 18:08:17 -04:00			`#[tracing::instrument(ret, level = "debug")]`
Consume the trailing whitespace after a headline. 2023-03-25 11:59:19 -04:00			`pub fn trailing_whitespace(input: &str) -> Res<&str, &str> {`
			`alt((eof, recognize(tuple((line_ending, many0(blank_line))))))(input)`
			`}`

Building the plain list item context. 2023-03-25 14:10:22 -04:00			`/// Check that we are at the start of a line`
Identified the problem. The issue is plain text is eating the line break so paragraph is failing since it expects a line break at the end. 2023-03-27 18:08:17 -04:00			`#[tracing::instrument(ret, level = "debug")]`
Building the plain list item context. 2023-03-25 14:10:22 -04:00			`pub fn start_of_line<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, ()> {`
			`let document_root = context.get_document_root().unwrap();`
			`let preceding_character = get_one_before(document_root, input)`
			`.map(\|slice\| slice.chars().next())`
			`.flatten();`
			`match preceding_character {`
			`Some('\n') => {}`
			`Some(_) => {`
			`// Not at start of line, cannot be a heading`
			`return Err(nom::Err::Error(CustomError::MyError(MyError(`
			`"Not at start of line",`
			`))));`
			`}`
			`// If None, we are at the start of the file which allows for headings`
			`None => {}`
			`};`
			`Ok((input, ()))`
			`}`

Make sure text markup doesn't have interior spaces. 2023-04-22 22:34:37 -04:00			`/// Check that we are at the start of a line`
			`#[tracing::instrument(ret, level = "debug")]`
			`pub fn preceded_by_whitespace<'r, 's>(`
			`context: Context<'r, 's>,`
			`input: &'s str,`
			`) -> Res<&'s str, ()> {`
			`let document_root = context.get_document_root().unwrap();`
			`let preceding_character = get_one_before(document_root, input)`
			`.map(\|slice\| slice.chars().next())`
			`.flatten();`
			`match preceding_character {`
			`Some('\n') \| Some('\r') \| Some(' ') \| Some('\t') => {}`
			`// If None, we are at the start of the file which is not allowed`
			`None \| Some(_) => {`
			`return Err(nom::Err::Error(CustomError::MyError(MyError(`
			`"Not preceded by whitespace.",`
			`))));`
			`}`
			`};`
			`Ok((input, ()))`
			`}`

Building the plain list item context. 2023-03-25 14:10:22 -04:00			`/// Pull one non-whitespace character.`
			`///`
			`/// This function only operates on spaces, tabs, carriage returns, and line feeds. It does not handle fancy unicode whitespace.`
Identified the problem. The issue is plain text is eating the line break so paragraph is failing since it expects a line break at the end. 2023-03-27 18:08:17 -04:00			`#[tracing::instrument(ret, level = "debug")]`
Building the plain list item context. 2023-03-25 14:10:22 -04:00			`pub fn non_whitespace_character(input: &str) -> Res<&str, char> {`
			`none_of(" \t\r\n")(input)`
			`}`

I seem to have solved the infinite loop issue by moving the exit check into the plain list parser. 2023-03-27 12:52:49 -04:00			`/// Check that we are at the start of a line`
Instrument the code. 2023-03-27 15:08:29 -04:00			`#[tracing::instrument(ret, level = "debug")]`
I seem to have solved the infinite loop issue by moving the exit check into the plain list parser. 2023-03-27 12:52:49 -04:00			`pub fn exit_matcher_parser<'r, 's>(`
			`context: Context<'r, 's>,`
			`input: &'s str,`
			`) -> Res<&'s str, &'s str> {`
			`peek(\|i\| context.check_exit_matcher(i))(input)`
			`}`

Identified the problem. The issue is plain text is eating the line break so paragraph is failing since it expects a line break at the end. 2023-03-27 18:08:17 -04:00			`#[tracing::instrument(ret, level = "debug")]`
			`pub fn always_fail<'r, 's>(_context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> {`
			`Err(nom::Err::Error(CustomError::MyError(MyError(`
			`"Always fail",`
			`))))`
			`}`

Add initial implementation of footnote definition. 2023-04-07 17:14:44 -04:00			`#[tracing::instrument(ret, level = "debug")]`
			`pub fn whitespace_eof(input: &str) -> Res<&str, &str> {`
			`recognize(tuple((multispace0, eof)))(input)`
			`}`

Switch to using plain text with no additional exit matcher added. 2023-04-22 22:06:34 -04:00			`#[tracing::instrument(ret, level = "debug")]`
			`pub fn text_until_exit<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> {`
			`recognize(verify(`
			`many_till(anychar, parser_with_context!(exit_matcher_parser)(context)),`
			`\|(children, _exit_contents)\| !children.is_empty(),`
			`))(input)`
			`}`

Add a not yet implemented function. This helps when creating new parsers since todo!()s will panic the whole parser. 2023-04-24 20:08:12 -04:00			`#[allow(dead_code)]`
			`pub fn not_yet_implemented() -> Res<&'static str, ()> {`
			`return Err(nom::Err::Error(CustomError::MyError(MyError(`
			`"Not implemented yet.",`
			`))));`
			`}`

Move some functions into util. 2023-03-25 11:25:10 -04:00			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`

			`#[test]`
			`fn get_one_before_unicode() {`
			`let input = "🧡💛💚💙💜";`
			`let (green_heart_index, _) = input.char_indices().skip(2).next().unwrap();`
			`let starting_with_green_heart = &input[green_heart_index..];`
			`let yellow_heart = get_one_before(input, starting_with_green_heart).unwrap();`
			`assert!(is_slice_of(input, yellow_heart));`
			`assert_eq!(yellow_heart, "💛");`
			`}`
			`}`