organic/src/parser/util.rs

230 lines
8.0 KiB
Rust
Raw Normal View History

use nom::branch::alt;
use nom::character::complete::line_ending;
use nom::character::complete::multispace0;
2023-03-25 18:10:22 +00:00
use nom::character::complete::none_of;
use nom::character::complete::space0;
use nom::combinator::eof;
use nom::combinator::not;
use nom::combinator::opt;
use nom::combinator::peek;
use nom::combinator::recognize;
use nom::multi::many0;
use nom::sequence::tuple;
use super::parser_context::ContextElement;
use super::Context;
use crate::error::CustomError;
use crate::error::MyError;
use crate::error::Res;
use crate::parser::parser_with_context::parser_with_context;
/// The ASCII letters (lower- and upper-case) followed by the ASCII digits, as one string.
///
/// NOTE(review): judging by the name these are the characters considered part of a "word";
/// the code that consumes this constant is outside this file — confirm against its callers.
pub const WORD_CONSTITUENT_CHARACTERS: &str =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
2022-12-18 08:18:43 +00:00
/// Check if we are below a section of the given section type regardless of depth
2023-04-21 22:22:17 +00:00
#[allow(dead_code)]
pub fn in_section<'r, 's, 'x>(context: Context<'r, 's>, section_name: &'x str) -> bool {
2022-12-18 08:18:43 +00:00
for thing in context.iter() {
match thing.get_data() {
ContextElement::Context(name) if *name == section_name => return true,
_ => {}
}
}
false
}
/// Report whether the first section found in the context is named `section_name`.
///
/// Elements yielded by `context.iter()` that are not `ContextElement::Context` are skipped.
/// The first `ContextElement::Context` encountered decides the result: `true` when its name
/// equals `section_name`, `false` otherwise. Returns `false` when there is no such element.
pub fn immediate_in_section<'r, 's, 'x>(context: Context<'r, 's>, section_name: &'x str) -> bool {
    for node in context.iter() {
        if let ContextElement::Context(name) = node.get_data() {
            // Only the first section matters; anything after it is never inspected.
            return *name == section_name;
        }
    }
    false
}
2023-03-25 15:25:10 +00:00
/// Get one character from before the current position.
pub fn get_one_before<'s>(document: &'s str, current_position: &'s str) -> Option<&'s str> {
assert!(is_slice_of(document, current_position));
if document.as_ptr() as usize == current_position.as_ptr() as usize {
return None;
}
let offset = current_position.as_ptr() as usize - document.as_ptr() as usize;
let previous_character_offset = document.floor_char_boundary(offset - 1);
Some(&document[previous_character_offset..offset])
}
/// Check if the child string slice is a slice of the parent string slice.
///
/// The comparison is purely by memory address: `child` must start at or after the first
/// byte of `parent` and end at or before the end of `parent`. The contents of the strings
/// are never compared.
fn is_slice_of(parent: &str, child: &str) -> bool {
    let parent_bytes = parent.as_bytes().as_ptr_range();
    let child_bytes = child.as_bytes().as_ptr_range();
    parent_bytes.start <= child_bytes.start && child_bytes.end <= parent_bytes.end
}
/// Get the part of `input` that a parser consumed.
///
/// `input` is what the parser was given and `remaining` is what it left over; the result is
/// the prefix of `input` that ends where `remaining` begins.
///
/// # Panics
///
/// Panics if `remaining` does not lie within `input`.
pub fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str {
    assert!(is_slice_of(input, remaining));
    let consumed_len = remaining.as_ptr() as usize - input.as_ptr() as usize;
    &input[..consumed_len]
}
/// Match a line made up solely of spaces/tabs, terminated by a line ending or end-of-input.
///
/// Input that is already at end-of-input is rejected, so an empty input is not a blank line.
/// The caller is responsible for only invoking this at the start of a line.
#[tracing::instrument(ret, level = "debug")]
pub fn blank_line(input: &str) -> Res<&str, &str> {
    // `not(eof)` consumes nothing; it only rejects input that is already exhausted, so the
    // recognised text is exactly the whitespace plus its terminator.
    recognize(tuple((not(eof), space0, alt((line_ending, eof)))))(input)
}
/// Match the blank lines that follow an element.
///
/// Fails with the error from `start_of_line` unless `input` begins a line. Otherwise matches
/// either end-of-input or a run of zero or more `blank_line`s and returns the matched text.
#[tracing::instrument(ret, level = "debug")]
pub fn element_trailing_whitespace<'r, 's>(
    context: Context<'r, 's>,
    input: &'s str,
) -> Res<&'s str, &'s str> {
    // Only a position check: `start_of_line` hands `input` back unchanged on success.
    start_of_line(context, input)?;
    let blank_lines = recognize(many0(blank_line));
    alt((eof, blank_lines))(input)
}
/// Optionally consume an element's trailing blank lines, unless the exit matcher matches here.
///
/// Returns `(input, None)` untouched when the context says trailing whitespace should not be
/// consumed, or when `exit_matcher_parser` succeeds at `input`. Otherwise makes an optional
/// attempt at `element_trailing_whitespace`.
#[tracing::instrument(ret, level = "debug")]
pub fn maybe_consume_trailing_whitespace_if_not_exiting<'r, 's>(
    context: Context<'r, 's>,
    input: &'s str,
) -> Res<&'s str, Option<&'s str>> {
    if !context.should_consume_trailing_whitespace() {
        return Ok((input, None));
    }
    // Checked second so the exit matcher only runs when consumption is enabled at all.
    if exit_matcher_parser(context, input).is_ok() {
        return Ok((input, None));
    }
    opt(parser_with_context!(element_trailing_whitespace)(context))(input)
}
/// Optionally consume an element's trailing blank lines.
///
/// Returns `(input, None)` untouched when the context says trailing whitespace should not be
/// consumed. Otherwise makes an optional attempt at `element_trailing_whitespace`.
#[tracing::instrument(ret, level = "debug")]
pub fn maybe_consume_trailing_whitespace<'r, 's>(
    context: Context<'r, 's>,
    input: &'s str,
) -> Res<&'s str, Option<&'s str>> {
    if !context.should_consume_trailing_whitespace() {
        return Ok((input, None));
    }
    opt(parser_with_context!(element_trailing_whitespace)(context))(input)
}
/// Match end-of-input, or a line ending followed by zero or more blank lines.
///
/// The matched text is returned as the parser's output.
#[tracing::instrument(ret, level = "debug")]
pub fn trailing_whitespace(input: &str) -> Res<&str, &str> {
    let line_break_then_blank_lines = recognize(tuple((line_ending, many0(blank_line))));
    alt((eof, line_break_then_blank_lines))(input)
}
2023-03-25 18:10:22 +00:00
/// Check that we are at the start of a line
#[tracing::instrument(ret, level = "debug")]
2023-03-25 18:10:22 +00:00
pub fn start_of_line<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, ()> {
let document_root = context.get_document_root().unwrap();
let preceding_character = get_one_before(document_root, input)
.map(|slice| slice.chars().next())
.flatten();
match preceding_character {
Some('\n') => {}
Some(_) => {
// Not at start of line, cannot be a heading
return Err(nom::Err::Error(CustomError::MyError(MyError(
"Not at start of line",
))));
}
// If None, we are at the start of the file which allows for headings
None => {}
};
Ok((input, ()))
}
/// Pull one non-whitespace character.
///
/// This function only operates on spaces, tabs, carriage returns, and line feeds. It does not handle fancy unicode whitespace.
#[tracing::instrument(ret, level = "debug")]
2023-03-25 18:10:22 +00:00
pub fn non_whitespace_character(input: &str) -> Res<&str, char> {
none_of(" \t\r\n")(input)
}
/// Check that we are at the start of a line
2023-03-27 19:08:29 +00:00
#[tracing::instrument(ret, level = "debug")]
pub fn exit_matcher_parser<'r, 's>(
context: Context<'r, 's>,
input: &'s str,
) -> Res<&'s str, &'s str> {
peek(|i| context.check_exit_matcher(i))(input)
}
#[tracing::instrument(ret, level = "debug")]
pub fn always_fail<'r, 's>(_context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> {
Err(nom::Err::Error(CustomError::MyError(MyError(
"Always fail",
))))
}
/// Walk backwards unconsuming blank lines and line endings.
///
/// List items are a special case where the trailing blank lines do not belong to it, unlike all other elements. Rather than write that special logic into each child parser, this just walks backwards through the consumed input to unconsume trailing blank lines and line breaks.
2023-03-27 23:19:51 +00:00
#[tracing::instrument(ret, level = "debug")]
pub fn regurgitate<'s>(input: &'s str, remaining: &'s str) -> &'s str {
assert!(is_slice_of(input, remaining));
let mut offset = remaining.as_ptr() as usize - input.as_ptr() as usize;
let source = &input[..offset];
let mut char_indices = source.char_indices().rev();
loop {
match char_indices.next() {
Some((off, chr)) => {
if chr == '\n' {
offset = off;
} else if chr != ' ' && chr != '\t' {
return &input[offset..];
}
}
None => {
// It was all whitespace, so return the full input string
return input;
}
};
}
}
/// Match the rest of the input when it is nothing but (possibly zero) whitespace up to end-of-input.
///
/// The matched whitespace is returned as the parser's output.
#[tracing::instrument(ret, level = "debug")]
pub fn whitespace_eof(input: &str) -> Res<&str, &str> {
    let whitespace_until_end = tuple((multispace0, eof));
    recognize(whitespace_until_end)(input)
}
2023-03-25 15:25:10 +00:00
#[cfg(test)]
mod tests {
    use super::*;

    /// `get_one_before` must step back over a whole multi-byte character, not a single byte.
    #[test]
    fn get_one_before_unicode() {
        let input = "🧡💛💚💙💜";
        // Locate the split point by searching rather than by a hard-coded character count.
        let green_heart_index = input.find('💚').expect("input contains a green heart");
        let starting_with_green_heart = &input[green_heart_index..];
        let yellow_heart = get_one_before(input, starting_with_green_heart).unwrap();
        assert!(is_slice_of(input, yellow_heart));
        assert_eq!(yellow_heart, "💛");
    }

    /// `regurgitate` must hand back the trailing blank lines even when the content before
    /// them is made of multi-byte characters.
    #[test]
    fn regurgitate_unicode() {
        let input = "🧡💛\n\t \t \n\n💚💙💜";
        // Searching for the heart keeps the test valid however much whitespace precedes it;
        // a fixed character index silently goes out of range when the literal changes.
        let green_heart_index = input.find('💚').expect("input contains a green heart");
        let starting_with_green_heart = &input[green_heart_index..];
        let after_yellow = regurgitate(input, starting_with_green_heart);
        assert!(is_slice_of(input, after_yellow));
        assert_eq!(after_yellow, "\n\t \t \n\n💚💙💜");
    }
}