organic/src/parser/util.rs

260 lines
8.8 KiB
Rust
Raw Normal View History

use nom::branch::alt;
use nom::character::complete::anychar;
use nom::character::complete::line_ending;
use nom::character::complete::multispace0;
2023-03-25 18:10:22 +00:00
use nom::character::complete::none_of;
use nom::character::complete::space0;
use nom::combinator::eof;
use nom::combinator::not;
use nom::combinator::opt;
use nom::combinator::peek;
use nom::combinator::recognize;
use nom::combinator::verify;
use nom::multi::many0;
use nom::multi::many_till;
use nom::sequence::tuple;
use super::parser_context::ContextElement;
use super::Context;
use crate::error::CustomError;
use crate::error::MyError;
use crate::error::Res;
use crate::parser::parser_with_context::parser_with_context;
/// The ASCII letters (lowercase then uppercase) followed by the decimal digits, as one string.
///
/// NOTE(review): presumably the set of characters the parser treats as part of a "word";
/// no use of this constant is visible in this file — confirm against its callers.
pub const WORD_CONSTITUENT_CHARACTERS: &str =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
2022-12-18 08:18:43 +00:00
/// Check if we are below a section of the given section type regardless of depth
2023-04-21 22:22:17 +00:00
#[allow(dead_code)]
pub fn in_section<'r, 's, 'x>(context: Context<'r, 's>, section_name: &'x str) -> bool {
2022-12-18 08:18:43 +00:00
for thing in context.iter() {
match thing.get_data() {
ContextElement::Context(name) if *name == section_name => return true,
_ => {}
}
}
false
}
/// Reports whether the first section found in the context is named `section_name`.
///
/// Elements yielded by `context.iter()` are scanned in order. Anything that is not a
/// `ContextElement::Context` is skipped; the first `ContextElement::Context` encountered
/// decides the answer (`true` if its name equals `section_name`, `false` if it does not).
///
/// Returns `false` when the context contains no `ContextElement::Context` at all.
pub fn immediate_in_section<'r, 's, 'x>(context: Context<'r, 's>, section_name: &'x str) -> bool {
    context
        .iter()
        .find_map(|node| match node.get_data() {
            // The first section we meet settles the question either way.
            ContextElement::Context(name) => Some(*name == section_name),
            _ => None,
        })
        .unwrap_or(false)
}
2023-03-25 15:25:10 +00:00
/// Get one character from before the current position.
pub fn get_one_before<'s>(document: &'s str, current_position: &'s str) -> Option<&'s str> {
assert!(is_slice_of(document, current_position));
if document.as_ptr() as usize == current_position.as_ptr() as usize {
return None;
}
let offset = current_position.as_ptr() as usize - document.as_ptr() as usize;
let previous_character_offset = document.floor_char_boundary(offset - 1);
Some(&document[previous_character_offset..offset])
}
/// Get the line current_position is on up until current_position
pub fn get_current_line_before_position<'s>(
document: &'s str,
current_position: &'s str,
) -> Option<&'s str> {
assert!(is_slice_of(document, current_position));
if document.as_ptr() as usize == current_position.as_ptr() as usize {
return None;
}
let offset = current_position.as_ptr() as usize - document.as_ptr() as usize;
let mut previous_character_offset = offset;
loop {
let new_offset = document.floor_char_boundary(previous_character_offset - 1);
let new_line = &document[new_offset..offset];
let leading_char = new_line
.chars()
.next()
.expect("Impossible to not have at least 1 character to read.");
if "\r\n".contains(leading_char) || new_offset == 0 {
break;
}
previous_character_offset = new_offset;
}
Some(&document[previous_character_offset..offset])
}
2023-03-25 15:25:10 +00:00
/// Check whether `child` occupies memory that lies entirely within `parent`.
///
/// Only the address ranges of the two slices are compared: the result is `true` when
/// `child` starts at or after the start of `parent` and ends at or before its end. The
/// contents of the strings are never inspected.
fn is_slice_of(parent: &str, child: &str) -> bool {
    let parent_span = parent.as_bytes().as_ptr_range();
    let child_span = child.as_bytes().as_ptr_range();
    parent_span.start <= child_span.start && child_span.end <= parent_span.end
}
/// Get the text a parser consumed, given what it started with and what it left over.
///
/// `input` is the original input handed to the parser and `remaining` is the unparsed rest;
/// `remaining` must be a sub-slice of `input`. The result is everything in `input` that
/// comes before the start of `remaining`.
///
/// # Panics
///
/// Panics if `remaining` does not lie inside `input`.
pub fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str {
    assert!(is_slice_of(input, remaining));
    let consumed_length = remaining.as_ptr() as usize - input.as_ptr() as usize;
    &input[..consumed_length]
}
/// Match one blank line: optional spaces/tabs followed by a line break or the end of input.
///
/// The parser refuses to match when `input` is already at end of input, so it never
/// succeeds without consuming something. On success the whole matched text is returned.
///
/// The caller is responsible for making sure this is invoked at the start of a line.
#[tracing::instrument(ret, level = "debug")]
pub fn blank_line(input: &str) -> Res<&str, &str> {
    // Bail out at end of input before attempting the (otherwise empty) match.
    not(eof)(input)?;
    let line_end = alt((line_ending, eof));
    recognize(tuple((space0, line_end)))(input)
}
/// Match the whitespace that trails an element: end of input, or a run of blank lines.
///
/// Fails unless `input` is at the start of a line (as judged by `start_of_line`). After
/// that check, either end of input or zero or more `blank_line`s are matched and the
/// matched text is returned.
#[tracing::instrument(ret, level = "debug")]
pub fn element_trailing_whitespace<'r, 's>(
    context: Context<'r, 's>,
    input: &'s str,
) -> Res<&'s str, &'s str> {
    start_of_line(context, input)?;
    let blank_lines = recognize(many0(blank_line));
    alt((eof, blank_lines))(input)
}
/// Optionally consume an element's trailing whitespace, unless an exit matcher fires here.
///
/// Nothing is consumed (and `None` is returned alongside the untouched `input`) when the
/// context says trailing whitespace should not be consumed, or when `exit_matcher_parser`
/// succeeds at `input`. Otherwise `element_trailing_whitespace` is attempted; its output is
/// returned as `Some(..)` on a match and `None` when it does not match.
#[tracing::instrument(ret, level = "debug")]
pub fn maybe_consume_trailing_whitespace_if_not_exiting<'r, 's>(
    context: Context<'r, 's>,
    input: &'s str,
) -> Res<&'s str, Option<&'s str>> {
    // The exit matcher is only consulted when the context wants whitespace consumed.
    if !context.should_consume_trailing_whitespace()
        || exit_matcher_parser(context, input).is_ok()
    {
        return Ok((input, None));
    }
    let (remaining, whitespace) =
        opt(parser_with_context!(element_trailing_whitespace)(context))(input)?;
    Ok((remaining, whitespace))
}
/// Optionally consume an element's trailing whitespace when the context asks for it.
///
/// When the context says trailing whitespace should not be consumed, `input` is returned
/// untouched together with `None`. Otherwise `element_trailing_whitespace` is attempted;
/// its output is returned as `Some(..)` on a match and `None` when it does not match.
#[tracing::instrument(ret, level = "debug")]
pub fn maybe_consume_trailing_whitespace<'r, 's>(
    context: Context<'r, 's>,
    input: &'s str,
) -> Res<&'s str, Option<&'s str>> {
    if !context.should_consume_trailing_whitespace() {
        return Ok((input, None));
    }
    let (remaining, whitespace) =
        opt(parser_with_context!(element_trailing_whitespace)(context))(input)?;
    Ok((remaining, whitespace))
}
/// Match end of input, or a line break followed by any number of blank lines.
///
/// Returns the matched text.
#[tracing::instrument(ret, level = "debug")]
pub fn trailing_whitespace(input: &str) -> Res<&str, &str> {
    let line_break_then_blank_lines = recognize(tuple((line_ending, many0(blank_line))));
    alt((eof, line_break_then_blank_lines))(input)
}
2023-03-25 18:10:22 +00:00
/// Check that we are at the start of a line
#[tracing::instrument(ret, level = "debug")]
2023-03-25 18:10:22 +00:00
pub fn start_of_line<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, ()> {
let document_root = context.get_document_root().unwrap();
let preceding_character = get_one_before(document_root, input)
.map(|slice| slice.chars().next())
.flatten();
match preceding_character {
Some('\n') => {}
Some(_) => {
// Not at start of line, cannot be a heading
return Err(nom::Err::Error(CustomError::MyError(MyError(
"Not at start of line",
))));
}
// If None, we are at the start of the file which allows for headings
None => {}
};
Ok((input, ()))
}
/// Check that we are at the start of a line
#[tracing::instrument(ret, level = "debug")]
pub fn preceded_by_whitespace<'r, 's>(
context: Context<'r, 's>,
input: &'s str,
) -> Res<&'s str, ()> {
let document_root = context.get_document_root().unwrap();
let preceding_character = get_one_before(document_root, input)
.map(|slice| slice.chars().next())
.flatten();
match preceding_character {
Some('\n') | Some('\r') | Some(' ') | Some('\t') => {}
// If None, we are at the start of the file which is not allowed
None | Some(_) => {
return Err(nom::Err::Error(CustomError::MyError(MyError(
"Not preceded by whitespace.",
))));
}
};
Ok((input, ()))
}
2023-03-25 18:10:22 +00:00
/// Pull one non-whitespace character.
///
/// This function only operates on spaces, tabs, carriage returns, and line feeds. It does not handle fancy unicode whitespace.
#[tracing::instrument(ret, level = "debug")]
2023-03-25 18:10:22 +00:00
pub fn non_whitespace_character(input: &str) -> Res<&str, char> {
none_of(" \t\r\n")(input)
}
/// Check that we are at the start of a line
2023-03-27 19:08:29 +00:00
#[tracing::instrument(ret, level = "debug")]
pub fn exit_matcher_parser<'r, 's>(
context: Context<'r, 's>,
input: &'s str,
) -> Res<&'s str, &'s str> {
peek(|i| context.check_exit_matcher(i))(input)
}
#[tracing::instrument(ret, level = "debug")]
pub fn always_fail<'r, 's>(_context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> {
Err(nom::Err::Error(CustomError::MyError(MyError(
"Always fail",
))))
}
/// Match any amount of whitespace (including none) that runs up to the end of input.
///
/// Returns the matched whitespace; fails if anything else appears before the end of input.
#[tracing::instrument(ret, level = "debug")]
pub fn whitespace_eof(input: &str) -> Res<&str, &str> {
    let whitespace_then_end = tuple((multispace0, eof));
    recognize(whitespace_then_end)(input)
}
/// Consume characters one at a time until the context's exit matcher succeeds.
///
/// At least one character must be consumed before the exit matcher fires, otherwise the
/// parser fails. The returned text is whatever the combined parser matched.
#[tracing::instrument(ret, level = "debug")]
pub fn text_until_exit<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> {
    let characters_until_exit =
        many_till(anychar, parser_with_context!(exit_matcher_parser)(context));
    // Reject the match when the exit matcher fired before any character was taken.
    let at_least_one_character = verify(characters_until_exit, |(children, _exit_contents)| {
        !children.is_empty()
    });
    recognize(at_least_one_character)(input)
}
#[allow(dead_code)]
pub fn not_yet_implemented() -> Res<&'static str, ()> {
return Err(nom::Err::Error(CustomError::MyError(MyError(
"Not implemented yet.",
))));
}
2023-03-25 15:25:10 +00:00
#[cfg(test)]
mod tests {
    use super::*;

    /// `get_one_before` must return a whole multi-byte character, sliced from the document.
    #[test]
    fn get_one_before_unicode() {
        let hearts = "🧡💛💚💙💜";
        let green_heart_offset = hearts
            .char_indices()
            .nth(2)
            .map(|(offset, _)| offset)
            .unwrap();
        let from_green_heart = &hearts[green_heart_offset..];
        let preceding = get_one_before(hearts, from_green_heart).unwrap();
        assert!(is_slice_of(hearts, preceding));
        assert_eq!(preceding, "💛");
    }
}