diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..b90d38b5 --- /dev/null +++ b/Makefile @@ -0,0 +1,25 @@ +SHELL := bash +.ONESHELL: +.SHELLFLAGS := -eu -o pipefail -c +.DELETE_ON_ERROR: +MAKEFLAGS += --warn-undefined-variables +MAKEFLAGS += --no-builtin-rules + +ifeq ($(origin .RECIPEPREFIX), undefined) + $(error This Make does not support .RECIPEPREFIX. Please use GNU Make 4.0 or later) +endif +.RECIPEPREFIX = > + +.PHONY: build +build: target/debug/toy + +.PHONY: clean +clean: +> cargo clean + +target/debug/toy: +> cargo build + +.PHONY: jaeger +jaeger: +> docker run -d --rm -p 6831:6831/udp -p 6832:6832/udp -p 16686:16686 -p 14268:14268 jaegertracing/all-in-one:latest diff --git a/language_rules.txt b/language_rules.txt deleted file mode 100644 index 11441967..00000000 --- a/language_rules.txt +++ /dev/null @@ -1 +0,0 @@ -Two line breaks to end paragraph except in code blocks diff --git a/org_mode_samples/exit_matcher_investigation/bold_with_asterisk_inside.org b/org_mode_samples/exit_matcher_investigation/bold_with_asterisk_inside.org new file mode 100644 index 00000000..d990b435 --- /dev/null +++ b/org_mode_samples/exit_matcher_investigation/bold_with_asterisk_inside.org @@ -0,0 +1 @@ +foo *bar baz * lorem* ipsum diff --git a/org_mode_samples/paragraphs/Makefile b/org_mode_samples/paragraphs/Makefile new file mode 100644 index 00000000..c47a86c1 --- /dev/null +++ b/org_mode_samples/paragraphs/Makefile @@ -0,0 +1,23 @@ +SHELL := bash +.ONESHELL: +.SHELLFLAGS := -eu -o pipefail -c +.DELETE_ON_ERROR: +MAKEFLAGS += --warn-undefined-variables +MAKEFLAGS += --no-builtin-rules +SRCFILES := $(wildcard *.org) +OUTFILES := $(patsubst %.org,%.tree.txt,$(SRCFILES)) + +ifeq ($(origin .RECIPEPREFIX), undefined) + $(error This Make does not support .RECIPEPREFIX. Please use GNU Make 4.0 or later) +endif +.RECIPEPREFIX = > + +.PHONY: all +all: $(OUTFILES) + +.PHONY: clean +clean: +> rm -rf $(OUTFILES) + +%.tree.txt: %.org ../common.el ../dump_org_ast.bash +> ../dump_org_ast.bash $< $@ diff --git a/org_mode_samples/paragraphs/paragraph_with_backslash_line_breaks.org b/org_mode_samples/paragraphs/paragraph_with_backslash_line_breaks.org new file mode 100644 index 00000000..12ce1b7d --- /dev/null +++ b/org_mode_samples/paragraphs/paragraph_with_backslash_line_breaks.org @@ -0,0 +1,7 @@ +This is a paragraph + +This is another paragraph +This is a second line in that paragraph + +This is a third paragraph \\ +This is a second line in that paragraph diff --git a/org_mode_samples/sections_and_headings/Makefile b/org_mode_samples/sections_and_headings/Makefile new file mode 100644 index 00000000..c47a86c1 --- /dev/null +++ b/org_mode_samples/sections_and_headings/Makefile @@ -0,0 +1,23 @@ +SHELL := bash +.ONESHELL: +.SHELLFLAGS := -eu -o pipefail -c +.DELETE_ON_ERROR: +MAKEFLAGS += --warn-undefined-variables +MAKEFLAGS += --no-builtin-rules +SRCFILES := $(wildcard *.org) +OUTFILES := $(patsubst %.org,%.tree.txt,$(SRCFILES)) + +ifeq ($(origin .RECIPEPREFIX), undefined) + $(error This Make does not support .RECIPEPREFIX. 
Please use GNU Make 4.0 or later) +endif +.RECIPEPREFIX = > + +.PHONY: all +all: $(OUTFILES) + +.PHONY: clean +clean: +> rm -rf $(OUTFILES) + +%.tree.txt: %.org ../common.el ../dump_org_ast.bash +> ../dump_org_ast.bash $< $@ diff --git a/org_mode_samples/sections_and_headings/immediate_heading.org b/org_mode_samples/sections_and_headings/immediate_heading.org new file mode 100644 index 00000000..5a8e221b --- /dev/null +++ b/org_mode_samples/sections_and_headings/immediate_heading.org @@ -0,0 +1 @@ +* Start a document with an immediate heading diff --git a/org_mode_samples/sections_and_headings/sections_and_headings.org b/org_mode_samples/sections_and_headings/sections_and_headings.org new file mode 100644 index 00000000..1b49c09c --- /dev/null +++ b/org_mode_samples/sections_and_headings/sections_and_headings.org @@ -0,0 +1,7 @@ +Before the first heading +* The first heading +body of the first section +** Child heading +body of child heading +* second top-level heading +body of second top-level heading diff --git a/src/main.rs b/src/main.rs index a50a86c3..6906a5b5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,4 @@ +#![feature(round_char_boundary)] use crate::parser::document; use tracing::Level; use tracing_subscriber::fmt::format::FmtSpan; diff --git a/src/parser/bold.rs b/src/parser/bold.rs index a8dc00aa..004e7f02 100644 --- a/src/parser/bold.rs +++ b/src/parser/bold.rs @@ -96,6 +96,9 @@ fn _preceded_by_whitespace<'r, 's>(context: Context<'r, 's>) -> bool { } ContextElement::Context(_) => {} ContextElement::ListItem(_) => {} + ContextElement::DocumentRoot(_) => { + return true; + } } } else { break; diff --git a/src/parser/combinator.rs b/src/parser/combinator.rs index 4b99f845..ceab6c81 100644 --- a/src/parser/combinator.rs +++ b/src/parser/combinator.rs @@ -1,37 +1,26 @@ -use super::parser_context::ContextElement; -use super::parser_context::PreviousElementNode; -use super::token::Token; -use super::Context; -use nom::error::ErrorKind; use nom::error::ParseError; use nom::IResult; use nom::InputLength; -pub fn context_many1<'r, 's, I, O, E, M>( +use super::Context; + +pub fn context_many1<'r: 's, 's, I, O, E, M>( context: Context<'r, 's>, mut many_matcher: M, -) -> impl FnMut(I) -> IResult>, E> + 'r +) -> impl FnMut(I) -> IResult, E> + 'r where I: Clone + InputLength, E: ParseError, M: for<'x> Fn(Context<'x, 's>, I) -> IResult + 'r, - O: Into>, { move |mut i: I| { let mut err = None; - // TODO: Can I eliminate the clone? I think this is incrementing the reference count - let mut current_context = context.clone(); - // Despite the clone, the Rc should still point to the same value. 
- assert!(current_context.ptr_eq(context)); + let mut elements: Vec = Vec::new(); loop { - match many_matcher(¤t_context, i.clone()) { + match many_matcher(&context, i.clone()) { Ok((remaining, many_elem)) => { - current_context = current_context.with_additional_node( - ContextElement::PreviousElementNode(PreviousElementNode { - element: many_elem.into(), - }), - ); i = remaining; + elements.push(many_elem); } the_error @ Err(_) => { err = Some(the_error); @@ -39,93 +28,11 @@ where } } } - let mut elements: Vec> = current_context - .into_iter_until(context) - .filter_map(|context_element| match context_element { - ContextElement::PreviousElementNode(elem) => Some(elem.element), - ContextElement::ExitMatcherNode(_) => None, - ContextElement::Context(_) => None, - ContextElement::StartOfParagraph => None, - ContextElement::ListItem(_) => None, - }) - .collect(); if elements.is_empty() { if let Some(err) = err { err?; } } - elements.reverse(); Ok((i, elements)) } } - -pub fn context_many_till<'r, 's, I, O, E, F, M, T>( - context: Context<'r, 's>, - mut many_matcher: M, - mut till_matcher: T, -) -> impl FnMut(I) -> IResult>, F), E> + 'r -where - I: Clone + InputLength, - E: ParseError, - M: for<'x> Fn(Context<'x, 's>, I) -> IResult + 'r, - T: for<'x> Fn(Context<'x, 's>, I) -> IResult + 'r, - O: Into>, -{ - move |mut i: I| { - // TODO: Can I eliminate the clone? I think this is incrementing the reference count - let mut current_context = context.clone(); - // Despite the clone, the Rc should still point to the same value, otherwise we'll get stuck in an endless loop. - assert!(current_context.ptr_eq(context)); - loop { - let len = i.input_len(); - match till_matcher(¤t_context, i.clone()) { - Ok((remaining, finish)) => { - let mut ret = Vec::new(); - while !current_context.ptr_eq(context) { - let (context_element, next_context) = current_context.pop_front(); - let context_element = context_element.expect("We only pop off context elements created in this function, so they are all Some()"); - current_context = next_context; - match context_element { - ContextElement::ExitMatcherNode(_) => {} - ContextElement::StartOfParagraph => {} - ContextElement::Context(_) => {} - ContextElement::PreviousElementNode(PreviousElementNode { - element: token, - }) => { - ret.push(token); - } - ContextElement::ListItem(_) => {} - }; - } - ret.reverse(); - return Ok((remaining, (ret, finish))); - } - Err(nom::Err::Error(_)) => { - match many_matcher(¤t_context, i.clone()) { - Err(nom::Err::Error(err)) => { - return Err(nom::Err::Error(E::append(i, ErrorKind::ManyTill, err))) - } - Err(e) => return Err(e), - Ok((remaining, many_elem)) => { - // infinite loop check: the parser must always consume - if remaining.input_len() == len { - return Err(nom::Err::Error(E::from_error_kind( - remaining, - ErrorKind::ManyTill, - ))); - } - - current_context = current_context.with_additional_node( - ContextElement::PreviousElementNode(PreviousElementNode { - element: many_elem.into(), - }), - ); - i = remaining; - } - } - } - Err(e) => return Err(e), - }; - } - } -} diff --git a/src/parser/document.rs b/src/parser/document.rs index d9571a8e..81f1a68c 100644 --- a/src/parser/document.rs +++ b/src/parser/document.rs @@ -1,26 +1,182 @@ -//! A single element of text. 
-use super::combinator::context_many1; +use nom::branch::alt; +use nom::bytes::complete::tag; +use nom::character::complete::line_ending; +use nom::character::complete::space1; +use nom::combinator::eof; +use nom::combinator::map; +use nom::combinator::not; +use nom::combinator::opt; +use nom::combinator::recognize; +use nom::combinator::verify; +use nom::multi::many0; +use nom::multi::many1; +use nom::multi::many1_count; +use nom::sequence::tuple; + +use crate::parser::element::element; +use crate::parser::error::CustomError; +use crate::parser::error::MyError; +use crate::parser::object::standard_set_object; +use crate::parser::parser_context::ChainBehavior; +use crate::parser::parser_context::ContextElement; +use crate::parser::parser_context::ContextTree; +use crate::parser::parser_context::ExitMatcherNode; + +use super::element::Element; use super::error::Res; -use super::paragraph::paragraph; -use super::parser_context::ContextTree; -use super::token::Paragraph; -use super::token::Token; +use super::object::Object; +use super::parser_with_context::parser_with_context; +use super::source::Source; +use super::util::get_consumed; +use super::util::get_one_before; +use super::util::trailing_whitespace; use super::Context; -use nom::IResult; -type UnboundMatcher<'r, 's, I, O, E> = dyn Fn(Context<'r, 's>, I) -> IResult; - -// TODO: Implement FromStr for Document - -pub fn document(input: &str) -> Res<&str, Vec> { - let initial_context: ContextTree<'_, '_> = ContextTree::new(); - let (remaining, tokens) = context_many1(&initial_context, paragraph)(input)?; - let paragraphs = tokens - .into_iter() - .map(|token| match token { - Token::TextElement(_) => unreachable!(), - Token::Paragraph(paragraph) => paragraph, - }) - .collect(); - Ok((remaining, paragraphs)) +#[derive(Debug)] +pub struct Document<'s> { + pub source: &'s str, + pub zeroth_section: Option>, + pub children: Vec>, +} + +#[derive(Debug)] +pub struct Heading<'s> { + pub source: &'s str, + pub stars: usize, + pub children: Vec>, +} + +#[derive(Debug)] +pub struct Section<'s> { + pub source: &'s str, + pub children: Vec>, +} + +#[derive(Debug)] +pub enum DocumentElement<'s> { + Heading(Heading<'s>), + Section(Section<'s>), +} + +impl<'s> Source<'s> for Document<'s> { + fn get_source(&'s self) -> &'s str { + self.source + } +} + +impl<'s> Source<'s> for DocumentElement<'s> { + fn get_source(&'s self) -> &'s str { + match self { + DocumentElement::Heading(obj) => obj.source, + DocumentElement::Section(obj) => obj.source, + } + } +} + +#[allow(dead_code)] +pub fn document(input: &str) -> Res<&str, Document> { + let initial_context: ContextTree<'_, '_> = ContextTree::new(); + let document_context = + initial_context.with_additional_node(ContextElement::DocumentRoot(input)); + let section_matcher = parser_with_context!(section)(&document_context); + let heading_matcher = parser_with_context!(heading)(&document_context); + let (remaining, zeroth_section) = opt(section_matcher)(input)?; + let (remaining, children) = many0(heading_matcher)(remaining)?; + let source = get_consumed(input, remaining); + Ok(( + remaining, + Document { + source, + zeroth_section, + children, + }, + )) +} + +fn section<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, Section<'s>> { + // TODO: The zeroth section is specialized so it probably needs its own parser + let parser_context = context + .with_additional_node(ContextElement::ExitMatcherNode(ExitMatcherNode { + exit_matcher: ChainBehavior::AndParent(Some(§ion_end)), + })) + 
.with_additional_node(ContextElement::Context("section")); + not(|i| parser_context.check_exit_matcher(i))(input)?; + let element_matcher = parser_with_context!(element)(&parser_context); + let (remaining, children) = many1(element_matcher)(input)?; + let source = get_consumed(input, remaining); + Ok((remaining, Section { source, children })) +} + +fn section_end<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> { + let headline_matcher = parser_with_context!(headline)(context); + alt((recognize(headline_matcher), eof))(input) +} + +fn heading<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, Heading<'s>> { + not(|i| context.check_exit_matcher(i))(input)?; + let (remaining, (star_count, _ws, title, _ws2)) = headline(context, input)?; + let section_matcher = parser_with_context!(section)(context); + // TODO: This needs to only match headings below the current level + let heading_matcher = parser_with_context!(heading)(context); + let (remaining, children) = many0(alt(( + map( + verify(heading_matcher, |h| h.stars > star_count), + DocumentElement::Heading, + ), + map(section_matcher, DocumentElement::Section), + )))(remaining)?; + let source = get_consumed(input, remaining); + Ok(( + remaining, + Heading { + source, + stars: star_count, + children, + }, + )) +} + +fn headline<'r, 's>( + context: Context<'r, 's>, + input: &'s str, +) -> Res<&'s str, (usize, &'s str, Vec>, &'s str)> { + let parser_context = + context.with_additional_node(ContextElement::ExitMatcherNode(ExitMatcherNode { + exit_matcher: ChainBehavior::AndParent(Some(&headline_end)), + })); + let standard_set_object_matcher = parser_with_context!(standard_set_object)(&parser_context); + let start_of_line_matcher = parser_with_context!(start_of_line)(&parser_context); + + let (remaining, (_sol, star_count, ws, title, ws2)) = tuple(( + start_of_line_matcher, + many1_count(tag("*")), + space1, + many1(standard_set_object_matcher), + trailing_whitespace, + ))(input)?; + Ok((remaining, (star_count, ws, title, ws2))) +} + +fn headline_end<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> { + alt((line_ending, eof))(input) +} + +/// Check that we are at the start of a line +fn start_of_line<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, ()> { + let document_root = context.get_document_root().unwrap(); + let preceding_character = get_one_before(document_root, input) + .map(|slice| slice.chars().next()) + .flatten(); + match preceding_character { + Some('\n') => {} + Some(_) => { + // Not at start of line, cannot be a heading + return Err(nom::Err::Error(CustomError::MyError(MyError( + "Not at start of line", + )))); + } + // If None, we are at the start of the file which allows for headings + None => {} + }; + Ok((input, ())) } diff --git a/src/parser/element.rs b/src/parser/element.rs new file mode 100644 index 00000000..3ae3d1a3 --- /dev/null +++ b/src/parser/element.rs @@ -0,0 +1,69 @@ +use nom::branch::alt; +use nom::character::complete::line_ending; +use nom::character::complete::space0; +use nom::combinator::eof; +use nom::combinator::map; +use nom::combinator::not; +use nom::combinator::recognize; +use nom::multi::many0; +use nom::multi::many1; +use nom::sequence::tuple; + +use crate::parser::object::standard_set_object; +use crate::parser::parser_context::ChainBehavior; +use crate::parser::parser_context::ContextElement; +use crate::parser::parser_context::ExitMatcherNode; +use crate::parser::parser_with_context::parser_with_context; + +use 
super::error::Res; +use super::greater_element::PlainList; +use super::lesser_element::Paragraph; +use super::source::Source; +use super::util::blank_line; +use super::util::get_consumed; +use super::util::trailing_whitespace; +use super::Context; + +#[derive(Debug)] +pub enum Element<'s> { + Paragraph(Paragraph<'s>), + PlainList(PlainList<'s>), +} + +impl<'s> Source<'s> for Element<'s> { + fn get_source(&'s self) -> &'s str { + match self { + Element::Paragraph(obj) => obj.source, + Element::PlainList(obj) => obj.source, + } + } +} + +pub fn element<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, Element<'s>> { + not(|i| context.check_exit_matcher(i))(input)?; + + let paragraph_matcher = parser_with_context!(paragraph)(context); + + map(paragraph_matcher, Element::Paragraph)(input) +} + +fn paragraph<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, Paragraph<'s>> { + let parser_context = + context.with_additional_node(ContextElement::ExitMatcherNode(ExitMatcherNode { + exit_matcher: ChainBehavior::AndParent(Some(¶graph_end)), + })); + let standard_set_object_matcher = parser_with_context!(standard_set_object)(&parser_context); + + let (remaining, children) = many1(standard_set_object_matcher)(input)?; + + let (remaining, _trailing_whitespace) = trailing_whitespace(remaining)?; + + let source = get_consumed(input, remaining); + + Ok((remaining, Paragraph { source, children })) +} + +fn paragraph_end<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> { + // TODO: Other elements should also end paragraphs + alt((recognize(tuple((line_ending, many1(blank_line)))), eof))(input) +} diff --git a/src/parser/greater_element.rs b/src/parser/greater_element.rs new file mode 100644 index 00000000..38d2a7a4 --- /dev/null +++ b/src/parser/greater_element.rs @@ -0,0 +1,4 @@ +#[derive(Debug)] +pub struct PlainList<'s> { + pub source: &'s str, +} diff --git a/src/parser/lesser_element.rs b/src/parser/lesser_element.rs new file mode 100644 index 00000000..5abc4b38 --- /dev/null +++ b/src/parser/lesser_element.rs @@ -0,0 +1,7 @@ +use super::object::Object; + +#[derive(Debug)] +pub struct Paragraph<'s> { + pub source: &'s str, + pub children: Vec>, +} diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 545ef9fc..18cb3147 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1,17 +1,20 @@ -mod bold; +// mod bold; mod combinator; mod document; +mod element; mod error; -mod link; +mod greater_element; +mod lesser_element; +// mod link; mod list; -mod paragraph; +mod object; +// mod paragraph; mod parser_context; mod parser_with_context; -mod plain_list; -mod text; -mod token; +// mod plain_list; +mod source; +// mod text; +// mod token; mod util; pub use document::document; type Context<'r, 's> = &'r parser_context::ContextTree<'r, 's>; -pub use parser_context::ContextTree; -pub use plain_list::item; diff --git a/src/parser/object.rs b/src/parser/object.rs new file mode 100644 index 00000000..aaf20240 --- /dev/null +++ b/src/parser/object.rs @@ -0,0 +1,108 @@ +use nom::combinator::map; +use nom::combinator::not; + +use crate::parser::error::CustomError; +use crate::parser::error::MyError; + +use super::error::Res; +use super::parser_with_context::parser_with_context; +use super::source::Source; +use super::Context; + +#[derive(Debug)] +pub enum Object<'s> { + TextMarkup(TextMarkup<'s>), + PlainText(PlainText<'s>), + RegularLink(RegularLink<'s>), +} + +#[derive(Debug)] +pub struct TextMarkup<'s> { + pub source: &'s str, +} + +#[derive(Debug)] 
+pub struct PlainText<'s> { + pub source: &'s str, +} + +#[derive(Debug)] +pub struct RegularLink<'s> { + pub source: &'s str, +} + +impl<'s> Source<'s> for Object<'s> { + fn get_source(&'s self) -> &'s str { + match self { + Object::TextMarkup(obj) => obj.source, + Object::PlainText(obj) => obj.source, + Object::RegularLink(obj) => obj.source, + } + } +} + +pub fn standard_set_object<'r, 's>( + context: Context<'r, 's>, + input: &'s str, +) -> Res<&'s str, Object<'s>> { + not(|i| context.check_exit_matcher(i))(input)?; + + let plain_text_matcher = parser_with_context!(plain_text)(context); + + map(plain_text_matcher, Object::PlainText)(input) +} + +fn plain_text<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, PlainText<'s>> { + if input.len() == 0 { + return Err(nom::Err::Error(CustomError::MyError(MyError( + "Zero input length to plain_text.", + )))); + } + // not(|i| context.check_exit_matcher(i))(input)?; + let mut current_input = input.char_indices(); + loop { + match current_input.next() { + Some((offset, _char)) => { + let remaining = &input[offset..]; + let exit_matcher_status = not(|i| context.check_exit_matcher(i))(remaining); + if exit_matcher_status.is_err() { + if offset == 0 { + // If we're at the start of the input, then nothing is plain text, so fire an error for zero-length match. + exit_matcher_status?; + } else { + return Ok(( + &input[offset..], + PlainText { + source: &input[..offset], + }, + )); + } + } + } + None => { + // We hit the end of the file, so all input must be plain text + return Ok((&input[input.len()..], PlainText { source: input })); + } + }; + } +} + +#[cfg(test)] +mod tests { + use crate::parser::parser_context::ContextElement; + use crate::parser::parser_context::ContextTree; + + use super::*; + + #[test] + fn plain_text_simple() { + let input = "foobarbaz"; + let initial_context: ContextTree<'_, '_> = ContextTree::new(); + let document_context = + initial_context.with_additional_node(ContextElement::DocumentRoot(input)); + let plain_text_matcher = parser_with_context!(plain_text)(&document_context); + let (remaining, result) = map(plain_text_matcher, Object::PlainText)(input).unwrap(); + assert_eq!(remaining, ""); + assert_eq!(result.get_source(), input); + } +} diff --git a/src/parser/old_combinator.rs b/src/parser/old_combinator.rs new file mode 100644 index 00000000..5cb31021 --- /dev/null +++ b/src/parser/old_combinator.rs @@ -0,0 +1,125 @@ +use super::parser_context::ContextElement; +use super::parser_context::PreviousElementNode; +use super::token::Token; +use super::Context; +use nom::error::ErrorKind; +use nom::error::ParseError; +use nom::IResult; +use nom::InputLength; + +pub fn context_many1<'r, 's, I, O, E, M>( + context: Context<'r, 's>, + mut many_matcher: M, +) -> impl FnMut(I) -> IResult>, E> + 'r +where + I: Clone + InputLength, + E: ParseError, + M: for<'x> Fn(Context<'x, 's>, I) -> IResult + 'r, + O: Into>, +{ + move |mut i: I| { + let mut err = None; + // TODO: Can I eliminate the clone? I think this is incrementing the reference count + let mut current_context = context.clone(); + // Despite the clone, the Rc should still point to the same value. 
+ assert!(current_context.ptr_eq(context)); + loop { + match many_matcher(¤t_context, i.clone()) { + Ok((remaining, many_elem)) => { + current_context = current_context.with_additional_node( + ContextElement::PreviousElementNode(PreviousElementNode { + element: many_elem.into(), + }), + ); + i = remaining; + } + the_error @ Err(_) => { + err = Some(the_error); + break; + } + } + } + let mut elements: Vec> = current_context + .into_iter_until(context) + .filter_map(|context_element| match context_element { + ContextElement::PreviousElementNode(elem) => Some(elem.element), + _ => None, + }) + .collect(); + if elements.is_empty() { + if let Some(err) = err { + err?; + } + } + elements.reverse(); + Ok((i, elements)) + } +} + +pub fn context_many_till<'r, 's, I, O, E, F, M, T>( + context: Context<'r, 's>, + mut many_matcher: M, + mut till_matcher: T, +) -> impl FnMut(I) -> IResult>, F), E> + 'r +where + I: Clone + InputLength, + E: ParseError, + M: for<'x> Fn(Context<'x, 's>, I) -> IResult + 'r, + T: for<'x> Fn(Context<'x, 's>, I) -> IResult + 'r, + O: Into>, +{ + move |mut i: I| { + // TODO: Can I eliminate the clone? I think this is incrementing the reference count + let mut current_context = context.clone(); + // Despite the clone, the Rc should still point to the same value, otherwise we'll get stuck in an endless loop. + assert!(current_context.ptr_eq(context)); + loop { + let len = i.input_len(); + match till_matcher(¤t_context, i.clone()) { + Ok((remaining, finish)) => { + let mut ret = Vec::new(); + while !current_context.ptr_eq(context) { + let (context_element, next_context) = current_context.pop_front(); + let context_element = context_element.expect("We only pop off context elements created in this function, so they are all Some()"); + current_context = next_context; + match context_element { + ContextElement::PreviousElementNode(PreviousElementNode { + element: token, + }) => { + ret.push(token); + } + _ => {} + }; + } + ret.reverse(); + return Ok((remaining, (ret, finish))); + } + Err(nom::Err::Error(_)) => { + match many_matcher(¤t_context, i.clone()) { + Err(nom::Err::Error(err)) => { + return Err(nom::Err::Error(E::append(i, ErrorKind::ManyTill, err))) + } + Err(e) => return Err(e), + Ok((remaining, many_elem)) => { + // infinite loop check: the parser must always consume + if remaining.input_len() == len { + return Err(nom::Err::Error(E::from_error_kind( + remaining, + ErrorKind::ManyTill, + ))); + } + + current_context = current_context.with_additional_node( + ContextElement::PreviousElementNode(PreviousElementNode { + element: many_elem.into(), + }), + ); + i = remaining; + } + } + } + Err(e) => return Err(e), + }; + } + } +} diff --git a/src/parser/old_document.rs b/src/parser/old_document.rs new file mode 100644 index 00000000..1c8dd04a --- /dev/null +++ b/src/parser/old_document.rs @@ -0,0 +1,29 @@ +//! A single element of text. 
+use super::combinator::context_many1; +use super::error::Res; +use super::paragraph::paragraph; +use super::parser_context::ContextElement; +use super::parser_context::ContextTree; +use super::token::Paragraph; +use super::token::Token; +use super::Context; +use nom::IResult; + +type UnboundMatcher<'r, 's, I, O, E> = dyn Fn(Context<'r, 's>, I) -> IResult; + +// TODO: Implement FromStr for Document + +pub fn document(input: &str) -> Res<&str, Vec> { + let initial_context: ContextTree<'_, '_> = ContextTree::new(); + let document_context = + initial_context.with_additional_node(ContextElement::DocumentRoot(input)); + let (remaining, tokens) = context_many1(&document_context, paragraph)(input)?; + let paragraphs = tokens + .into_iter() + .map(|token| match token { + Token::TextElement(_) => unreachable!(), + Token::Paragraph(paragraph) => paragraph, + }) + .collect(); + Ok((remaining, paragraphs)) +} diff --git a/src/parser/parser_context.rs b/src/parser/parser_context.rs index 44fc9e1a..fd2b4058 100644 --- a/src/parser/parser_context.rs +++ b/src/parser/parser_context.rs @@ -7,7 +7,6 @@ use super::error::MyError; use super::error::Res; use super::list::List; use super::list::Node; -use super::token::Token; use super::Context; type Matcher = dyn for<'r, 's> Fn(Context<'r, 's>, &'s str) -> Res<&'s str, &'s str>; @@ -90,15 +89,27 @@ impl<'r, 's> ContextTree<'r, 's> { // TODO: Make this a specific error instead of just a generic MyError return Err(nom::Err::Error(CustomError::MyError(MyError("NoExit")))); } + + pub fn get_document_root(&self) -> Option<&'s str> { + for current_node in self.iter() { + let context_element = current_node.get_data(); + match context_element { + ContextElement::DocumentRoot(body) => { + return Some(body); + } + _ => {} + } + } + None + } } #[derive(Debug)] pub enum ContextElement<'r, 's> { + DocumentRoot(&'s str), ExitMatcherNode(ExitMatcherNode<'r>), - PreviousElementNode(PreviousElementNode<'s>), Context(&'r str), ListItem(usize), - StartOfParagraph, } #[derive(Debug)] @@ -106,11 +117,6 @@ pub struct ExitMatcherNode<'r> { pub exit_matcher: ChainBehavior<'r>, } -#[derive(Debug)] -pub struct PreviousElementNode<'r> { - pub element: Token<'r>, -} - #[derive(Clone)] pub enum ChainBehavior<'r> { AndParent(Option<&'r Matcher>), diff --git a/src/parser/source.rs b/src/parser/source.rs new file mode 100644 index 00000000..c8e54176 --- /dev/null +++ b/src/parser/source.rs @@ -0,0 +1,3 @@ +pub trait Source<'s> { + fn get_source(&'s self) -> &'s str; +} diff --git a/src/parser/util.rs b/src/parser/util.rs index 3600c136..9962dc78 100644 --- a/src/parser/util.rs +++ b/src/parser/util.rs @@ -1,16 +1,92 @@ +use nom::branch::alt; +use nom::character::complete::line_ending; +use nom::character::complete::space0; +use nom::combinator::eof; +use nom::combinator::not; +use nom::combinator::recognize; +use nom::multi::many0; +use nom::sequence::tuple; + +use super::error::Res; use super::parser_context::ContextElement; use super::Context; +/// Check if we are below a section of the given section type regardless of depth pub fn in_section<'r, 's, 'x>(context: Context<'r, 's>, section_name: &'x str) -> bool { for thing in context.iter() { match thing.get_data() { - ContextElement::ExitMatcherNode(_) => {} - ContextElement::PreviousElementNode(_) => {} ContextElement::Context(name) if *name == section_name => return true, - ContextElement::Context(_) => {} - ContextElement::StartOfParagraph => {} // TODO: If we specialize this to bold then this would be a good spot to stop scanning - 
ContextElement::ListItem(_) => {} + _ => {} } } false } + +/// Checks if we are currently an immediate child of the given section type +pub fn immediate_in_section<'r, 's, 'x>(context: Context<'r, 's>, section_name: &'x str) -> bool { + for thing in context.iter() { + match thing.get_data() { + ContextElement::Context(name) if *name == section_name => return true, + ContextElement::Context(name) if *name != section_name => return false, + _ => {} + } + } + false +} + +/// Get one character from before the current position. +pub fn get_one_before<'s>(document: &'s str, current_position: &'s str) -> Option<&'s str> { + assert!(is_slice_of(document, current_position)); + if document.as_ptr() as usize == current_position.as_ptr() as usize { + return None; + } + let offset = current_position.as_ptr() as usize - document.as_ptr() as usize; + let previous_character_offset = document.floor_char_boundary(offset - 1); + Some(&document[previous_character_offset..offset]) +} + +/// Check if the child string slice is a slice of the parent string slice. +fn is_slice_of(parent: &str, child: &str) -> bool { + let parent_start = parent.as_ptr() as usize; + let parent_end = parent_start + parent.len(); + let child_start = child.as_ptr() as usize; + let child_end = child_start + child.len(); + child_start >= parent_start && child_end <= parent_end +} + +/// Get a slice of the string that was consumed in a parser using the original input to the parser and the remaining input after the parser. +pub fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str { + assert!(is_slice_of(input, remaining)); + let source = { + let offset = remaining.as_ptr() as usize - input.as_ptr() as usize; + &input[..offset] + }; + source +} + +/// A line containing only whitespace and then a line break +/// +/// It is up to the caller to ensure this is called at the start of a line. +pub fn blank_line(input: &str) -> Res<&str, &str> { + not(eof)(input)?; + recognize(tuple((space0, alt((line_ending, eof)))))(input) +} + +pub fn trailing_whitespace(input: &str) -> Res<&str, &str> { + alt((eof, recognize(tuple((line_ending, many0(blank_line))))))(input) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn get_one_before_unicode() { + let input = "๐Ÿงก๐Ÿ’›๐Ÿ’š๐Ÿ’™๐Ÿ’œ"; + let (green_heart_index, _) = input.char_indices().skip(2).next().unwrap(); + let starting_with_green_heart = &input[green_heart_index..]; + let yellow_heart = get_one_before(input, starting_with_green_heart).unwrap(); + assert!(is_slice_of(input, yellow_heart)); + assert_eq!(yellow_heart, "๐Ÿ’›"); + } +} diff --git a/toy_language.txt b/toy_language.txt index 2f8aa337..ac4a2d64 100644 --- a/toy_language.txt +++ b/toy_language.txt @@ -11,3 +11,12 @@ text* *nesting *bold entrances* and* exits + +* Heading + +body of heading + +** Child heading +** Immediate second child heading + +* Second top-level heading
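
A minimal usage sketch, not part of the diff above: a test in the style of the existing plain_text_simple test in src/parser/object.rs, exercising the new document() parser against the content of org_mode_samples/sections_and_headings/immediate_heading.org. The test name and the expected values are my own assumptions based on reading the Document/Heading structs and parsers in src/parser/document.rs; it assumes it lives in a #[cfg(test)] module inside that file so the module-private items are in scope.

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn document_immediate_heading() {
        // Mirrors immediate_heading.org: a document that opens directly with a
        // heading should have no zeroth section (assumed behavior).
        let input = "* Start a document with an immediate heading\n";
        let (remaining, doc) = document(input).expect("sample should parse");
        assert_eq!(remaining, "");
        assert!(doc.zeroth_section.is_none());
        // One top-level heading with a single star and no child elements.
        assert_eq!(doc.children.len(), 1);
        assert_eq!(doc.children[0].stars, 1);
        // The document's source should cover the entire input.
        assert_eq!(doc.source, input);
    }
}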