From e1fbe36297874908b75f3061e715ba591d08aacd Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Mon, 27 Mar 2023 17:06:22 -0400 Subject: [PATCH 01/14] Exporting traces to jaeger. --- Cargo.toml | 3 +++ src/main.rs | 26 ++++++++++++++++++-------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index bc11dfc0..627ec01f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,10 @@ path = "src/main.rs" [dependencies] nom = "7.1.1" +opentelemetry = "0.17.0" +opentelemetry-jaeger = "0.16.0" tracing = "0.1.37" +tracing-opentelemetry = "0.17.2" tracing-subscriber = {version="0.3.16", features=["env-filter"]} [features] diff --git a/src/main.rs b/src/main.rs index 2d4a3dcb..cc494715 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,8 +1,10 @@ #![feature(round_char_boundary)] use crate::parser::document; -use tracing_subscriber::fmt::format::FmtSpan; use tracing_subscriber::EnvFilter; mod parser; +use tracing_subscriber::fmt; +use tracing_subscriber::layer::SubscriberExt; +use tracing_subscriber::util::SubscriberInitExt; const TEST_DOC: &'static str = include_str!("../toy_language.txt"); @@ -16,17 +18,25 @@ fn main() -> Result<(), Box> { fn init_telemetry() -> Result<(), Box> { let env_filter = EnvFilter::try_from_default_env().unwrap_or(EnvFilter::new("WARN")); - let format = tracing_subscriber::fmt::format() + + let stdout = fmt::Layer::new() .pretty() .with_file(true) .with_line_number(true) .with_thread_ids(false) .with_target(false); - let subscriber = tracing_subscriber::fmt() - .event_format(format) - .with_span_events(FmtSpan::ENTER | FmtSpan::EXIT) - .with_env_filter(env_filter) - .finish(); - tracing::subscriber::set_global_default(subscriber)?; + + opentelemetry::global::set_text_map_propagator(opentelemetry_jaeger::Propagator::new()); + let tracer = opentelemetry_jaeger::new_pipeline() + .with_service_name("toy_language") + .install_simple()?; + + let opentelemetry = tracing_opentelemetry::layer().with_tracer(tracer); + + 
tracing_subscriber::registry() + .with(env_filter) + .with(opentelemetry) + .with(stdout) + .try_init()?; Ok(()) } From 028946ec901c1b8a08c6a9a4e3c274b3adaff2d7 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Mon, 27 Mar 2023 18:08:17 -0400 Subject: [PATCH 02/14] Identified the problem. The issue is plain text is eating the line break so paragraph is failing since it expects a line break at the end. --- src/parser/document.rs | 3 +-- src/parser/element.rs | 1 + src/parser/object.rs | 1 + src/parser/paragraph.rs | 2 ++ src/parser/parser_context.rs | 10 ++++++++-- src/parser/plain_list.rs | 8 ++++++-- src/parser/plain_text.rs | 1 + src/parser/util.rs | 11 +++++++++++ 8 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/parser/document.rs b/src/parser/document.rs index 79a161c7..dd5eb67e 100644 --- a/src/parser/document.rs +++ b/src/parser/document.rs @@ -83,8 +83,7 @@ pub fn document(input: &str) -> Res<&str, Document> { let section_matcher = parser_with_context!(section)(&document_context); let heading_matcher = parser_with_context!(heading)(&document_context); let (remaining, zeroth_section) = opt(section_matcher)(input)?; - // let (remaining, children) = many0(heading_matcher)(remaining)?; - let children = Vec::new(); + let (remaining, children) = many0(heading_matcher)(remaining)?; let source = get_consumed(input, remaining); Ok(( remaining, diff --git a/src/parser/element.rs b/src/parser/element.rs index 7b4f8834..d0f2ee0a 100644 --- a/src/parser/element.rs +++ b/src/parser/element.rs @@ -24,6 +24,7 @@ impl<'s> Source<'s> for Element<'s> { } } +#[tracing::instrument(ret, level = "debug")] pub fn element<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, Element<'s>> { let non_paragraph_matcher = parser_with_context!(non_paragraph_element)(context); let paragraph_matcher = parser_with_context!(paragraph)(context); diff --git a/src/parser/object.rs b/src/parser/object.rs index d94e7837..50afeb4e 100644 --- a/src/parser/object.rs +++ 
b/src/parser/object.rs @@ -39,6 +39,7 @@ impl<'s> Source<'s> for Object<'s> { } } +#[tracing::instrument(ret, level = "debug")] pub fn standard_set_object<'r, 's>( context: Context<'r, 's>, input: &'s str, diff --git a/src/parser/paragraph.rs b/src/parser/paragraph.rs index be96c61e..feb1e57a 100644 --- a/src/parser/paragraph.rs +++ b/src/parser/paragraph.rs @@ -19,6 +19,7 @@ use super::util::get_consumed; use super::util::trailing_whitespace; use super::Context; +#[tracing::instrument(ret, level = "debug")] pub fn paragraph<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, Paragraph<'s>> { let parser_context = context.with_additional_node(ContextElement::ExitMatcherNode(ExitMatcherNode { @@ -35,6 +36,7 @@ pub fn paragraph<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s st Ok((remaining, Paragraph { source, children })) } +#[tracing::instrument(ret, level = "debug")] fn paragraph_end<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> { let non_paragraph_element_matcher = parser_with_context!(non_paragraph_element)(context); alt(( diff --git a/src/parser/parser_context.rs b/src/parser/parser_context.rs index 8f4ac3a6..df39b3be 100644 --- a/src/parser/parser_context.rs +++ b/src/parser/parser_context.rs @@ -8,6 +8,7 @@ use super::error::MyError; use super::error::Res; use super::list::List; use super::list::Node; +use super::util::always_fail; use super::Context; type Matcher = dyn for<'r, 's> Fn(Context<'r, 's>, &'s str) -> Res<&'s str, &'s str>; @@ -64,20 +65,25 @@ impl<'r, 's> ContextTree<'r, 's> { return at_end_of_file; } + let blocked_context = + self.with_additional_node(ContextElement::ExitMatcherNode(ExitMatcherNode { + exit_matcher: ChainBehavior::IgnoreParent(Some(&always_fail)), + })); + for current_node in self.iter() { let context_element = current_node.get_data(); match context_element { ContextElement::ExitMatcherNode(exit_matcher) => { match exit_matcher.exit_matcher { 
ChainBehavior::AndParent(Some(matcher)) => { - let local_result = matcher(self, i); + let local_result = matcher(&blocked_context, i); if local_result.is_ok() { return local_result; } } ChainBehavior::AndParent(None) => {} ChainBehavior::IgnoreParent(Some(matcher)) => { - let local_result = matcher(self, i); + let local_result = matcher(&blocked_context, i); if local_result.is_ok() { return local_result; } diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 32115cb1..08cf419c 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -25,7 +25,7 @@ use nom::multi::many0; use nom::multi::many_till; use nom::sequence::tuple; -#[allow(dead_code)] +#[tracing::instrument(ret, level = "debug")] pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, PlainList<'s>> { let (remaining, first_item) = plain_list_item(context, input)?; let plain_list_item_matcher = parser_with_context!(plain_list_item)(context); @@ -41,7 +41,7 @@ pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s s Ok((remaining, PlainList { source, children })) } -#[allow(dead_code)] +#[tracing::instrument(ret, level = "debug")] pub fn plain_list_item<'r, 's>( context: Context<'r, 's>, input: &'s str, @@ -72,6 +72,7 @@ pub fn plain_list_item<'r, 's>( )) } +#[tracing::instrument(ret, level = "debug")] fn bullet<'s>(i: &'s str) -> Res<&'s str, &'s str> { alt(( tag("*"), @@ -81,10 +82,12 @@ fn bullet<'s>(i: &'s str) -> Res<&'s str, &'s str> { ))(i) } +#[tracing::instrument(ret, level = "debug")] fn counter<'s>(i: &'s str) -> Res<&'s str, &'s str> { alt((recognize(one_of("abcdefghijklmnopqrstuvwxyz")), digit1))(i) } +#[tracing::instrument(ret, level = "debug")] fn plain_list_item_end<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> { let plain_list_item_matcher = parser_with_context!(plain_list_item)(context); let line_indented_lte_matcher = parser_with_context!(line_indented_lte)(context); @@ -95,6 
+98,7 @@ fn plain_list_item_end<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res< ))(input) } +#[tracing::instrument(ret, level = "debug")] fn line_indented_lte<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> { let current_item_indent_level: &usize = get_context_item_indent(context).ok_or(nom::Err::Error(CustomError::MyError(MyError( diff --git a/src/parser/plain_text.rs b/src/parser/plain_text.rs index 87d592b3..3eb76978 100644 --- a/src/parser/plain_text.rs +++ b/src/parser/plain_text.rs @@ -7,6 +7,7 @@ use super::error::Res; use super::object::PlainText; use super::Context; +#[tracing::instrument(ret, level = "debug")] pub fn plain_text<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, PlainText<'s>> { if input.len() == 0 { return Err(nom::Err::Error(CustomError::MyError(MyError( diff --git a/src/parser/util.rs b/src/parser/util.rs index c4f57e5a..a6f1b6ef 100644 --- a/src/parser/util.rs +++ b/src/parser/util.rs @@ -71,16 +71,19 @@ pub fn get_consumed<'s>(input: &'s str, remaining: &'s str) -> &'s str { /// A line containing only whitespace and then a line break /// /// It is up to the caller to ensure this is called at the start of a line. 
+#[tracing::instrument(ret, level = "debug")] pub fn blank_line(input: &str) -> Res<&str, &str> { not(eof)(input)?; recognize(tuple((space0, alt((line_ending, eof)))))(input) } +#[tracing::instrument(ret, level = "debug")] pub fn trailing_whitespace(input: &str) -> Res<&str, &str> { alt((eof, recognize(tuple((line_ending, many0(blank_line))))))(input) } /// Check that we are at the start of a line +#[tracing::instrument(ret, level = "debug")] pub fn start_of_line<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, ()> { let document_root = context.get_document_root().unwrap(); let preceding_character = get_one_before(document_root, input) @@ -103,6 +106,7 @@ pub fn start_of_line<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&' /// Pull one non-whitespace character. /// /// This function only operates on spaces, tabs, carriage returns, and line feeds. It does not handle fancy unicode whitespace. +#[tracing::instrument(ret, level = "debug")] pub fn non_whitespace_character(input: &str) -> Res<&str, char> { none_of(" \t\r\n")(input) } @@ -116,6 +120,13 @@ pub fn exit_matcher_parser<'r, 's>( peek(|i| context.check_exit_matcher(i))(input) } +#[tracing::instrument(ret, level = "debug")] +pub fn always_fail<'r, 's>(_context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> { + Err(nom::Err::Error(CustomError::MyError(MyError( + "Always fail", + )))) +} + #[cfg(test)] mod tests { use super::*; From 2c7a559869049e7f5e28831d7321787f82061e08 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Mon, 27 Mar 2023 18:10:14 -0400 Subject: [PATCH 03/14] Fix the line break consumption issue. This leaves us with an issue of lists becoming needlessly nested. 
--- src/parser/paragraph.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser/paragraph.rs b/src/parser/paragraph.rs index feb1e57a..0c916a4c 100644 --- a/src/parser/paragraph.rs +++ b/src/parser/paragraph.rs @@ -41,7 +41,7 @@ fn paragraph_end<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s st let non_paragraph_element_matcher = parser_with_context!(non_paragraph_element)(context); alt(( recognize(tuple((line_ending, many1(blank_line)))), - recognize(non_paragraph_element_matcher), + recognize(tuple((line_ending, non_paragraph_element_matcher))), eof, ))(input) } From 3d8fe253c91f38a7ee69b159dd50ba7c55ed57ff Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Mon, 27 Mar 2023 18:22:08 -0400 Subject: [PATCH 04/14] Check for exit matcher between elements in a plain list item. --- src/parser/parser_context.rs | 1 + src/parser/plain_list.rs | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/parser/parser_context.rs b/src/parser/parser_context.rs index df39b3be..445884ac 100644 --- a/src/parser/parser_context.rs +++ b/src/parser/parser_context.rs @@ -55,6 +55,7 @@ impl<'r, 's> ContextTree<'r, 's> { self.tree.into_iter_until(&other.tree) } + #[tracing::instrument(ret, level = "debug")] pub fn check_exit_matcher( &'r self, i: &'s str, diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 08cf419c..9e461215 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -57,8 +57,10 @@ pub fn plain_list_item<'r, 's>( .with_additional_node(ContextElement::ListItem(indent_level)); let element_matcher = parser_with_context!(element)(&parser_context); + let exit_matcher = parser_with_context!(exit_matcher_parser)(&parser_context); let (remaining, (bull, _ws)) = tuple((bullet, space0))(remaining)?; - let (remaining, contents) = many0(element_matcher)(remaining)?; + let (remaining, (contents, _exit_contents)) = + many_till(element_matcher, exit_matcher)(remaining)?; let source = 
get_consumed(input, remaining); Ok(( From 3643f91bac8fd80622f095e99e8d2fa2eaad3a6a Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Mon, 27 Mar 2023 18:50:25 -0400 Subject: [PATCH 05/14] The current problem is whitespace at the end of a list item should not be consumed. --- src/parser/paragraph.rs | 9 ++++++++- src/parser/plain_list.rs | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/parser/paragraph.rs b/src/parser/paragraph.rs index 0c916a4c..b0e6d2ff 100644 --- a/src/parser/paragraph.rs +++ b/src/parser/paragraph.rs @@ -2,7 +2,9 @@ use nom::branch::alt; use nom::character::complete::line_ending; use nom::combinator::eof; use nom::combinator::recognize; +use nom::combinator::verify; use nom::multi::many1; +use nom::multi::many_till; use nom::sequence::tuple; use crate::parser::object::standard_set_object; @@ -10,6 +12,7 @@ use crate::parser::parser_context::ChainBehavior; use crate::parser::parser_context::ContextElement; use crate::parser::parser_context::ExitMatcherNode; use crate::parser::parser_with_context::parser_with_context; +use crate::parser::util::exit_matcher_parser; use super::element::non_paragraph_element; use super::error::Res; @@ -26,8 +29,12 @@ pub fn paragraph<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s st exit_matcher: ChainBehavior::AndParent(Some(&paragraph_end)), })); let standard_set_object_matcher = parser_with_context!(standard_set_object)(&parser_context); let exit_matcher = parser_with_context!(exit_matcher_parser)(&parser_context); - let (remaining, children) = many1(standard_set_object_matcher)(input)?; + let (remaining, (children, _exit_contents)) = verify( + many_till(standard_set_object_matcher, exit_matcher), + |(children, _exit_contents)| !children.is_empty(), + )(input)?; let (remaining, _trailing_whitespace) = trailing_whitespace(remaining)?; diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 9e461215..9743fd6c 100644 --- a/src/parser/plain_list.rs +++ 
b/src/parser/plain_list.rs @@ -16,6 +16,7 @@ use crate::parser::util::start_of_line; use nom::branch::alt; use nom::bytes::complete::tag; use nom::character::complete::digit1; +use nom::character::complete::line_ending; use nom::character::complete::one_of; use nom::character::complete::space0; use nom::combinator::eof; @@ -94,8 +95,8 @@ fn plain_list_item_end<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res< let plain_list_item_matcher = parser_with_context!(plain_list_item)(context); let line_indented_lte_matcher = parser_with_context!(line_indented_lte)(context); alt(( - recognize(plain_list_item_matcher), - line_indented_lte_matcher, + recognize(tuple((line_ending, plain_list_item_matcher))), + recognize(tuple((line_ending, line_indented_lte_matcher))), eof, ))(input) } From 9545990b522f11224ba5c6fa75e19fe80374e9f3 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Mon, 27 Mar 2023 19:12:20 -0400 Subject: [PATCH 06/14] Regurgitate seems to have made all text a paragraph. --- src/parser/plain_list.rs | 2 ++ src/parser/util.rs | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 9743fd6c..ed7a310c 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -12,6 +12,7 @@ use crate::parser::parser_context::ContextElement; use crate::parser::parser_context::ExitMatcherNode; use crate::parser::util::exit_matcher_parser; use crate::parser::util::get_consumed; +use crate::parser::util::regurgitate; use crate::parser::util::start_of_line; use nom::branch::alt; use nom::bytes::complete::tag; @@ -62,6 +63,7 @@ pub fn plain_list_item<'r, 's>( let (remaining, (bull, _ws)) = tuple((bullet, space0))(remaining)?; let (remaining, (contents, _exit_contents)) = many_till(element_matcher, exit_matcher)(remaining)?; + let remaining = regurgitate(input, remaining); let source = get_consumed(input, remaining); Ok(( diff --git a/src/parser/util.rs b/src/parser/util.rs 
index a6f1b6ef..9804b13d 100644 --- a/src/parser/util.rs +++ b/src/parser/util.rs @@ -127,6 +127,31 @@ pub fn always_fail<'r, 's>(_context: Context<'r, 's>, input: &'s str) -> Res<&'s )))) } +/// Walk backwards unconsuming blank lines and line endings. +/// +/// List items are a special case where the trailing blank lines do not belong to it, unlike all other elements. Rather than write that special logic into each child parser, this just walks backwards through the consumed input to unconsume trailing blank lines and line breaks. +pub fn regurgitate<'s>(input: &'s str, remaining: &'s str) -> &'s str { + assert!(is_slice_of(input, remaining)); + let mut offset = remaining.as_ptr() as usize - input.as_ptr() as usize; + let source = &input[..offset]; + let mut char_indices = source.char_indices().rev(); + loop { + match char_indices.next() { + Some((off, chr)) => { + if chr == '\n' { + offset = off; + } else if chr != ' ' && chr != '\t' { + return &input[offset..]; + } + } + None => { + // It was all whitespace, so return the full input string + return input; + } + }; + } +} + #[cfg(test)] mod tests { use super::*; @@ -140,4 +165,14 @@ mod tests { assert!(is_slice_of(input, yellow_heart)); assert_eq!(yellow_heart, "💛"); } + + #[test] + fn regurgitate_unicode() { + let input = "🧡💛\n\t \t \n\n💚💙💜"; + let (green_heart_index, _) = input.char_indices().skip(12).next().unwrap(); + let starting_with_green_heart = &input[green_heart_index..]; + let after_yellow = regurgitate(input, starting_with_green_heart); + assert!(is_slice_of(input, after_yellow)); + assert_eq!(after_yellow, "\n\t \t \n\n💚💙💜"); + } } From 775e703aded7ef27420161ef5c60e429a5370c56 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Mon, 27 Mar 2023 19:19:51 -0400 Subject: [PATCH 07/14] Not sure whats going on. 
--- src/parser/plain_list.rs | 8 ++++++-- src/parser/util.rs | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index ed7a310c..453ba2ce 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -14,6 +14,7 @@ use crate::parser::util::exit_matcher_parser; use crate::parser::util::get_consumed; use crate::parser::util::regurgitate; use crate::parser::util::start_of_line; +use crate::parser::util::trailing_whitespace; use nom::branch::alt; use nom::bytes::complete::tag; use nom::character::complete::digit1; @@ -38,6 +39,7 @@ pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s s }), exit_matcher, )(remaining)?; + let (remaining, _trailing_whitespace) = trailing_whitespace(remaining)?; let source = get_consumed(input, remaining); children.insert(0, first_item); Ok((remaining, PlainList { source, children })) @@ -61,8 +63,10 @@ pub fn plain_list_item<'r, 's>( let element_matcher = parser_with_context!(element)(&parser_context); let exit_matcher = parser_with_context!(exit_matcher_parser)(&parser_context); let (remaining, (bull, _ws)) = tuple((bullet, space0))(remaining)?; - let (remaining, (contents, _exit_contents)) = - many_till(element_matcher, exit_matcher)(remaining)?; + let (remaining, (contents, _exit_contents)) = many_till(element_matcher, |i| { + let with_whitespace_added_back = regurgitate(input, i); + exit_matcher(with_whitespace_added_back) + })(remaining)?; let remaining = regurgitate(input, remaining); let source = get_consumed(input, remaining); diff --git a/src/parser/util.rs b/src/parser/util.rs index 9804b13d..ecd08444 100644 --- a/src/parser/util.rs +++ b/src/parser/util.rs @@ -130,6 +130,7 @@ pub fn always_fail<'r, 's>(_context: Context<'r, 's>, input: &'s str) -> Res<&'s /// Walk backwards unconsuming blank lines and line endings. 
/// /// List items are a special case where the trailing blank lines do not belong to it, unlike all other elements. Rather than write that special logic into each child parser, this just walks backwards through the consumed input to unconsume trailing blank lines and line breaks. +#[tracing::instrument(ret, level = "debug")] pub fn regurgitate<'s>(input: &'s str, remaining: &'s str) -> &'s str { assert!(is_slice_of(input, remaining)); let mut offset = remaining.as_ptr() as usize - input.as_ptr() as usize; From 602cf4c374766ccdf2417cb7cf34b3305bc97d89 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Fri, 31 Mar 2023 09:54:48 -0400 Subject: [PATCH 08/14] Removing regurgitate calls. This hacky solution ends up with whitespace getting captured twice so I will need to either use context or a separate parser. --- src/parser/plain_list.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 453ba2ce..c3d50db9 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -63,11 +63,8 @@ pub fn plain_list_item<'r, 's>( let element_matcher = parser_with_context!(element)(&parser_context); let exit_matcher = parser_with_context!(exit_matcher_parser)(&parser_context); let (remaining, (bull, _ws)) = tuple((bullet, space0))(remaining)?; - let (remaining, (contents, _exit_contents)) = many_till(element_matcher, |i| { - let with_whitespace_added_back = regurgitate(input, i); - exit_matcher(with_whitespace_added_back) - })(remaining)?; - let remaining = regurgitate(input, remaining); + let (remaining, (contents, _exit_contents)) = + many_till(element_matcher, exit_matcher)(remaining)?; let source = get_consumed(input, remaining); Ok(( From 707eac5bf810dffe7cac8ca22d11a489f8c51edd Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Fri, 31 Mar 2023 11:16:37 -0400 Subject: [PATCH 09/14] Move trailing whitespace parsing to a separate element. 
I still need to parse the line break at the end of elements. --- src/parser/document.rs | 30 ++++++++++++++++++++++++++++-- src/parser/element.rs | 5 +++++ src/parser/paragraph.rs | 2 -- src/parser/plain_list.rs | 1 - src/parser/util.rs | 9 +++++++++ 5 files changed, 42 insertions(+), 5 deletions(-) diff --git a/src/parser/document.rs b/src/parser/document.rs index dd5eb67e..7c002c89 100644 --- a/src/parser/document.rs +++ b/src/parser/document.rs @@ -20,10 +20,12 @@ use crate::parser::parser_context::ChainBehavior; use crate::parser::parser_context::ContextElement; use crate::parser::parser_context::ContextTree; use crate::parser::parser_context::ExitMatcherNode; +use crate::parser::util::element_trailing_whitespace; use super::element::Element; use super::error::Res; use super::object::Object; +use super::parser_context; use super::parser_with_context::parser_with_context; use super::source::Source; use super::util::exit_matcher_parser; @@ -105,12 +107,36 @@ fn section<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, Sec .with_additional_node(ContextElement::Context("section")); let element_matcher = parser_with_context!(element)(&parser_context); let exit_matcher = parser_with_context!(exit_matcher_parser)(&parser_context); + let trailing_matcher = parser_with_context!(element_trailing_whitespace)(&parser_context); let (remaining, (children, _exit_contents)) = verify( - many_till(element_matcher, exit_matcher), + many_till( + tuple(( + element_matcher, + opt(map(trailing_matcher, Element::TrailingWhitespace)), + )), + exit_matcher, + ), |(children, _exit_contents)| !children.is_empty(), )(input)?; + let flattened_children: Vec<Element> = children + .into_iter() + .flat_map(|tpl| { + let mut flattened_children = Vec::with_capacity(2); + flattened_children.push(tpl.0); + if let Some(bar) = tpl.1 { + flattened_children.push(bar); + } + flattened_children.into_iter() + }) + .collect(); let source = get_consumed(input, remaining); - Ok((remaining, Section { 
source, children })) + Ok(( + remaining, + Section { + source, + children: flattened_children, + }, + )) } #[tracing::instrument(ret, level = "debug")] diff --git a/src/parser/element.rs b/src/parser/element.rs index d0f2ee0a..db304976 100644 --- a/src/parser/element.rs +++ b/src/parser/element.rs @@ -13,6 +13,10 @@ use nom::combinator::map; pub enum Element<'s> { Paragraph(Paragraph<'s>), PlainList(PlainList<'s>), + /// The whitespace that follows an element. + /// + /// This isn't a real org-mode element. Except for items in plain lists, trailing blank lines belong to the preceding element. It is a separate `Element` in this enum to make parsing easier. + TrailingWhitespace(&'s str), } impl<'s> Source<'s> for Element<'s> { @@ -20,6 +24,7 @@ impl<'s> Source<'s> for Element<'s> { match self { Element::Paragraph(obj) => obj.source, Element::PlainList(obj) => obj.source, + Element::TrailingWhitespace(src) => src, } } } diff --git a/src/parser/paragraph.rs b/src/parser/paragraph.rs index b0e6d2ff..09903b81 100644 --- a/src/parser/paragraph.rs +++ b/src/parser/paragraph.rs @@ -36,8 +36,6 @@ pub fn paragraph<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s st |(children, _exit_contents)| !children.is_empty(), )(input)?; - let (remaining, _trailing_whitespace) = trailing_whitespace(remaining)?; - let source = get_consumed(input, remaining); Ok((remaining, Paragraph { source, children })) diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index c3d50db9..9e314286 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -39,7 +39,6 @@ pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s s }), exit_matcher, )(remaining)?; - let (remaining, _trailing_whitespace) = trailing_whitespace(remaining)?; let source = get_consumed(input, remaining); children.insert(0, first_item); Ok((remaining, PlainList { source, children })) diff --git a/src/parser/util.rs b/src/parser/util.rs index ecd08444..8abbe1af 100644 --- 
a/src/parser/util.rs +++ b/src/parser/util.rs @@ -77,6 +77,15 @@ pub fn blank_line(input: &str) -> Res<&str, &str> { recognize(tuple((space0, alt((line_ending, eof)))))(input) } +#[tracing::instrument(ret, level = "debug")] +pub fn element_trailing_whitespace<'r, 's>( + context: Context<'r, 's>, + input: &'s str, +) -> Res<&'s str, &'s str> { + start_of_line(context, input)?; + alt((eof, recognize(many0(blank_line))))(input) +} + #[tracing::instrument(ret, level = "debug")] pub fn trailing_whitespace(input: &str) -> Res<&str, &str> { alt((eof, recognize(tuple((line_ending, many0(blank_line))))))(input) From 68156f3667c3241210a741f837907e31e656da37 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Fri, 31 Mar 2023 11:42:04 -0400 Subject: [PATCH 10/14] Consume line break at the end of paragraph. --- src/parser/paragraph.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/parser/paragraph.rs b/src/parser/paragraph.rs index 09903b81..764b646f 100644 --- a/src/parser/paragraph.rs +++ b/src/parser/paragraph.rs @@ -36,6 +36,7 @@ pub fn paragraph<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s st |(children, _exit_contents)| !children.is_empty(), )(input)?; + let (remaining, _linebreak) = alt((eof, line_ending))(remaining)?; let source = get_consumed(input, remaining); Ok((remaining, Paragraph { source, children })) From 2b0e88dc01395a7208349c52dc7e7a954d6b899c Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Fri, 31 Mar 2023 13:08:53 -0400 Subject: [PATCH 11/14] The current problem is plain_list_item_end is not taking into account depth. 
--- src/parser/paragraph.rs | 10 +++++++--- src/parser/plain_list.rs | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/parser/paragraph.rs b/src/parser/paragraph.rs index 764b646f..bea09086 100644 --- a/src/parser/paragraph.rs +++ b/src/parser/paragraph.rs @@ -1,6 +1,7 @@ use nom::branch::alt; use nom::character::complete::line_ending; use nom::combinator::eof; +use nom::combinator::peek; use nom::combinator::recognize; use nom::combinator::verify; use nom::multi::many1; @@ -32,7 +33,10 @@ pub fn paragraph<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s st let exit_matcher = parser_with_context!(exit_matcher_parser)(&parser_context); let (remaining, (children, _exit_contents)) = verify( - many_till(standard_set_object_matcher, exit_matcher), + many_till( + standard_set_object_matcher, + peek(alt((eof, recognize(tuple((line_ending, exit_matcher)))))), + ), |(children, _exit_contents)| !children.is_empty(), )(input)?; @@ -46,8 +50,8 @@ pub fn paragraph<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s st fn paragraph_end<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> { let non_paragraph_element_matcher = parser_with_context!(non_paragraph_element)(context); alt(( - recognize(tuple((line_ending, many1(blank_line)))), - recognize(tuple((line_ending, non_paragraph_element_matcher))), + recognize(many1(blank_line)), + recognize(non_paragraph_element_matcher), eof, ))(input) } diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 9e314286..2abc7ee1 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -97,8 +97,8 @@ fn plain_list_item_end<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res< let plain_list_item_matcher = parser_with_context!(plain_list_item)(context); let line_indented_lte_matcher = parser_with_context!(line_indented_lte)(context); alt(( - recognize(tuple((line_ending, plain_list_item_matcher))), - recognize(tuple((line_ending, 
line_indented_lte_matcher))), + recognize(plain_list_item_matcher), + recognize(line_indented_lte_matcher), eof, ))(input) } From e681f8fdff2e07e0765227caabddf5c08434f088 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Fri, 31 Mar 2023 13:22:30 -0400 Subject: [PATCH 12/14] Remove the exit matcher block. I'm not sure this is a problem, but while I'm debugging I want this removed to be safe. --- src/parser/parser_context.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/parser/parser_context.rs b/src/parser/parser_context.rs index 445884ac..8618e0ee 100644 --- a/src/parser/parser_context.rs +++ b/src/parser/parser_context.rs @@ -66,10 +66,10 @@ impl<'r, 's> ContextTree<'r, 's> { return at_end_of_file; } - let blocked_context = - self.with_additional_node(ContextElement::ExitMatcherNode(ExitMatcherNode { - exit_matcher: ChainBehavior::IgnoreParent(Some(&always_fail)), - })); + // let blocked_context = + // self.with_additional_node(ContextElement::ExitMatcherNode(ExitMatcherNode { + // exit_matcher: ChainBehavior::IgnoreParent(Some(&always_fail)), + // })); for current_node in self.iter() { let context_element = current_node.get_data(); @@ -77,14 +77,14 @@ impl<'r, 's> ContextTree<'r, 's> { ContextElement::ExitMatcherNode(exit_matcher) => { match exit_matcher.exit_matcher { ChainBehavior::AndParent(Some(matcher)) => { - let local_result = matcher(&blocked_context, i); + let local_result = matcher(self, i); if local_result.is_ok() { return local_result; } } ChainBehavior::AndParent(None) => {} ChainBehavior::IgnoreParent(Some(matcher)) => { - let local_result = matcher(&blocked_context, i); + let local_result = matcher(self, i); if local_result.is_ok() { return local_result; } From 942b4860781f78b4871418479ec0b94162d4cc45 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Fri, 31 Mar 2023 13:32:07 -0400 Subject: [PATCH 13/14] Add a test file showing lists where the earlier one is indented. 
--- org_mode_samples/plain_lists/indented_then_less_indented.org | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 org_mode_samples/plain_lists/indented_then_less_indented.org diff --git a/org_mode_samples/plain_lists/indented_then_less_indented.org b/org_mode_samples/plain_lists/indented_then_less_indented.org new file mode 100644 index 00000000..c325ab99 --- /dev/null +++ b/org_mode_samples/plain_lists/indented_then_less_indented.org @@ -0,0 +1,2 @@ + 1. foo +1. bar From 2552ba28d11eb2c7e7063f1fa01a879d88c5c89b Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Mon, 3 Apr 2023 15:06:12 -0400 Subject: [PATCH 14/14] Correctly parsing plain list items. --- src/parser/plain_list.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 2abc7ee1..9911f3f3 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -94,10 +94,16 @@ fn counter<'s>(i: &'s str) -> Res<&'s str, &'s str> { #[tracing::instrument(ret, level = "debug")] fn plain_list_item_end<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> { + let current_item_indent_level: &usize = + get_context_item_indent(context).ok_or(nom::Err::Error(CustomError::MyError(MyError( + "Not inside a plain list item", + ))))?; let plain_list_item_matcher = parser_with_context!(plain_list_item)(context); let line_indented_lte_matcher = parser_with_context!(line_indented_lte)(context); alt(( - recognize(plain_list_item_matcher), + recognize(verify(plain_list_item_matcher, |pli| { + pli.indentation <= *current_item_indent_level + })), recognize(line_indented_lte_matcher), eof, ))(input)