From 08fed1301effbe119d1050aae2572d6e7c62beba Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Fri, 14 Apr 2023 19:24:05 -0400 Subject: [PATCH 1/5] Fix plain list parser to not consume trailing whitespace on the last item. --- .../list_paragraph_nesting.org | 4 +- src/parser/plain_list.rs | 99 +++++++++++++++---- toy_language.txt | 4 +- 3 files changed, 85 insertions(+), 22 deletions(-) diff --git a/org_mode_samples/exit_matcher_investigation/list_paragraph_nesting.org b/org_mode_samples/exit_matcher_investigation/list_paragraph_nesting.org index 843a1e0..6497a29 100644 --- a/org_mode_samples/exit_matcher_investigation/list_paragraph_nesting.org +++ b/org_mode_samples/exit_matcher_investigation/list_paragraph_nesting.org @@ -6,5 +6,7 @@ lorem + ipsum -ipsum + +dolar diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 6bd455e..045b924 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -23,6 +23,7 @@ use nom::character::complete::one_of; use nom::character::complete::space0; use nom::character::complete::space1; use nom::combinator::eof; +use nom::combinator::peek; use nom::combinator::recognize; use nom::combinator::verify; use nom::multi::many1; @@ -35,26 +36,73 @@ pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s s context.with_additional_node(ContextElement::ExitMatcherNode(ExitMatcherNode { exit_matcher: ChainBehavior::AndParent(Some(&plain_list_end)), })); - let (mut remaining, first_item) = plain_list_item(&parser_context, input)?; - let first_item_indentation = first_item.indentation; - let plain_list_item_matcher = parser_with_context!(plain_list_item)(&parser_context); - let exit_matcher = parser_with_context!(exit_matcher_parser)(&parser_context); + let without_consume_context = + parser_context.with_additional_node(ContextElement::ConsumeTrailingWhitespace(false)); + let with_consume_context = + parser_context.with_additional_node(ContextElement::ConsumeTrailingWhitespace(true)); + let without_consume_matcher = parser_with_context!(plain_list_item)(&without_consume_context); + let with_consume_matcher = parser_with_context!(plain_list_item)(&with_consume_context); + let exit_matcher = parser_with_context!(exit_matcher_parser)(&with_consume_context); let mut children = Vec::new(); - children.push(first_item); - loop { - let exit_contents = exit_matcher(remaining); - if exit_contents.is_ok() { - break; - } + let mut first_item_indentation: Option = None; + let mut remaining = input; - let next_list_item = plain_list_item_matcher(remaining); - match next_list_item { - Ok((remain, next_child)) if next_child.indentation == first_item_indentation => { - children.push(next_child); + loop { + /* + Trailing whitespace belongs to the plain list, not the plain list item + + Possible outcomes: + Don't consume, yes exit matcher + Don't consume, no additional item + Consume, additional item + */ + let last_item_then_exit = tuple((without_consume_matcher, exit_matcher))(remaining); + match last_item_then_exit { + Ok((remain, (item, _exit))) + if item.indentation == *first_item_indentation.get_or_insert(item.indentation) => + { remaining = remain; + children.push(item); + break; } - Ok(_) | Err(_) => break, + Ok(_) | Err(_) => {} }; + + let not_last_item = tuple((with_consume_matcher, peek(without_consume_matcher)))(remaining); + match not_last_item { + Ok((remain, (item, _future_item))) + if item.indentation == *first_item_indentation.get_or_insert(item.indentation) => + { + remaining = remain; + children.push(item); + continue; + } + Ok(_) | Err(_) => {} + }; + + // If its not (don't consume, exit) and its not (consume, see another item) then it must be (don't consume, no additional item) + let last_item_then_exit = without_consume_matcher(remaining); + match last_item_then_exit { + Ok((remain, item)) + if item.indentation == *first_item_indentation.get_or_insert(item.indentation) => + { + remaining = remain; + children.push(item); + break; + } + Ok(_) | Err(_) => { + return Err(nom::Err::Error(CustomError::MyError(MyError( + "Should be unreachable.", + )))); + unreachable!(); + } + }; + } + + if children.is_empty() { + return Err(nom::Err::Error(CustomError::MyError(MyError( + "Plain lists require at least one element.", + )))); } let (remaining, _trailing_ws) = @@ -73,20 +121,27 @@ pub fn plain_list_item<'r, 's>( let (remaining, leading_whitespace) = space0(input)?; // It is fine that we get the indent level using the number of bytes rather than the number of characters because nom's space0 only matches space and tab (0x20 and 0x09) let indent_level = leading_whitespace.len(); - let parser_context = context + let with_consume_context = context + .with_additional_node(ContextElement::ConsumeTrailingWhitespace(true)) + .with_additional_node(ContextElement::ListItem(indent_level)) + .with_additional_node(ContextElement::ExitMatcherNode(ExitMatcherNode { + exit_matcher: ChainBehavior::AndParent(Some(&plain_list_item_end)), + })); + let without_consume_context = context .with_additional_node(ContextElement::ConsumeTrailingWhitespace(false)) .with_additional_node(ContextElement::ListItem(indent_level)) .with_additional_node(ContextElement::ExitMatcherNode(ExitMatcherNode { exit_matcher: ChainBehavior::AndParent(Some(&plain_list_item_end)), })); - let element_matcher = parser_with_context!(element)(&parser_context); - let exit_matcher = parser_with_context!(exit_matcher_parser)(&parser_context); + let element_matcher = parser_with_context!(element)(&with_consume_context); + let exit_matcher = parser_with_context!(exit_matcher_parser)(&with_consume_context); let (remaining, bull) = verify(bullet, |bull: &str| bull != "*" || indent_level > 0)(remaining)?; let maybe_contentless_item: Res<&str, &str> = alt((eof, line_ending))(remaining); match maybe_contentless_item { Ok((rem, _ws)) => { + // TODO: do we need to consume if this isn't the last item? let source = get_consumed(input, rem); return Ok(( rem, @@ -329,15 +384,17 @@ baz"#; lorem + ipsum -ipsum"#; + +dolar"#; let initial_context: ContextTree<'_, '_> = ContextTree::new(); let document_context = initial_context.with_additional_node(ContextElement::DocumentRoot(input)); let plain_list_matcher = parser_with_context!(plain_list)(&document_context); let (remaining, result) = plain_list_matcher(input).expect("Should parse the plain list successfully."); - assert_eq!(remaining, "ipsum"); + assert_eq!(remaining, "dolar"); assert_eq!( result.get_source(), r#"1. foo @@ -348,6 +405,8 @@ ipsum"#; lorem + ipsum + "# ); diff --git a/toy_language.txt b/toy_language.txt index 843a1e0..6497a29 100644 --- a/toy_language.txt +++ b/toy_language.txt @@ -6,5 +6,7 @@ lorem + ipsum -ipsum + +dolar From 143ac49777f633b174543c392bbba84cbc512c1e Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Fri, 14 Apr 2023 19:32:08 -0400 Subject: [PATCH 2/5] Fix plain list item parser to not consume trailing whitespace on the last element. --- src/parser/plain_list.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 045b924..7259fcd 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -128,13 +128,13 @@ pub fn plain_list_item<'r, 's>( exit_matcher: ChainBehavior::AndParent(Some(&plain_list_item_end)), })); let without_consume_context = context - .with_additional_node(ContextElement::ConsumeTrailingWhitespace(false)) .with_additional_node(ContextElement::ListItem(indent_level)) .with_additional_node(ContextElement::ExitMatcherNode(ExitMatcherNode { exit_matcher: ChainBehavior::AndParent(Some(&plain_list_item_end)), })); - let element_matcher = parser_with_context!(element)(&with_consume_context); + let with_consume_matcher = parser_with_context!(element)(&with_consume_context); + let without_consume_matcher = parser_with_context!(element)(&with_consume_context); let exit_matcher = parser_with_context!(exit_matcher_parser)(&with_consume_context); let (remaining, bull) = verify(bullet, |bull: &str| bull != "*" || indent_level > 0)(remaining)?; @@ -155,9 +155,12 @@ pub fn plain_list_item<'r, 's>( } Err(_) => { let (remaining, _ws) = space1(remaining)?; - // TODO: The problem is we are not capturing trailing whitespace for elements that are before the last element. - let (remaining, (contents, _exit_contents)) = - many_till(element_matcher, exit_matcher)(remaining)?; + let (remaining, (mut contents, (final_element, _exit_contents))) = + many_till( + with_consume_matcher, + tuple((without_consume_matcher, exit_matcher)), + )(remaining)?; + contents.push(final_element); let source = get_consumed(input, remaining); return Ok(( remaining, From 8549a6b0db0c007976736f79cf575d74b94095f5 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Fri, 14 Apr 2023 19:56:54 -0400 Subject: [PATCH 3/5] Fix case where indentation was causing the plain list item matcher to match items it wouldn't use in other parts. --- scripts/compare_parse_all.bash | 2 +- src/parser/plain_list.rs | 103 ++++++++++++++++++++------------- 2 files changed, 64 insertions(+), 41 deletions(-) diff --git a/scripts/compare_parse_all.bash b/scripts/compare_parse_all.bash index 8430d23..2970b3e 100755 --- a/scripts/compare_parse_all.bash +++ b/scripts/compare_parse_all.bash @@ -9,7 +9,7 @@ cd "$DIR" : ${org_dir:="$DIR/../org_mode_samples"} : ${compare_bin:="$DIR/../target/debug/org_compare"} -test_files=$(find $org_dir -type f -name '*.org') +test_files=$(find $org_dir -type f -name '*.org' | sort) cargo build --bin org_compare diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 7259fcd..367098e 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -29,6 +29,7 @@ use nom::combinator::verify; use nom::multi::many1; use nom::multi::many_till; use nom::sequence::tuple; +use tracing::span; #[tracing::instrument(ret, level = "debug")] pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, PlainList<'s>> { @@ -55,48 +56,70 @@ pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s s Don't consume, yes exit matcher Don't consume, no additional item Consume, additional item - */ - let last_item_then_exit = tuple((without_consume_matcher, exit_matcher))(remaining); - match last_item_then_exit { - Ok((remain, (item, _exit))) - if item.indentation == *first_item_indentation.get_or_insert(item.indentation) => - { - remaining = remain; - children.push(item); - break; - } - Ok(_) | Err(_) => {} - }; + */ + { + let span = span!(tracing::Level::DEBUG, "first"); + let _enter = span.enter(); - let not_last_item = tuple((with_consume_matcher, peek(without_consume_matcher)))(remaining); - match not_last_item { - Ok((remain, (item, _future_item))) - if item.indentation == *first_item_indentation.get_or_insert(item.indentation) => - { - remaining = remain; - children.push(item); - continue; - } - Ok(_) | Err(_) => {} - }; + let last_item_then_exit = tuple((without_consume_matcher, exit_matcher))(remaining); + match last_item_then_exit { + Ok((remain, (item, _exit))) + if item.indentation + == *first_item_indentation.get_or_insert(item.indentation) => + { + remaining = remain; + children.push(item); + break; + } + Ok(_) | Err(_) => {} + }; + } - // If its not (don't consume, exit) and its not (consume, see another item) then it must be (don't consume, no additional item) - let last_item_then_exit = without_consume_matcher(remaining); - match last_item_then_exit { - Ok((remain, item)) - if item.indentation == *first_item_indentation.get_or_insert(item.indentation) => - { - remaining = remain; - children.push(item); - break; - } - Ok(_) | Err(_) => { - return Err(nom::Err::Error(CustomError::MyError(MyError( - "Should be unreachable.", - )))); - unreachable!(); - } - }; + { + let span = span!(tracing::Level::DEBUG, "second"); + let _enter = span.enter(); + + let not_last_item = + tuple((with_consume_matcher, peek(without_consume_matcher)))(remaining); + match not_last_item { + Ok((remain, (item, future_item))) + if item.indentation + == *first_item_indentation.get_or_insert(item.indentation) + && future_item.indentation + == *first_item_indentation.get_or_insert(item.indentation) => + { + remaining = remain; + children.push(item); + continue; + } + Ok(_) | Err(_) => {} + }; + } + + { + let span = span!(tracing::Level::DEBUG, "third"); + let _enter = span.enter(); + + // If its not (don't consume, exit) and its not (consume, see another item) then it must be (don't consume, no additional item) + let last_item_then_exit = without_consume_matcher(remaining); + match last_item_then_exit { + Ok((remain, item)) + if item.indentation + == *first_item_indentation.get_or_insert(item.indentation) => + { + remaining = remain; + children.push(item); + break; + } + Ok(_) | Err(_) => { + // TODO: Maybe this is reachable when there are no items at all. + return Err(nom::Err::Error(CustomError::MyError(MyError( + "Should be unreachable.", + )))); + unreachable!(); + } + }; + } } if children.is_empty() { From be64eb92af5b84614738519a699486e7eb97781e Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Fri, 14 Apr 2023 20:17:53 -0400 Subject: [PATCH 4/5] Ran into a bigger issue. --- src/parser/plain_list.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 367098e..20d5cf1 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -58,6 +58,7 @@ pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s s Consume, additional item */ { + // Don't consume, yes exit matcher let span = span!(tracing::Level::DEBUG, "first"); let _enter = span.enter(); @@ -76,6 +77,7 @@ pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s s } { + // Consume, additional item let span = span!(tracing::Level::DEBUG, "second"); let _enter = span.enter(); @@ -97,10 +99,10 @@ pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s s } { + // Don't consume, no additional item let span = span!(tracing::Level::DEBUG, "third"); let _enter = span.enter(); - // If its not (don't consume, exit) and its not (consume, see another item) then it must be (don't consume, no additional item) let last_item_then_exit = without_consume_matcher(remaining); match last_item_then_exit { Ok((remain, item)) @@ -157,7 +159,7 @@ pub fn plain_list_item<'r, 's>( })); let with_consume_matcher = parser_with_context!(element)(&with_consume_context); - let without_consume_matcher = parser_with_context!(element)(&with_consume_context); + let without_consume_matcher = parser_with_context!(element)(&without_consume_context); let exit_matcher = parser_with_context!(exit_matcher_parser)(&with_consume_context); let (remaining, bull) = verify(bullet, |bull: &str| bull != "*" || indent_level > 0)(remaining)?; From 412bdcda288b4e2f527a20ea6690cdca3f9c005c Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Fri, 14 Apr 2023 23:56:55 -0400 Subject: [PATCH 5/5] Fix problem with detecting end of plain list item. The problem is line indented lte does not match because we aren't consuming the whitespace that the plain list would consume. --- src/parser/plain_list.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 20d5cf1..133015c 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -28,6 +28,8 @@ use nom::combinator::recognize; use nom::combinator::verify; use nom::multi::many1; use nom::multi::many_till; +use nom::sequence::preceded; +use nom::sequence::terminated; use nom::sequence::tuple; use tracing::span; @@ -180,11 +182,16 @@ pub fn plain_list_item<'r, 's>( } Err(_) => { let (remaining, _ws) = space1(remaining)?; - let (remaining, (mut contents, (final_element, _exit_contents))) = - many_till( - with_consume_matcher, - tuple((without_consume_matcher, exit_matcher)), - )(remaining)?; + let (remaining, (mut contents, final_element)) = many_till( + with_consume_matcher, + alt(( + terminated(without_consume_matcher, exit_matcher), + preceded( + peek(tuple((with_consume_matcher, exit_matcher))), + without_consume_matcher, + ), + )), + )(remaining)?; contents.push(final_element); let source = get_consumed(input, remaining); return Ok((