From 72d5f8f35cb22d09fe89eb0c5591ea57d4143d8d Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Tue, 15 Aug 2023 01:28:19 -0400 Subject: [PATCH 1/7] Make a new naive implementation of plain_list. --- src/parser/plain_list.rs | 105 +++++++-------------------------------- 1 file changed, 18 insertions(+), 87 deletions(-) diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 685701c..62fb7e0 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -14,8 +14,6 @@ use nom::multi::many_till; use nom::sequence::preceded; use nom::sequence::terminated; use nom::sequence::tuple; -#[cfg(feature = "tracing")] -use tracing::span; use super::greater_element::PlainList; use super::greater_element::PlainListItem; @@ -42,97 +40,28 @@ pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s s class: ExitClass::Beta, exit_matcher: &plain_list_end, })); - let without_consume_context = - parser_context.with_additional_node(ContextElement::ConsumeTrailingWhitespace(false)); - let with_consume_context = - parser_context.with_additional_node(ContextElement::ConsumeTrailingWhitespace(true)); - let without_consume_matcher = parser_with_context!(plain_list_item)(&without_consume_context); - let with_consume_matcher = parser_with_context!(plain_list_item)(&with_consume_context); - let exit_matcher = parser_with_context!(exit_matcher_parser)(&with_consume_context); let mut children = Vec::new(); let mut first_item_indentation: Option = None; let mut remaining = input; loop { - /* - Trailing whitespace belongs to the plain list, not the plain list item - - Possible outcomes: - Don't consume, yes exit matcher - Don't consume, no additional item - Consume, additional item - */ - { - // Don't consume, yes exit matcher - #[cfg(feature = "tracing")] - let span = span!(tracing::Level::DEBUG, "first"); - #[cfg(feature = "tracing")] - let _enter = span.enter(); - - let last_item_then_exit = tuple((without_consume_matcher, exit_matcher))(remaining); - match last_item_then_exit { - Ok((remain, (item, _exit))) - if item.indentation - == *first_item_indentation.get_or_insert(item.indentation) => - { - remaining = remain; - children.push(item); - break; - } - Ok(_) | Err(_) => {} - }; - } - - { - // Consume, additional item - #[cfg(feature = "tracing")] - let span = span!(tracing::Level::DEBUG, "second"); - #[cfg(feature = "tracing")] - let _enter = span.enter(); - - let not_last_item = - tuple((with_consume_matcher, peek(without_consume_matcher)))(remaining); - match not_last_item { - Ok((remain, (item, future_item))) - if item.indentation - == *first_item_indentation.get_or_insert(item.indentation) - && future_item.indentation - == *first_item_indentation.get_or_insert(item.indentation) => - { - remaining = remain; - children.push(item); - continue; - } - Ok(_) | Err(_) => {} - }; - } - - { - // Don't consume, no additional item - #[cfg(feature = "tracing")] - let span = span!(tracing::Level::DEBUG, "third"); - #[cfg(feature = "tracing")] - let _enter = span.enter(); - - let last_item_then_exit = without_consume_matcher(remaining); - match last_item_then_exit { - Ok((remain, item)) - if item.indentation - == *first_item_indentation.get_or_insert(item.indentation) => - { - remaining = remain; - children.push(item); - break; - } - Ok(_) | Err(_) => { - // TODO: Maybe this is reachable when there are no items at all. - return Err(nom::Err::Error(CustomError::MyError(MyError( - "Should be unreachable.", - )))); - // unreachable!(); - } - }; + let maybe_exit = parser_with_context!(exit_matcher_parser)(&parser_context)(remaining); + if maybe_exit.is_ok() { + break; } + let list_item = parser_with_context!(plain_list_item)(&parser_context)(remaining); + match list_item { + Ok((remain, item)) + if item.indentation == *first_item_indentation.get_or_insert(item.indentation) => + { + remaining = remain; + children.push(item); + continue; + } + Ok(_) | Err(_) => { + break; + } + }; } if children.is_empty() { @@ -141,6 +70,8 @@ pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s s )))); } + // TODO: trailing whitespace + let source = get_consumed(input, remaining); Ok((remaining, PlainList { source, children })) } From 7df393f31d1eb8f365317ee8110627566384a830 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Tue, 15 Aug 2023 02:02:15 -0400 Subject: [PATCH 2/7] Make a new naive implementation of plain_list_item. Still need to update plain_list_item_end and handle the whitespace ownership issues, but starting from a simplified state will help. --- src/parser/plain_list.rs | 71 ++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 42 deletions(-) diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 62fb7e0..08b7e89 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -85,29 +85,12 @@ pub fn plain_list_item<'r, 's>( let (remaining, leading_whitespace) = space0(input)?; // It is fine that we get the indent level using the number of bytes rather than the number of characters because nom's space0 only matches space and tab (0x20 and 0x09) let indent_level = leading_whitespace.len(); - let with_consume_context = context - .with_additional_node(ContextElement::ConsumeTrailingWhitespace(true)) - .with_additional_node(ContextElement::ListItem(indent_level)) - .with_additional_node(ContextElement::ExitMatcherNode(ExitMatcherNode { - class: ExitClass::Beta, - exit_matcher: &plain_list_item_end, - })); - let without_consume_context = context - .with_additional_node(ContextElement::ListItem(indent_level)) - .with_additional_node(ContextElement::ExitMatcherNode(ExitMatcherNode { - class: ExitClass::Beta, - exit_matcher: &plain_list_item_end, - })); - - let with_consume_matcher = parser_with_context!(element(true))(&with_consume_context); - let without_consume_matcher = parser_with_context!(element(true))(&without_consume_context); - let exit_matcher = parser_with_context!(exit_matcher_parser)(&with_consume_context); let (remaining, bull) = verify(bullet, |bull: &str| bull != "*" || indent_level > 0)(remaining)?; + let maybe_contentless_item: Res<&str, &str> = alt((eof, line_ending))(remaining); match maybe_contentless_item { Ok((rem, _ws)) => { - // TODO: do we need to consume if this isn't the last item? let source = get_consumed(input, rem); return Ok(( rem, @@ -119,31 +102,35 @@ pub fn plain_list_item<'r, 's>( }, )); } - Err(_) => { - let (remaining, _ws) = space1(remaining)?; - let (remaining, (mut contents, final_element)) = many_till( - &with_consume_matcher, - alt(( - terminated(&without_consume_matcher, exit_matcher), - preceded( - peek(tuple((&with_consume_matcher, exit_matcher))), - &without_consume_matcher, - ), - )), - )(remaining)?; - contents.push(final_element); - let source = get_consumed(input, remaining); - return Ok(( - remaining, - PlainListItem { - source, - indentation: indent_level, - bullet: bull, - children: contents, - }, - )); - } + Err(_) => {} }; + + let (remaining, _ws) = space1(remaining)?; + let parser_context = context + .with_additional_node(ContextElement::ListItem(indent_level)) + .with_additional_node(ContextElement::ExitMatcherNode(ExitMatcherNode { + class: ExitClass::Beta, + exit_matcher: &plain_list_item_end, + })); + + let (remaining, (children, _exit_contents)) = verify( + many_till( + parser_with_context!(element(true))(&parser_context), + parser_with_context!(exit_matcher_parser)(&parser_context), + ), + |(children, _exit_contents)| !children.is_empty(), + )(remaining)?; + + let source = get_consumed(input, remaining); + return Ok(( + remaining, + PlainListItem { + source, + indentation: indent_level, + bullet: bull, + children, + }, + )); } #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] From 823c33ef8efbca9d76adcfb74ef0dcbc49be7638 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Wed, 16 Aug 2023 16:37:14 -0400 Subject: [PATCH 3/7] Reduce use of expect in main.rs --- src/main.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/main.rs b/src/main.rs index 7c6323c..9f76e49 100644 --- a/src/main.rs +++ b/src/main.rs @@ -39,16 +39,15 @@ fn main_body() -> Result<(), Box> { run_compare( std::env::args() .nth(1) - .expect("Pass a single file into this script."), + .ok_or("Pass a single file into this script.")?, ) } #[cfg(feature = "compare")] fn run_compare>(todo_org_path: P) -> Result<(), Box> { - let org_contents = std::fs::read_to_string(todo_org_path.as_ref()).expect("Read org file."); + let org_contents = std::fs::read_to_string(todo_org_path.as_ref())?; let (remaining, rust_parsed) = document(org_contents.as_str()).expect("Org Parse failure"); - let org_sexp = - emacs_parse_org_document(todo_org_path.as_ref()).expect("Use emacs to parse org file."); + let org_sexp = emacs_parse_org_document(todo_org_path.as_ref())?; let (_remaining, parsed_sexp) = sexp_with_padding(org_sexp.as_str()).expect("Sexp Parse failure"); @@ -57,11 +56,8 @@ fn run_compare>(todo_org_path: P) -> Result<(), Box Date: Wed, 16 Aug 2023 17:09:06 -0400 Subject: [PATCH 4/7] Support blank link in plain_list_item_end, move exit matcher to end of loop in plain_list, and maybe consume trailing whitespace in plain_list_item. --- src/parser/plain_list.rs | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 08b7e89..7311048 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -6,6 +6,7 @@ use nom::character::complete::one_of; use nom::character::complete::space0; use nom::character::complete::space1; use nom::combinator::eof; +use nom::combinator::opt; use nom::combinator::peek; use nom::combinator::recognize; use nom::combinator::verify; @@ -30,6 +31,7 @@ use crate::parser::parser_context::ExitMatcherNode; use crate::parser::util::blank_line; use crate::parser::util::exit_matcher_parser; use crate::parser::util::get_consumed; +use crate::parser::util::maybe_consume_trailing_whitespace_if_not_exiting; use crate::parser::util::start_of_line; #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] @@ -45,10 +47,6 @@ pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s s let mut remaining = input; loop { - let maybe_exit = parser_with_context!(exit_matcher_parser)(&parser_context)(remaining); - if maybe_exit.is_ok() { - break; - } let list_item = parser_with_context!(plain_list_item)(&parser_context)(remaining); match list_item { Ok((remain, item)) @@ -56,12 +54,16 @@ pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s s { remaining = remain; children.push(item); - continue; } Ok(_) | Err(_) => { break; } }; + + let maybe_exit = parser_with_context!(exit_matcher_parser)(&parser_context)(remaining); + if maybe_exit.is_ok() { + break; + } } if children.is_empty() { @@ -121,6 +123,9 @@ pub fn plain_list_item<'r, 's>( |(children, _exit_contents)| !children.is_empty(), )(remaining)?; + let (remaining, _trailing_ws) = + maybe_consume_trailing_whitespace_if_not_exiting(context, remaining)?; + let source = get_consumed(input, remaining); return Ok(( remaining, @@ -159,7 +164,11 @@ fn plain_list_end<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s s #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] fn plain_list_item_end<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s str, &'s str> { - recognize(parser_with_context!(line_indented_lte)(context))(input) + start_of_line(context, input)?; + recognize(tuple(( + opt(blank_line), + parser_with_context!(line_indented_lte)(context), + )))(input) } #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] @@ -169,8 +178,6 @@ fn line_indented_lte<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&' "Not inside a plain list item", ))))?; - start_of_line(context, input)?; - let matched = recognize(verify( tuple((space0::<&str, _>, non_whitespace_character)), // It is fine that we get the indent level using the number of bytes rather than the number of characters because nom's space0 only matches space and tab (0x20 and 0x09) From 7fafbfb6bbce54c3b397167b04f9698cef87e26f Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Wed, 16 Aug 2023 17:37:19 -0400 Subject: [PATCH 5/7] Do not consume whitespace in the final plain list item. --- src/parser/plain_list.rs | 41 ++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/src/parser/plain_list.rs b/src/parser/plain_list.rs index 7311048..4d6e5cf 100644 --- a/src/parser/plain_list.rs +++ b/src/parser/plain_list.rs @@ -42,18 +42,27 @@ pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s s class: ExitClass::Beta, exit_matcher: &plain_list_end, })); + // children stores tuple of (input string, parsed object) so we can re-parse the final item let mut children = Vec::new(); let mut first_item_indentation: Option = None; let mut remaining = input; + // The final list item does not consume trailing blank lines (which instead get consumed by the list). We have three options here: + // + // 1. Parse all items while consuming trailing whitespace, then edit the final item to remove trailing whitespace. + // 2. Parse all items without consuming trailing whitespace, then edit all but the final one to add in the trailing whitespace. + // 3. Re-parse the final item with consume trailing whitespace disabled. + // + // While #3 is the most slow, it also seems to cleanest and involves the least manual mutation of already-parsed objects so I am going with #3 for now, but we should revisit #1 or #2 when the parser is more developed. + loop { let list_item = parser_with_context!(plain_list_item)(&parser_context)(remaining); match list_item { Ok((remain, item)) if item.indentation == *first_item_indentation.get_or_insert(item.indentation) => { + children.push((remaining, item)); remaining = remain; - children.push(item); } Ok(_) | Err(_) => { break; @@ -66,16 +75,31 @@ pub fn plain_list<'r, 's>(context: Context<'r, 's>, input: &'s str) -> Res<&'s s } } - if children.is_empty() { - return Err(nom::Err::Error(CustomError::MyError(MyError( - "Plain lists require at least one element.", - )))); - } + let (final_child_start, _final_item_first_parse) = match children.pop() { + Some(final_child) => final_child, + None => { + return Err(nom::Err::Error(CustomError::MyError(MyError( + "Plain lists require at least one element.", + )))); + } + }; + let final_item_context = + parser_context.with_additional_node(ContextElement::ConsumeTrailingWhitespace(false)); + let (remaining, reparsed_final_item) = + parser_with_context!(plain_list_item)(&final_item_context)(final_child_start)?; + children.push((final_child_start, reparsed_final_item)); - // TODO: trailing whitespace + let (remaining, _trailing_ws) = + maybe_consume_trailing_whitespace_if_not_exiting(context, remaining)?; let source = get_consumed(input, remaining); - Ok((remaining, PlainList { source, children })) + Ok(( + remaining, + PlainList { + source, + children: children.into_iter().map(|(_start, item)| item).collect(), + }, + )) } #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] @@ -109,6 +133,7 @@ pub fn plain_list_item<'r, 's>( let (remaining, _ws) = space1(remaining)?; let parser_context = context + .with_additional_node(ContextElement::ConsumeTrailingWhitespace(true)) .with_additional_node(ContextElement::ListItem(indent_level)) .with_additional_node(ContextElement::ExitMatcherNode(ExitMatcherNode { class: ExitClass::Beta, From 6c9c304f37690e284da7d3b9bbc83b257424be1e Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Wed, 16 Aug 2023 17:39:10 -0400 Subject: [PATCH 6/7] Re-enable disabled test. --- build.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/build.rs b/build.rs index 6aa1e02..5478ee2 100644 --- a/build.rs +++ b/build.rs @@ -75,7 +75,6 @@ fn is_expect_fail(name: &str) -> Option<&str> { "element_container_priority_footnote_definition_dynamic_block" => Some("Apparently broken begin lines become their own paragraph."), "paragraphs_paragraph_with_backslash_line_breaks" => Some("The text we're getting out of the parse tree is already processed to remove line breaks, so our comparison needs to take that into account."), "export_snippet_paragraph_break_precedence" => Some("The latest code for org-mode is matching the export snippet without the closing @@."), // https://list.orgmode.org/orgmode/fb61ea28-f004-4c25-adf7-69fc55683ed4@app.fastmail.com/T/#u - "plain_lists_trailing_whitespace_ownership" => Some("My plain list implementation is currently broken."), _ => None, } } From 8e95ce63685f74a75288e33ecb87e4073fda13e8 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Wed, 16 Aug 2023 20:24:00 -0400 Subject: [PATCH 7/7] Add notes about plain list trailing blank line ownership investigation. --- notes/plain_list_ownership_notes.org | 128 +++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 notes/plain_list_ownership_notes.org diff --git a/notes/plain_list_ownership_notes.org b/notes/plain_list_ownership_notes.org new file mode 100644 index 0000000..79e862f --- /dev/null +++ b/notes/plain_list_ownership_notes.org @@ -0,0 +1,128 @@ +* Test 1 +** Source +#+begin_src org + 1. foo + + 1. bar + + 2. baz + + 2. lorem + + ipsum +#+end_src +** Ownership +This table is just showing ownership for the plain list items, not the containing plain list nor the elements inside each item. + +| Plain List *Item* | Owns trailing blank lines | +|------------------------+---------------------------| +| foo (includes bar baz) | Yes | +| bar | Yes | +| baz | Yes | +| lorem | No | +** Analysis +This seems to imply that plain list items own their trailing blank lines except for the final plain list item in the top-most plain list which does not own its trailing blank lines. +* Test 2 +** Source +#+begin_src org + 1. foo + + bar + + 1. baz + + lorem + + ipsum + + + dolar +#+end_src +** Ownership +This table is just showing ownership for the plain list items, not the containing plain list nor the elements inside each item. + +| Plain List *Item* | Owns trailing blank lines | +|--------------------------+---------------------------| +| foo -> ipsum (inclusive) | No | +| baz lorem | No | +** Analysis +This shows that the final plain list item in a nested plain list (baz lorem) does not own its trailing blank lines which conflicts with "baz" from Test 1. +* Test 3 +** Source +#+begin_src org + 1. foo + + 1. bar + + baz + + 2. lorem + + ipsum +#+end_src +** Ownership +| Plain List *Item* | Owns trailing blank lines | +|------------------------+---------------------------| +| foo (includes bar baz) | Yes | +| bar baz | Yes | +| lorem | No | +** Analysis +This was to test if having an extra paragraph in the final list item in the nested list changes the behavior. The behavior is consistent with Test 1, so the extra paragraph is not the cause of the discrepancy. +* Test 4 +** Source +#+begin_src org + 1. foo + + 1. bar + + 2. baz + + candy + + 2. lorem + + ipsum +#+end_src +** Ownership +| Plain List *Item* | Owns trailing blank lines | +|----------------------------------+---------------------------| +| foo (includes bar baz and candy) | Yes | +| bar | Yes | +| baz | No | +| lorem | No | +** Analysis +This was to test if putting a non-plain-list element at the end of foo changes the ownership of blank lines. baz changed to no longer owning its trailing whitespace. + +This seems to imply that list items own their trailing whitespace except for the final item unless that list item is at the end of a list item. +* Test 5 +** Source +#+begin_src org + 1. foo + + 1. bar + + 2. baz + + candy + + 2. lorem + + 1. cat + + 2. dog + + ipsum +#+end_src +** Ownership +| Plain List *Item* | Owns trailing blank lines | +|----------------------------------+---------------------------| +| foo (includes bar baz and candy) | Yes | +| bar | Yes | +| baz | No | +| lorem (includes cat and dog) | No | +| cat | Yes | +| dog | No | +** Analysis +This breaks the theory that the final list item nested at the end of a list item gets to own its trailing blank lines since dog does not own its blank lines despite Test 1's baz owning its blank lines. + +New Theory: final list items only own their blank lines if they are nested at the end of a non-final list item.