From 8cd0e4ec638cecc26934c607adf9c7c332e1573d Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Sun, 24 Sep 2023 02:58:32 -0400 Subject: [PATCH 1/2] Optimize scanning for in-buffer settings by scanning forward for possible keywords. Previously we stepped through the document character by character which involved a lot of extra processing inside OrgSource. By scanning for possible keywords, we can skip many of the intermediate steps. --- src/parser/in_buffer_settings.rs | 45 +++++++++++++++++++------ src/parser/org_source.rs | 56 ++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 10 deletions(-) diff --git a/src/parser/in_buffer_settings.rs b/src/parser/in_buffer_settings.rs index 40f8505..824c2da 100644 --- a/src/parser/in_buffer_settings.rs +++ b/src/parser/in_buffer_settings.rs @@ -1,17 +1,15 @@ use nom::branch::alt; use nom::bytes::complete::is_not; use nom::bytes::complete::tag_no_case; -use nom::character::complete::anychar; +use nom::bytes::complete::take_until; use nom::character::complete::space1; -use nom::combinator::map; -use nom::multi::many0; -use nom::multi::many_till; use nom::multi::separated_list0; use super::keyword::filtered_keyword; use super::keyword_todo::todo_keywords; use super::OrgSource; use crate::context::HeadlineLevelFilter; +use crate::error::CustomError; use crate::error::Res; use crate::types::Keyword; use crate::GlobalSettings; @@ -20,13 +18,40 @@ use crate::GlobalSettings; pub(crate) fn scan_for_in_buffer_settings<'s>( input: OrgSource<'s>, ) -> Res, Vec>> { - // TODO: Optimization idea: since this is slicing the OrgSource at each character, it might be more efficient to do a parser that uses a search function like take_until, and wrap it in a function similar to consumed but returning the input along with the normal output, then pass all of that into a verify that confirms we were at the start of a line using the input we just returned. + // TODO: Write some tests to make sure this is functioning properly. - let keywords = many0(map( - many_till(anychar, filtered_keyword(in_buffer_settings_key)), - |(_, kw)| kw, - ))(input); - keywords + let mut keywords = Vec::new(); + let mut remaining = input; + loop { + // Skip text until possible in_buffer_setting + let start_of_pound = take_until::<_, _, CustomError<_>>("#+")(remaining); + let start_of_pound = if let Ok((start_of_pound, _)) = start_of_pound { + start_of_pound + } else { + break; + }; + // Go backwards to the start of the line and run the filtered_keyword parser + let start_of_line = start_of_pound.get_start_of_line(); + + let (remain, maybe_kw) = match filtered_keyword(in_buffer_settings_key)(start_of_line) { + Ok((remain, kw)) => (remain, Some(kw)), + Err(_) => { + let end_of_line = take_until::<_, _, CustomError<_>>("\n")(start_of_pound); + if let Ok((end_of_line, _)) = end_of_line { + (end_of_line, None) + } else { + break; + } + } + }; + + if let Some(kw) = maybe_kw { + keywords.push(kw); + } + remaining = remain; + } + + Ok((remaining, keywords)) } #[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))] diff --git a/src/parser/org_source.rs b/src/parser/org_source.rs index 9f8c0f6..252a5fc 100644 --- a/src/parser/org_source.rs +++ b/src/parser/org_source.rs @@ -1,6 +1,7 @@ use std::ops::RangeBounds; use nom::Compare; +use nom::FindSubstring; use nom::InputIter; use nom::InputLength; use nom::InputTake; @@ -77,6 +78,55 @@ impl<'s> OrgSource<'s> { self.slice(..(other.start - self.start)) } + pub(crate) fn get_start_of_line(&self) -> OrgSource<'s> { + let skipped_text = self.text_since_line_break(); + let mut bracket_depth = self.bracket_depth; + let mut brace_depth = self.brace_depth; + let mut parenthesis_depth = self.parenthesis_depth; + // Since we're going backwards, this does the opposite. + for byte in skipped_text.bytes() { + match byte { + b'\n' => { + panic!("Should not hit a line break when only going back to the start of the line."); + } + b'[' => { + bracket_depth -= 1; + } + b']' => { + bracket_depth += 1; + } + b'{' => { + brace_depth -= 1; + } + b'}' => { + brace_depth += 1; + } + b'(' => { + parenthesis_depth -= 1; + } + b')' => { + parenthesis_depth += 1; + } + _ => {} + }; + } + + OrgSource { + full_source: self.full_source, + start: self.start_of_line, + end: self.end, + start_of_line: self.start_of_line, + preceding_character: if self.start_of_line > 0 { + Some('\n') + } else { + None + }, + bracket_depth, + brace_depth, + parenthesis_depth, + } + } + pub(crate) fn get_bracket_depth(&self) -> BracketDepth { self.bracket_depth } @@ -310,6 +360,12 @@ impl<'s> InputTakeAtPosition for OrgSource<'s> { } } +impl<'n, 's> FindSubstring<&'n str> for OrgSource<'s> { + fn find_substring(&self, substr: &'n str) -> Option { + Into::<&str>::into(self).find(substr) + } +} + pub(crate) fn convert_error<'a, I: Into>>( err: nom::Err, ) -> nom::Err> { From cd781a7dcf0b231d219d3af9e65464f9b25b9411 Mon Sep 17 00:00:00 2001 From: Tom Alexander Date: Sun, 24 Sep 2023 03:09:51 -0400 Subject: [PATCH 2/2] Add simple test to prove the scan for in-buffer settings is still working. --- src/parser/in_buffer_settings.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/parser/in_buffer_settings.rs b/src/parser/in_buffer_settings.rs index 824c2da..58976ab 100644 --- a/src/parser/in_buffer_settings.rs +++ b/src/parser/in_buffer_settings.rs @@ -113,3 +113,33 @@ pub(crate) fn apply_in_buffer_settings<'g, 's, 'sf>( Ok(new_settings) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn scan_test() -> Result<(), Box> { + let input = OrgSource::new( + r#" +foo + #+archive: bar + +baz #+category: lorem + +#+label: ipsum + +#+todo: dolar +cat +"#, + ); + let (remaining, settings) = scan_for_in_buffer_settings(input)?; + assert_eq!(Into::<&str>::into(remaining), "cat\n"); + let keys: Vec<_> = settings.iter().map(|kw| kw.key).collect(); + // category is skipped because it is not the first non-whitespace on the line. + // + // label is skipped because it is not an in-buffer setting. + assert_eq!(keys, vec!["archive", "todo"]); + Ok(()) + } +}