Merge branch 'accuracy_fixes_from_feeding_large_documents'
Some checks failed
rustfmt Build rustfmt has succeeded
rust-test Build rust-test has succeeded
rust-build Build rust-build has failed

This commit is contained in:
Tom Alexander 2023-08-29 17:49:19 -04:00
commit f4ea1b7303
Signed by: talexander
GPG Key ID: D3A179C9A53C0EDE
12 changed files with 274 additions and 44 deletions

View File

@ -75,7 +75,6 @@ fn is_expect_fail(name: &str) -> Option<&str> {
"autogen_greater_element_drawer_drawer_with_headline_inside" => Some("Apparently lines with :end: become their own paragraph. This odd behavior needs to be investigated more."),
"autogen_element_container_priority_footnote_definition_dynamic_block" => Some("Apparently broken begin lines become their own paragraph."),
"autogen_lesser_element_paragraphs_paragraph_with_backslash_line_breaks" => Some("The text we're getting out of the parse tree is already processed to remove line breaks, so our comparison needs to take that into account."),
"autogen_sections_and_headings_empty_section" => Some("We are not yet handling empty sections properly."),
_ => None,
}
}

View File

@ -0,0 +1,3 @@
(dolist (var org-element-affiliated-keywords)
(message "\"%s\"," (downcase var))
)

View File

@ -0,0 +1,15 @@
#+name: foo
#+caption: bar
#+caption: baz
[[file:lorem/ipsum.png]]
#+name: cat
#+foo: dog
[[file:lorem/ipsum.png]]
#+name: cat
#+foo: dog
foo

View File

@ -0,0 +1,52 @@
non-link text
eww://
rmail://
mhe://
irc://
info://
gnus://
docview://
bibtex://
bbdb://
w3m://
doi://
file+sys://
file+emacs://
shell://
news://
mailto://
https://
http://
ftp://
help://
file://
elisp://
randomfakeprotocl://
non-link text
non-link text
eww:
rmail:
mhe:
irc:
info:
gnus:
docview:
bibtex:
bbdb:
w3m:
doi:
file+sys:
file+emacs:
shell:
news:
mailto:
https:
http:
ftp:
help:
file:
elisp:
randomfakeprotocl:
non-link text

View File

@ -0,0 +1,3 @@
mailto:foo@bar.baz.
mailto:foo@bar.baz....

View File

@ -120,7 +120,7 @@ impl<'s> DiffResult<'s> {
indentation = " ".repeat(indentation),
status_text = status_text,
name = self.name,
char_offset = rust_offset,
char_offset = rust_offset + 1,
message = self.message.as_ref().map(|m| m.as_str()).unwrap_or("")
);
for child in self.children.iter() {

View File

@ -283,10 +283,21 @@ fn _heading<'r, 's>(
headline(context, input, parent_stars)?;
let section_matcher = parser_with_context!(section)(context);
let heading_matcher = parser_with_context!(heading(star_count))(context);
let (remaining, children) = many0(alt((
map(heading_matcher, DocumentElement::Heading),
map(section_matcher, DocumentElement::Section),
)))(remaining)?;
let (remaining, maybe_section) =
opt(map(section_matcher, DocumentElement::Section))(remaining)?;
let (remaining, mut children) =
many0(map(heading_matcher, DocumentElement::Heading))(remaining)?;
if let Some(section) = maybe_section {
children.insert(0, section);
}
let remaining = if children.is_empty() {
// Support empty headings
let (remain, _ws) = many0(blank_line)(remaining)?;
remain
} else {
remaining
};
let source = get_consumed(input, remaining);
Ok((
remaining,

View File

@ -12,6 +12,7 @@ use super::fixed_width_area::fixed_width_area;
use super::footnote_definition::footnote_definition;
use super::greater_block::greater_block;
use super::horizontal_rule::horizontal_rule;
use super::keyword::affiliated_keyword;
use super::keyword::keyword;
use super::latex_environment::latex_environment;
use super::lesser_block::comment_block;
@ -62,10 +63,12 @@ fn _element<'r, 's>(
let fixed_width_area_matcher = parser_with_context!(fixed_width_area)(context);
let horizontal_rule_matcher = parser_with_context!(horizontal_rule)(context);
let keyword_matcher = parser_with_context!(keyword)(context);
let affiliated_keyword_matcher = parser_with_context!(affiliated_keyword)(context);
let paragraph_matcher = parser_with_context!(paragraph)(context);
let latex_environment_matcher = parser_with_context!(latex_environment)(context);
let (remaining, mut affiliated_keywords) = many0(keyword_matcher)(input)?;
// TODO: Affiliated keywords cannot be on comments, clocks, headings, inlinetasks, items, node properties, planning, property drawers, sections, and table rows
let (remaining, mut affiliated_keywords) = many0(affiliated_keyword_matcher)(input)?;
let (remaining, mut element) = match alt((
map(plain_list_matcher, Element::PlainList),
map(greater_block_matcher, Element::GreaterBlock),
@ -84,6 +87,7 @@ fn _element<'r, 's>(
map(fixed_width_area_matcher, Element::FixedWidthArea),
map(horizontal_rule_matcher, Element::HorizontalRule),
map(latex_environment_matcher, Element::LatexEnvironment),
map(keyword_matcher, Element::Keyword),
))(remaining)
{
the_ok @ Ok(_) => the_ok,
@ -93,12 +97,12 @@ fn _element<'r, 's>(
the_ok @ Ok(_) => the_ok,
Err(_) => {
affiliated_keywords.clear();
map(keyword_matcher, Element::Keyword)(input)
map(affiliated_keyword_matcher, Element::Keyword)(input)
}
}
} else {
affiliated_keywords.clear();
map(keyword_matcher, Element::Keyword)(input)
map(affiliated_keyword_matcher, Element::Keyword)(input)
}
}
}?;

View File

@ -15,7 +15,8 @@ use crate::error::Res;
use crate::parser::object::Entity;
use crate::parser::util::get_consumed;
const ENTITIES: [&'static str; 413] = [
// TODO: Make this a user-provided variable corresponding to elisp's org-entities
const ORG_ENTITIES: [&'static str; 413] = [
"Agrave",
"agrave",
"Aacute",
@ -457,8 +458,7 @@ fn name<'r, 's>(
input: OrgSource<'s>,
) -> Res<OrgSource<'s>, OrgSource<'s>> {
// TODO: This should be defined by org-entities and optionally org-entities-user
for entity in ENTITIES {
// foo
for entity in ORG_ENTITIES {
let result = tag_no_case::<_, _, CustomError<_>>(entity)(input);
match result {
Ok((remaining, ent)) => {

View File

@ -2,6 +2,8 @@ use nom::branch::alt;
use nom::bytes::complete::is_not;
use nom::bytes::complete::tag;
use nom::bytes::complete::tag_no_case;
use nom::bytes::complete::take_while1;
use nom::character::complete::anychar;
use nom::character::complete::line_ending;
use nom::character::complete::space0;
use nom::character::complete::space1;
@ -9,14 +11,24 @@ use nom::combinator::eof;
use nom::combinator::not;
use nom::combinator::peek;
use nom::combinator::recognize;
use nom::multi::many_till;
use nom::sequence::tuple;
use super::org_source::BracketDepth;
use super::org_source::OrgSource;
use super::Context;
use crate::error::CustomError;
use crate::error::MyError;
use crate::error::Res;
use crate::parser::util::start_of_line;
use crate::parser::Keyword;
const ORG_ELEMENT_AFFILIATED_KEYWORDS: [&'static str; 13] = [
"caption", "data", "header", "headers", "label", "name", "plot", "resname", "result",
"results", "source", "srcname", "tblname",
];
const ORG_ELEMENT_DUAL_KEYWORDS: [&'static str; 2] = ["caption", "results"];
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
pub fn keyword<'r, 's>(
_context: Context<'r, 's>,
@ -41,3 +53,112 @@ pub fn keyword<'r, 's>(
},
))
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
pub fn affiliated_keyword<'r, 's>(
_context: Context<'r, 's>,
input: OrgSource<'s>,
) -> Res<OrgSource<'s>, Keyword<'s>> {
start_of_line(input)?;
// TODO: When key is a member of org-element-parsed-keywords, value can contain the standard set objects, excluding footnote references.
let (remaining, rule) = recognize(tuple((
space0,
tag("#+"),
affiliated_key,
tag(":"),
alt((recognize(tuple((space1, is_not("\r\n")))), space0)),
alt((line_ending, eof)),
)))(input)?;
Ok((
remaining,
Keyword {
source: rule.into(),
},
))
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn affiliated_key<'s>(input: OrgSource<'s>) -> Res<OrgSource<'s>, OrgSource<'s>> {
alt((
recognize(tuple((dual_affiliated_key, tag("["), optval, tag("]")))),
plain_affiliated_key,
export_keyword,
))(input)
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn plain_affiliated_key<'s>(input: OrgSource<'s>) -> Res<OrgSource<'s>, OrgSource<'s>> {
for keyword in ORG_ELEMENT_AFFILIATED_KEYWORDS {
let result = tag_no_case::<_, _, CustomError<_>>(keyword)(input);
match result {
Ok((remaining, ent)) => {
return Ok((remaining, ent));
}
Err(_) => {}
}
}
Err(nom::Err::Error(CustomError::MyError(MyError(
"NoKeywordKey".into(),
))))
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn dual_affiliated_key<'s>(input: OrgSource<'s>) -> Res<OrgSource<'s>, OrgSource<'s>> {
for keyword in ORG_ELEMENT_DUAL_KEYWORDS {
let result = tag_no_case::<_, _, CustomError<_>>(keyword)(input);
match result {
Ok((remaining, ent)) => {
return Ok((remaining, ent));
}
Err(_) => {}
}
}
Err(nom::Err::Error(CustomError::MyError(MyError(
"NoKeywordKey".into(),
))))
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn optval<'s>(input: OrgSource<'s>) -> Res<OrgSource<'s>, OrgSource<'s>> {
recognize(many_till(
anychar,
peek(optval_end(input.get_bracket_depth())),
))(input)
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
const fn optval_end(
starting_bracket_depth: BracketDepth,
) -> impl for<'s> Fn(OrgSource<'s>) -> Res<OrgSource<'s>, OrgSource<'s>> {
move |input: OrgSource<'_>| _optval_end(input, starting_bracket_depth)
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn _optval_end<'s>(
input: OrgSource<'s>,
starting_bracket_depth: BracketDepth,
) -> Res<OrgSource<'s>, OrgSource<'s>> {
let current_depth = input.get_bracket_depth() - starting_bracket_depth;
if current_depth < 0 {
// This shouldn't be possible because if depth is 0 then a closing bracket should end the opval.
unreachable!("Exceeded optval bracket depth.")
}
if current_depth == 0 {
let close_bracket = tag::<_, _, CustomError<_>>("]")(input);
if close_bracket.is_ok() {
return close_bracket;
}
}
tag("\n")(input)
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
fn export_keyword<'s>(input: OrgSource<'s>) -> Res<OrgSource<'s>, OrgSource<'s>> {
recognize(tuple((
tag_no_case("attr_"),
take_while1(|c: char| c.is_alphanumeric() || "-_".contains(c)),
)))(input)
}

View File

@ -1,5 +1,6 @@
use nom::bytes::complete::tag;
use nom::character::complete::anychar;
use nom::character::complete::space0;
use nom::combinator::not;
use nom::combinator::opt;
use nom::combinator::peek;
@ -24,6 +25,7 @@ pub fn org_macro<'r, 's>(
let (remaining, macro_name) = org_macro_name(context, remaining)?;
let (remaining, macro_args) = opt(parser_with_context!(org_macro_args)(context))(remaining)?;
let (remaining, _) = tag("}}}")(remaining)?;
let (remaining, _trailing_whitespace) = space0(remaining)?;
let source = get_consumed(input, remaining);
Ok((

View File

@ -7,6 +7,7 @@ use nom::character::complete::one_of;
use nom::combinator::eof;
use nom::combinator::peek;
use nom::combinator::recognize;
use nom::combinator::verify;
use nom::multi::many_till;
use super::org_source::OrgSource;
@ -23,6 +24,33 @@ use crate::parser::util::exit_matcher_parser;
use crate::parser::util::get_consumed;
use crate::parser::util::WORD_CONSTITUENT_CHARACTERS;
// TODO: Make this a user-provided variable corresponding to elisp's org-link-parameters
const ORG_LINK_PARAMETERS: [&'static str; 23] = [
"id",
"eww",
"rmail",
"mhe",
"irc",
"info",
"gnus",
"docview",
"bibtex",
"bbdb",
"w3m",
"doi",
"file+sys",
"file+emacs",
"shell",
"news",
"mailto",
"https",
"http",
"ftp",
"help",
"file",
"elisp",
];
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
pub fn plain_link<'r, 's>(
context: Context<'r, 's>,
@ -73,36 +101,19 @@ pub fn protocol<'r, 's>(
input: OrgSource<'s>,
) -> Res<OrgSource<'s>, OrgSource<'s>> {
// TODO: This should be defined by org-link-parameters
let (remaining, proto) = alt((
alt((
tag_no_case("id"),
tag_no_case("eww"),
tag_no_case("rmail"),
tag_no_case("mhe"),
tag_no_case("irc"),
tag_no_case("info"),
tag_no_case("gnus"),
tag_no_case("docview"),
tag_no_case("bibtex"),
tag_no_case("bbdb"),
tag_no_case("w3m"),
)),
alt((
tag_no_case("doi"),
tag_no_case("file+sys"),
tag_no_case("file+emacs"),
tag_no_case("shell"),
tag_no_case("news"),
tag_no_case("mailto"),
tag_no_case("https"),
tag_no_case("http"),
tag_no_case("ftp"),
tag_no_case("help"),
tag_no_case("file"),
tag_no_case("elisp"),
)),
))(input)?;
Ok((remaining, proto))
for link_parameter in ORG_LINK_PARAMETERS {
let result = tag_no_case::<_, _, CustomError<_>>(link_parameter)(input);
match result {
Ok((remaining, ent)) => {
return Ok((remaining, ent));
}
Err(_) => {}
}
}
Err(nom::Err::Error(CustomError::MyError(MyError(
"NoLinkProtocol".into(),
))))
}
#[cfg_attr(feature = "tracing", tracing::instrument(ret, level = "debug"))]
@ -119,7 +130,11 @@ fn path_plain<'r, 's>(
let exit_matcher = parser_with_context!(exit_matcher_parser)(&parser_context);
let (remaining, path) = recognize(many_till(anychar, peek(exit_matcher)))(input)?;
let (remaining, path) = recognize(verify(
many_till(anychar, peek(exit_matcher)),
|(children, _exit_contents)| !children.is_empty(),
))(input)?;
Ok((remaining, path))
}
@ -128,5 +143,10 @@ fn path_plain_end<'r, 's>(
_context: Context<'r, 's>,
input: OrgSource<'s>,
) -> Res<OrgSource<'s>, OrgSource<'s>> {
recognize(one_of(" \t\r\n()[]<>"))(input)
recognize(many_till(
verify(anychar, |c| {
*c != '/' && (c.is_ascii_punctuation() || c.is_whitespace())
}),
one_of(" \t\r\n()[]<>"),
))(input)
}