commit ee9e6297a606c9f464077f3ea972de88f70c0ff3 Author: Tom Alexander Date: Fri Jul 15 23:26:49 2022 -0400 Initial setup for the parser. diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..5a6ec02a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +**/.git +target +Cargo.lock diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..96ef6c0b --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..6a508a15 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "toy" +version = "0.1.0" +edition = "2021" + +[[bin]] +name = "toy" +path = "src/main.rs" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +log = "0.4.17" +nom = "7.1.1" +pretty_env_logger = "0.4.0" + +[features] diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 00000000..da3d1e78 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,13 @@ +imports_granularity = "Item" + +# In rustfmt 2.0 I will want to adjust these settings. +# +# max_width controls the max length of a line before rustfmt gives up +# but that also scales the length of a bunch of other lines +# automatically due to width_heuristics. I want to find a way to enable +# rustfmt to work on longer lines when necessary without making my +# regular code too wide. 
+#
+# max_width = 100
+# error_on_line_overflow = true
+# width_heuristics = "Off"
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 00000000..9d2e1d5f
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,14 @@
+use nom::multi::many1;
+
+use crate::parser::paragraph;
+
+mod parser;
+
+/// Sample document, embedded at compile time from `toy_language.txt` in the crate root.
+const TEST_DOC: &str = include_str!("../toy_language.txt");
+
+/// Initializes logging, echoes the sample document, then pretty-prints the
+/// result of parsing it as one or more paragraphs.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    pretty_env_logger::init();
+    println!("{}\n\n\n", TEST_DOC);
+    // The parse result (success or error) is only printed, never propagated.
+    // `paragraph` is still a `todo!()` stub, so this call panics when reached.
+    println!("{:#?}", many1(paragraph)(TEST_DOC));
+    Ok(())
+}
diff --git a/src/parser/bold_parser.rs b/src/parser/bold_parser.rs
new file mode 100644
index 00000000..8a8a7fd6
--- /dev/null
+++ b/src/parser/bold_parser.rs
@@ -0,0 +1,16 @@
+//! Text between asterisks to make it bold.
+use super::failable_sequence::failable_sequence;
+use super::nom_context::NomContext;
+use super::text::bold_end;
+use super::text::bold_start;
+use super::text_element_parser::text_element;
+use nom::branch::alt;
+use nom::combinator::map;
+use nom::combinator::not;
+use nom::combinator::recognize;
+use nom::error::VerboseError;
+use nom::sequence::tuple;
+
+// Generates `pub fn bold(context)`: a sequence opened by `bold_start`, made of
+// `text_element`s, and closed by `bold_end`. The imports above are required
+// because the macro expansion refers to them by bare name.
+
+failable_sequence!(bold, i, context, bold_start, text_element, bold_end);
diff --git a/src/parser/failable_sequence.rs b/src/parser/failable_sequence.rs
new file mode 100644
index 00000000..16f46375
--- /dev/null
+++ b/src/parser/failable_sequence.rs
@@ -0,0 +1,52 @@
+/// Defines a context-aware parser for a delimited sequence of elements.
+///
+/// `failable_sequence!(name, inp, context, begin, element, success)` expands to
+/// `pub fn name(context: &NomContext<F>)`, which returns a parser over `&str`
+/// yielding a `Sequence` whose `contents` is the whole consumed span.
+///
+/// The generated parser matches `begin`, then repeats `element(&new_context)`
+/// until `success` matches. Every element attempt and the closing `success`
+/// attempt is guarded by a negative look-ahead on the context's `fail_matcher`.
+///
+/// * `name` - identifier of the generated function.
+/// * `inp` - identifier used for the generated closure's input parameter.
+/// * `context` - identifier used for the generated function's context parameter.
+/// * `begin` - parser expression for the opening delimiter.
+/// * `element` - function taking `&NomContext<F>` and returning the element parser.
+/// * `success` - parser expression for the closing delimiter.
+///
+/// The expansion names `NomContext`, `VerboseError`, `alt`, `map`, `not`,
+/// `recognize` and `tuple` unqualified, so the invoking module must import them.
+macro_rules! failable_sequence {
+    ($name:ident,$inp:ident,$context:ident,$begin_matcher:expr,$element_matcher:expr,$success_matcher:expr) => {
+        pub fn $name<'b, F>(
+            $context: &'b NomContext<F>,
+        ) -> impl for<'a> FnMut(
+            &'a str,
+        ) -> nom::IResult<
+            &'a str,
+            crate::parser::text::Sequence<'a>,
+            VerboseError<&'a str>,
+        > + 'b
+        where
+            F: for<'a> nom::Parser<&'a str, &'a str, VerboseError<&'a str>>,
+        {
+            // Candidate fail matcher for nested parsers: the inherited fail
+            // matcher OR this sequence's own closing delimiter.
+            let fail_matcher = $context.fail_matcher.clone();
+            let new_fail_matcher = alt((
+                |i| fail_matcher.borrow_mut().parse(i),
+                recognize($success_matcher),
+            ));
+
+            move |$inp: &str| {
+                // Elements inside the sequence are parsed with bold disabled.
+                let new_context = $context.with_no_bold();
+                // let other_new_context = NomContext::with_additional_fail_matcher(
+                //     |i: &str| recognize($success_matcher)(i),
+                //     $context,
+                // );
+                // NOTE(review): `other_new_context` is built but never handed to
+                // `$element_matcher`; elements still receive `new_context`.
+                // Building it also moves `new_fail_matcher` out of this closure's
+                // captures; confirm that is intended for an `FnMut` closure.
+                let other_new_context = super::nom_context::NomContext::new(new_fail_matcher);
+                let element_matcher = recognize($element_matcher(&new_context));
+                let local_fail_matcher = $context.fail_matcher.clone();
+                let ret = map(
+                    // `recognize` discards the structured output and keeps the
+                    // consumed text, which becomes `Sequence::contents`.
+                    recognize(tuple((
+                        $begin_matcher,
+                        nom::multi::many_till(
+                            nom::sequence::preceded(
+                                not(|i| local_fail_matcher.borrow_mut().parse(i)),
+                                element_matcher,
+                            ),
+                            nom::sequence::preceded(
+                                not(|i| local_fail_matcher.borrow_mut().parse(i)),
+                                $success_matcher,
+                            ),
+                        ),
+                    ))),
+                    |s: &str| crate::parser::text::Sequence { contents: s },
+                )($inp)?;
+                Ok(ret)
+            }
+        }
+    };
+}
+
+pub(crate) use failable_sequence;
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
new file mode 100644
index 00000000..a1653cdc
--- /dev/null
+++ b/src/parser/mod.rs
@@ -0,0 +1,7 @@
+mod bold_parser;
+mod failable_sequence;
+mod nom_context;
+mod parser_with_context;
+mod text;
+mod text_element_parser;
+pub use text::paragraph;
diff --git a/src/parser/nom_context.rs b/src/parser/nom_context.rs
new file mode 100644
index 00000000..fb394789
--- /dev/null
+++ b/src/parser/nom_context.rs
@@ -0,0 +1,34 @@
+use nom::error::VerboseError;
+use nom::Parser;
+use std::cell::RefCell;
+use std::rc::Rc;
+
+/// State handed down to context-aware parsers.
+///
+/// `F` is the type of the fail matcher: a parser over `&str` that yields
+/// `&str` with `VerboseError` as its error type.
+#[derive(Clone)]
+pub struct NomContext<F> {
+    /// Parser used as a stop condition: parsers built from this context
+    /// refuse to match at a position where it succeeds. It is shared
+    /// (`Rc<RefCell<_>>`) between a context and the contexts derived from it.
+    pub fail_matcher: Rc<RefCell<F>>,
+
+    /// You can't have nested bolds in org-mode
+    pub can_match_bold: bool,
+    /// Whether a link may be matched. No parser visible here reads it yet.
+    pub can_match_link: bool,
+}
+
+impl<F> NomContext<F>
+where
+    F: for<'a> Parser<&'a str, &'a str, VerboseError<&'a str>>,
+{
+    /// Creates a context around `fail_matcher` with bold and link matching
+    /// both enabled.
+    pub fn new(fail_matcher: F) -> Self {
+        NomContext {
+            fail_matcher: Rc::new(RefCell::new(fail_matcher)),
+            can_match_bold: true,
+            can_match_link: true,
+        }
+    }
+
+    /// Returns a derived context with `can_match_bold` cleared.
+    ///
+    /// The derived context shares this context's fail matcher (only the `Rc`
+    /// handle is cloned) and keeps the current `can_match_link` value.
+    pub fn with_no_bold(&self) -> NomContext<F> {
+        NomContext {
+            fail_matcher: Rc::clone(&self.fail_matcher),
+            can_match_bold: false,
+            can_match_link: self.can_match_link,
+        }
+    }
+}
diff --git a/src/parser/parser_with_context.rs b/src/parser/parser_with_context.rs
new file mode 100644
index 00000000..b6715a67
--- /dev/null
+++ b/src/parser/parser_with_context.rs
@@ -0,0 +1,14 @@
+/// Defines a parser constructor that takes a `NomContext`.
+///
+/// `parser_with_context!(name, Type, inp, context, { body })` expands to
+/// `pub fn name(context: &NomContext<F>)`, which returns the closure
+/// `|inp: &str| body` as a parser yielding `Type`. The closure borrows
+/// `context`, hence the `+ '_` bound on the return type.
+///
+/// The expansion names `NomContext`, `IResult` and `VerboseError`
+/// unqualified, so the invoking module must import them.
+macro_rules! parser_with_context {
+    ($name:ident,$typ:ty,$inp:ident,$context:ident,$fnbody:block) => {
+        pub fn $name<F>(
+            $context: &NomContext<F>,
+        ) -> impl for<'a> FnMut(&'a str) -> IResult<&'a str, $typ, VerboseError<&'a str>> + '_
+        where
+            F: for<'a> nom::Parser<&'a str, &'a str, VerboseError<&'a str>>,
+        {
+            |$inp: &str| $fnbody
+        }
+    };
+}
+
+pub(crate) use parser_with_context;
diff --git a/src/parser/text.rs b/src/parser/text.rs
new file mode 100644
index 00000000..07709bab
--- /dev/null
+++ b/src/parser/text.rs
@@ -0,0 +1,134 @@
+/*
+
+hypothetical link:
+fn link = many_till(text_element, link_end)
+
+but what if you start a bold?
+fn bold = many_till(text_element, bold_end) could eat the link_end
+
+Do I pass along break-conditions? Passing link_end into bold's parser?
+
+I'll try a very simple language first where asterisks always start/end bold and links are just between [ and ]. Paragraphs will have a blank line between them.
+
+*/
+use nom::bytes::complete::tag;
+use nom::character::complete::alphanumeric1;
+use nom::character::complete::line_ending;
+use nom::character::complete::space1;
+use nom::combinator::map;
+use nom::combinator::recognize;
+use nom::error::VerboseError;
+use nom::multi::many_till;
+use nom::sequence::tuple;
+use nom::IResult;
+
+/// Result type shared by the parsers in this module: a nom `IResult` over
+/// input `T` yielding `U`, with `VerboseError<T>` as the error type.
+pub type Res<T, U> = IResult<T, U, VerboseError<T>>;
+
+/// One parsed piece of text; each variant wraps the struct of the same name.
+#[derive(Debug)]
+pub enum TextElement<'a> {
+    Span(Span<'a>),
+    Space(Space<'a>),
+    LineBreak(LineBreak<'a>),
+    Symbol(Symbol<'a>),
+    Bold(Bold<'a>),
+    Link(Link<'a>),
+}
+
+/// Text matched by [`span`] (nom's `alphanumeric1`).
+#[derive(Debug)]
+pub struct Span<'a> {
+    contents: &'a str,
+}
+
+/// Whitespace matched by [`space`] (nom's `space1`).
+#[derive(Debug)]
+pub struct Space<'a> {
+    contents: &'a str,
+}
+
+/// A line ending matched by [`line_break`] (nom's `line_ending`).
+#[derive(Debug)]
+pub struct LineBreak<'a> {
+    contents: &'a str,
+}
+
+/// A literal tag matched by a parser returned from [`symbol`].
+#[derive(Debug)]
+pub struct Symbol<'a> {
+    contents: &'a str,
+}
+
+/// A blank line: its whitespace elements followed by the terminating line break.
+#[derive(Debug)]
+pub struct BlankLine<'a> {
+    contents: Vec<TextElement<'a>>,
+}
+
+/// Raw text consumed by a parser generated with `failable_sequence!`.
+#[derive(Debug)]
+pub struct Sequence<'a> {
+    pub contents: &'a str,
+}
+
+/// Raw text of a bold region. No parser visible here constructs it yet.
+#[derive(Debug)]
+pub struct Bold<'a> {
+    pub contents: &'a str,
+}
+
+/// Raw text of a link. No parser visible here constructs it yet.
+#[derive(Debug)]
+pub struct Link<'a> {
+    contents: &'a str,
+}
+
+/// Matches a single line ending and returns it as a [`LineBreak`].
+pub fn line_break(input: &str) -> Res<&str, LineBreak> {
+    map(line_ending, |s: &str| LineBreak { contents: s })(input)
+}
+
+/// Matches one or more whitespace characters (per nom's `space1`) and returns
+/// them as a single [`Space`].
+pub fn space(input: &str) -> Res<&str, Space> {
+    map(space1, |s: &str| Space { contents: s })(input)
+}
+
+/// Matches one or more alphanumeric characters (per nom's `alphanumeric1`)
+/// and returns them as a single [`Span`].
+pub fn span(input: &str) -> Res<&str, Span> {
+    map(alphanumeric1, |s: &str| Span { contents: s })(input)
+}
+
+/// Builds a parser that matches exactly `symbol_tag` and returns it as a
+/// [`Symbol`].
+pub fn symbol(symbol_tag: &'static str) -> impl for<'a> Fn(&'a str) -> Res<&'a str, Symbol<'a>> {
+    move |i: &str| map(tag(symbol_tag), |s: &str| Symbol { contents: s })(i)
+}
+
+/// A line containing only whitespace and then a line break
+///
+/// It is up to the caller to ensure this is called at the start of a line.
+fn blank_line(input: &str) -> Res<&str, BlankLine> {
+    map(
+        // Collect whitespace runs until a line break is found.
+        many_till(
+            map(space, TextElement::Space),
+            map(line_break, TextElement::LineBreak),
+        ),
+        |(mut whitespace, end_of_line)| {
+            // Keep the terminating line break as the last element.
+            whitespace.push(end_of_line);
+            BlankLine {
+                contents: whitespace,
+            }
+        },
+    )(input)
+}
+
+/// Matches the literal `*` that opens a bold region, as a `TextElement::Symbol`.
+pub fn bold_start(input: &str) -> Res<&str, TextElement> {
+    map(symbol("*"), TextElement::Symbol)(input)
+}
+
+/// Matches the literal `*` that closes a bold region, as a `TextElement::Symbol`.
+///
+/// This is the same token as [`bold_start`].
+pub fn bold_end(input: &str) -> Res<&str, TextElement> {
+    map(symbol("*"), TextElement::Symbol)(input)
+}
+
+/// Matches the literal `[` that opens a link, as a `TextElement::Symbol`.
+pub fn link_start(input: &str) -> Res<&str, TextElement> {
+    map(symbol("["), TextElement::Symbol)(input)
+}
+
+/// Matches the literal `]` that closes a link, as a `TextElement::Symbol`.
+pub fn link_end(input: &str) -> Res<&str, TextElement> {
+    map(symbol("]"), TextElement::Symbol)(input)
+}
+
+/// Parses one paragraph into its elements plus the text of the paragraph end.
+///
+/// Not implemented yet: calling it panics via `todo!()`. The commented-out
+/// line below sketches the intended shape.
+pub fn paragraph(input: &str) -> Res<&str, (Vec<TextElement>, &str)> {
+    todo!()
+    // many_till(TextElementParser::new(paragraph_end), paragraph_end)(input)
+}
+
+/// Matches a line break followed by a blank line and returns the consumed text.
+fn paragraph_end(input: &str) -> Res<&str, &str> {
+    // `recognize` discards the sub-parsers' outputs, so no mapping is needed.
+    recognize(tuple((line_break, blank_line)))(input)
+}
diff --git a/src/parser/text_element_parser.rs b/src/parser/text_element_parser.rs
new file mode 100644
index 00000000..06e852d2
--- /dev/null
+++ b/src/parser/text_element_parser.rs
@@ -0,0 +1,33 @@
+//! A single element of text.
+use super::nom_context::NomContext;
+use super::parser_with_context::parser_with_context;
+use super::text::line_break;
+use super::text::space;
+use super::text::span;
+use super::text::symbol;
+use super::text::TextElement;
+use nom::branch::alt;
+use nom::combinator::map;
+use nom::combinator::not;
+use nom::error::VerboseError;
+use nom::IResult;
+
+// Generates `pub fn text_element(context)`, a parser for one `TextElement`.
+parser_with_context!(text_element, TextElement, input, context, {
+    // Refuse to produce an element where the context's fail matcher succeeds.
+    not(|rest| context.fail_matcher.borrow_mut().parse(rest))(input)?;
+
+    // Alternatives are tried in order; the first one that matches wins.
+    let mut any_element = alt((
+        // map(
+        //     BoldParser::new(slf.context.fail_matcher.clone()),
+        //     TextElement::Bold,
+        // ),
+        // map(
+        //     LinkParser::new(slf.context.fail_matcher.clone()),
+        //     TextElement::Link,
+        // ),
+        map(span, TextElement::Span),
+        map(symbol("*"), TextElement::Symbol),
+        map(symbol("["), TextElement::Symbol),
+        map(symbol("]"), TextElement::Symbol),
+        map(space, TextElement::Space),
+        map(line_break, TextElement::LineBreak),
+    ));
+    any_element(input)
+});
diff --git a/toy_language.txt b/toy_language.txt
new file mode 100644
index 00000000..21701187
--- /dev/null
+++ b/toy_language.txt
@@ -0,0 +1,11 @@
+prologue *goes here* I guess *bold
+text*
+
+I guess *regular
+
+text*
+
+[foo *bar] baz* car
+
+
+*nesting *bold entrances* and* exits