>> Problem
I want to write a toy git revision parser (without input validation)
which parses the base revision name then zero or more suffixes, ^
and ~(n)
, into a recursive data structure:
/// Revision as a recursive struct
#[derive(Clone, Debug, PartialEq)]
pub enum Revision {
Ref(String),
Parent(Box<Revision>),
Ancestor(Box<Revision>, usize),
}
fn parse(input: &str) -> Result<Revision, &str> {
todo!()
}
/// Basic Pattern
assert_eq!(parse(&"master"), Ok(Revision::Ref("master".to_owned())));
assert_eq!(
parse(&"main^"),
Ok(Revision::Parent(
Box::new(Revision::Ref("main".to_owned()))))
);
assert_eq!(
parse(&"feature/banana-bread~3"),
Ok(Revision::Ancestor(
Box::new(Revision::Ref("feature/banana-bread".to_owned())),
3
))
);
/// Nested Pattern
assert_eq!(
parse(&"develop^^^"),
Ok(Revision::Parent(
Box::new(Revision::Parent(
Box::new(Revision::Parent(
Box::new(Revision::Ref("develop".to_owned()))))))))
);
assert_eq!(
parse(&"origin/deployment/1.2.3~5^"),
Ok(Revision::Parent(
Box::new(Revision::Ancestor(
Box::new(Revision::Ref("origin/deployment/1.2.3".to_owned())),
5
))))
)
A quick way to parse it recursively with regex would be:
#[macro_use]
extern crate lazy_static; // 1.4.0
use regex::Regex; // 1.4.2
fn parse(input: &str) -> Result<Revision, &str> {
lazy_static! {
static ref PARENT: Regex = Regex::new(r"/^(.+)\^$/").unwrap();
static ref ANCESTOR: Regex = Regex::new(r"/^(.+)~(\d+)$/").unwrap();
}
if let Some(captures) = PARENT.captures(input) {
let inner_input = captures.get(1).unwrap().as_str().to_owned();
let inner_rev = parse(inner_input).map_err(|_| "Parse error")?;
return Ok(Revision::Parent(Box::new(inner_rev)));
}
if let Some(captures) = ANCESTOR.is_match(input) {
let inner_input = captures.get(1).unwrap().as_str().to_owned();
let n = captures.get(2).unwrap().as_str().parse::<usize>().map_err(|_| "Parse error")?;
let inner_rev = parse(inner_input).map_err(|_| "Parse error")?;
return Ok(Revision::Ancestor(Box::new(inner_rev), n));
}
Ok(Revision::Ref(input.to_owned()))
}
However, this inefficiently parses the whole text for each suffix or nesting. Using the parser combinator nom instead, parsing can be better expressed and extended:
NOTE: While this is not a tutorial of nom
which you should check
out, this is about a pitfall/trap with a recursive design. Do pay more
to the general idea than the actual code.
use nom::{ // 5.1.1
branch::alt,
bytes::complete::{tag, take_till},
character::complete::digit1,
combinator::{all_consuming, map, map_res, rest},
sequence::preceded,
IResult
};
/// Parse base REF_NAME
fn parse_ref(i: &str) -> IResult<&str, Revision> {
let ref_name = map(
take_till(|b| b == '~' || b == '^'),
|ref_name: &str| ref_name.to_owned(),
);
let (i, name) = ref_name(i)?;
Ok((i, Revision::Ref(name)))
}
/// Parse carret suffix REV^
fn parse_caret<'a>(inner_rev: Revision) -> impl 'a + Fn(&'a str) -> IResult<&'a str, Revision> {
let suffix = tag("^");
move |i: &str| {
let (i, _) = suffix(i)?;
Ok((i, Revision::Parent(Box::new(inner_rev.clone()))))
}
}
/// Parse tilde suffix REV~n
fn parse_tilde<'a>(inner_rev: Revision) -> impl 'a + Fn(&'a str) -> IResult<&'a str, Revision> {
let number = map_res(digit1, |digits: &str| digits.parse::<usize>());
let suffix = preceded(tag("~"), number);
move |i: &str| {
let (i, n) = suffix(i)?;
Ok((i, Revision::Ancestor(Box::new(inner_rev.clone()), n)))
}
}
/// Parse suffixes recursively
fn parse_suffix<'a>(rev: Revision) -> impl 'a + Fn(&'a str) -> IResult<&'a str, Revision> {
let suffix = alt((
/// Check for REV^ suffix
parse_caret(rev.clone()),
/// Check for REV~n suffix
parse_tilde(rev.clone()),
/// If neither and the whole input is consumed, return the current revision
map(all_consuming(rest), move |_i: &str| rev.clone())
));
move |i: &str| {
let (i, next_rev) = suffix(i)?;
let (i, new_rev) = if i.is_empty() {
/// If nothing else to parse, return the current result
(i, next_rev)
} else {
/// Otherwise, parse recursively
parse_suffix(next_rev)(i)?
};
Ok((i, new_rev))
}
}
/// Main parsing function
fn parse(i: &str) -> IResult<&str, Revision> {
/// Parse the bottommost revision
let (i, base_rev) = parse_ref(i)?;
/// Build the revision recursively
let (i, rev) = parse_suffix(base_rev)(i)?;
Ok((i, rev))
}
The equivalent recursive parser reads the input once and builds the
revision bottom-up; however, the suffix parser (parse_suffix
) clones
the current revision too much (3 times) for each test. While it can be
refactored, recursion is usually harder to optimize and satisfy with
the borrow checker.
>> Solution
Instead of using recursion, an iterative approach can perform better. Rather than immediately building the revision at each recursion or iteration, the newly parsed parent revision can have an empty child revision which can be replaced later and makes each parser cleaner without requiring ownership or reference to the actual child revision:
/// Create a default for `Revision`
impl Default for Revision {
fn default() -> Self {
Self::Ref(String::default())
}
}
/// Without the child revision argument, the parsing is simpler
fn parse_caret(i: &str) -> IResult<&str, Revision> {
let suffix = tag("^");
let (i, _) = suffix(i)?;
/// Create empty revision to be replaced later
let empty_rev = Box::new(Revision::default());
Ok((i, Revision::Parent(empty_rev)))
}
/// Ditto
fn parse_tilde(i: &str) -> IResult<&str, Revision> {
let number = map_res(digit1, |digits: &str| digits.parse::<usize>());
let suffix = preceded(tag("~"), number);
let (i, n) = suffix(i)?;
let empty_rev = Box::new(Revision::default());
Ok((i, Revision::Ancestor(empty_rev, n)))
}
Refactoring the main suffix parser into an iterative loop:
use std::mem;
fn parse_suffix(mut rev: Revision, mut i: &str) -> IResult<&str, Revision> {
/// The branch parser is simpler as well
let suffix = alt((
parse_caret,
parse_tilde,
));
/// Until the whole input is parsed/consumed
while !i.is_empty() {
/// Parse the next parent revision
let (i2, mut acc_rev) = suffix(i)?;
/// To replace the parent's empty child revision, the child is pattern matched
let boxed_rev = match &mut acc_rev {
Revision::Ref(_) => unreachable!(),
Revision::Parent(ref mut boxed_rev) => boxed_rev,
Revision::Ancestor(ref mut boxed_rev, ..) => boxed_rev,
};
/// Then replaced via `std::mem::replace`
let _ = mem::replace(&mut **boxed_rev, rev);
// Finally mark this parent as the current revision
rev = acc_rev;
i = i2;
}
Ok((i, rev))
}
While recursion is natural to a recursive/nested structure, also consider an iterative approach which may be overall simpler and just as efficient.
>> Notes
>>> Using fold
Instead of a loop, it can be directly converted to a more succinct fold with fold_many0:
use nom::multi::fold_many0;
fn parse_suffix(rev: Revision, i: &str) -> IResult<&str, Revision> {
let suffix = alt((parse_caret, parse_tilde,));
/// Apply the `suffix` parser 0 or more times with the same logic
let (i, rev) = all_consuming(fold_many0(suffix, rev, |acc_rev, mut rev| {
let boxed_rev = match &mut rev {
Revision::Ref(_) => unreachable!(),
Revision::Parent(ref mut boxed_rev) => boxed_rev,
Revision::Ancestor(ref mut boxed_rev, ..) => boxed_rev,
};
let _ = mem::replace(&mut **boxed_rev, acc_rev);
rev
}))(i)?;
Ok((i, rev))
}
The only caveat with this combinator alongside alt is that it clones the input after each iteration which contradicts the original goal. In practice this is not detrimental but do check and be aware if the more complex function are bottlenecks. Rewriting them as a simpler function always works if needed:
use nom::error::ErrorKind;
/// Instead of a parser combinator, it can be a sequence of if-let returns
/// A little more verbose but it avoids cloning the input
fn suffix(i: &str) -> IResult<&str, Revision> {
if let Ok((i, rev)) = parse_caret(&i) {
return Ok((i, rev));
}
if let Ok((i, rev)) = parse_tilde(&i) {
return Ok((i, rev));
}
/// Construct nom error manually
Err(nom::Err::Error((i, ErrorKind::Alt)))
}