rust nom

Parse this custom data format into a data type with nested lists


I have the following format, and the parsed result should be of the Item data type:

test 1: "<A \"Test\">"

test 2: r#"<A "Test">"#

Result: Item { item_type: TEXT, ascii_data: Some("Test") }

test 3: <A>

Result: Item { item_type: TEXT, ascii_data: None }

For test 1 and test 2 the following code parses the input correctly, but for test 3 it fails.

The format also contains nested lists:

<L 
    <A "Test1">
    <L
        <A>
        <A "Test2">
    >
    <A "Test3">
>

Result:

Item {
    item_type: LIST,
    sub_items: [
        Item {
            item_type: TEXT,
            ascii_data: Some("Test1"),
        },
        Item {
            item_type: LIST,
            sub_items: [
                Item {
                    item_type: TEXT,
                    ascii_data: None,
                },
                Item {
                    item_type: TEXT,
                    ascii_data: Some("Test2"),
                },
            ],
        },
        Item {
            item_type: TEXT,
            ascii_data: Some("Test3"),
        },
    ],
}

Rust Playground link

use nom::{
    branch::alt,
    bytes::complete::{tag, take_until},
    character::complete::multispace0,
    combinator::map,
    IResult,
};

// Kind tag stored in every `Item`.
#[derive(Clone, Debug, PartialEq)]
enum ItemType {
    // Set by `parse_list_item` for a `<L ...>` element.
    LIST,
    // Set by `parse_ascii_item` for a `<A ...>` element.
    TEXT,
    // Used by `Item::default()` as the placeholder kind.
    NONE,
}

// One parsed element of the custom format: a text item or a list of items.
#[derive(Clone, Debug, PartialEq)]
struct Item {
    // Which kind of element this is.
    item_type: ItemType,
    // Child items; the parsers fill this in only for `LIST` items.
    sub_items: Option<Vec<Item>>,
    // Text taken from between the quotes of an `<A "...">` element.
    ascii_data: Option<String>,
}

// The default item is a placeholder: kind `NONE`, no children, no text.
// The parsers rely on it through `..Default::default()` to fill unset fields.
impl Default for Item {
    fn default() -> Self {
        Self {
            ascii_data: None,
            sub_items: None,
            item_type: ItemType::NONE,
        }
    }
}

// Parse string data, it may empty then return none,
fn parse_ascii_data(input: &str) -> IResult<&str, String> {
    let (input, _) = tag("\"")(input)?;
    let (input, ascii_data) = take_until("\"")(input)?;
    let (input, _) = tag("\"")(input)?;

    Ok((input, ascii_data.to_string()))
}

// Parses a text item of the form `<A "string">`.
//
// After the `<A` tag and optional whitespace a quoted string is required:
// `alt` tries `parse_ascii_data` first and then the literal `""`, and fails
// when neither matches. A bare `<A>` therefore produces an error rather than
// an item with `ascii_data: None`.
fn parse_ascii_item(input: &str) -> IResult<&str, Item> {
    let (input, _) = tag("<A")(input)?;
    let (input, _) = multispace0(input)?;
    // Both alternatives start with a quote, so input without a string is rejected here.
    let (input, ascii_data) = alt((parse_ascii_data, map(tag("\"\""), |_| "".to_string())))(input)?;
    let (input, _) = tag(">")(input)?;

    Ok((
        input,
        Item {
            item_type: ItemType::TEXT,
            // Always `Some`, because a quoted string was required above.
            ascii_data: Some(ascii_data),
            ..Default::default()
        },
    ))
}

// Parses a list item of the form `<L <A "string">>`.
//
// Exactly one child is read after `<L` and optional whitespace: either a text
// item via `parse_ascii_item`, or the literal `<>`, which maps to
// `Item::default()`. A bare `<L>` matches neither alternative, nested `<L ...>`
// children and multiple children are not handled, and the closing `>` must
// follow the child directly because no whitespace is skipped before it.
fn parse_list_item(input: &str) -> IResult<&str, Item> {
    let (input, _) = tag("<L")(input)?;
    let (input, _) = multispace0(input)?;
    // Reads a single child only; the result is wrapped in a one-element Vec below.
    let (input, sub_items) = alt((parse_ascii_item, map(tag("<>"), |_| Item::default())))(input)?;
    let (input, _) = tag(">")(input)?;

    Ok((
        input,
        Item {
            item_type: ItemType::LIST,
            sub_items: Some(vec![sub_items]),
            ..Default::default()
        },
    ))
}

// Unit tests for the item parsers. Every assertion also checks that the whole
// input was consumed (the remaining input is expected to be "").
#[cfg(test)]
mod tests {
    use super::*;

    // Text items: `<A "Test">` written two ways, plus the bare `<A>` form.
    #[test]
    fn test_parse_ascii_item() {
        // Regular string literal with escaped quotes.
        let input = "<A \"Test\">";

        let expected_item = Item {
            item_type: ItemType::TEXT,
            ascii_data: Some("Test".to_string()),
            ..Default::default()
        };

        assert_eq!(parse_ascii_item(input), Ok(("", expected_item)));

        // The same input written as a raw string literal.
        let input = r#"<A "Test">"#;

        let expected_item = Item {
            item_type: ItemType::TEXT,
            ascii_data: Some("Test".to_string()),
            ..Default::default()
        };

        assert_eq!(parse_ascii_item(input), Ok(("", expected_item)));

        // A bare `<A>` is expected to parse into a text item without any text.
        assert_eq!(
            parse_ascii_item("<A>"),
            Ok((
                "",
                Item {
                    item_type: ItemType::TEXT,
                    ascii_data: None,
                    ..Default::default()
                }
            ))
        );
    }

    // List items: a list with one text child, and the bare `<L>` form.
    #[test]
    fn test_parse_list_item() {
        let input = "<L <A \"Test\">>";

        let expected_item = Item {
            item_type: ItemType::LIST,
            sub_items: Some(vec![Item {
                item_type: ItemType::TEXT,
                ascii_data: Some("Test".to_string()),
                ..Default::default()
            }]),
            ..Default::default()
        };

        assert_eq!(parse_list_item(input), Ok(("", expected_item)));

        // A bare `<L>` is expected to hold a single default (placeholder) child.
        assert_eq!(
            parse_list_item("<L>"),
            Ok((
                "",
                Item {
                    item_type: ItemType::LIST,
                    sub_items: Some(vec![Item::default()]),
                    ..Default::default()
                }
            ))
        );
    }

    // Multi-line input with a list nested inside a list.
    #[test]
    fn test_parse_nested_list_item() {
        let input = "<L \n    <A \"Test1\">\n    <L\n        <A \"Test2\">\n    >\n>";

        // NOTE(review): the input holds "Test1" followed by a nested list with
        // "Test2", while `expected_item` describes a single text child "Test";
        // the expectation does not correspond to the input.
        let expected_item = Item {
            item_type: ItemType::LIST,
            sub_items: Some(vec![Item {
                item_type: ItemType::TEXT,
                ascii_data: Some("Test".to_string()),
                ..Default::default()
            }]),
            ..Default::default()
        };

        assert_eq!(parse_list_item(input), Ok(("", expected_item)));

        // Same bare `<L>` check as in `test_parse_list_item`.
        assert_eq!(
            parse_list_item("<L>"),
            Ok((
                "",
                Item {
                    item_type: ItemType::LIST,
                    sub_items: Some(vec![Item::default()]),
                    ..Default::default()
                }
            ))
        );
    }
}

Solution

  • In parse_ascii_item() you don't want to use alt(), you want to use opt(). In parse_list_item(), however, you actually do want to use alt().

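    The difference is that alt() tries the given parsers one by one until one succeeds and fails if none of them does, whereas opt() makes a single parser optional and yields None when it does not match. In parse_ascii_item() the ascii_data is optional, so opt() is the right choice.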

    The fixed parse_ascii_item() looks like this (opt must additionally be imported from nom::combinator):

    // Parses a text item, `<A>` or `<A "string">`, allowing optional whitespace
    // before the tag, after `<A`, and before the closing `>`.
    //
    // The quoted string is optional; `ascii_data` is `None` when it is absent.
    fn parse_ascii_item(input: &str) -> IResult<&str, Item> {
        let (rest, _) = multispace0(input)?;
        let (rest, _) = tag("<A")(rest)?;
        let (rest, _) = multispace0(rest)?;
        // `opt` yields `None` instead of an error when no quoted string follows.
        let (rest, text) = opt(parse_ascii_data)(rest)?;
        let (rest, _) = multispace0(rest)?;
        let (rest, _) = tag(">")(rest)?;

        let item = Item {
            item_type: ItemType::TEXT,
            ascii_data: text,
            ..Default::default()
        };
        Ok((rest, item))
    }
    

    Now in parse_list_item() we do actually want to use alt(), since we want to accept either parse_ascii_item() or parse_list_item(). Additionally, since we want to accept zero-to-many of them, we also need to wrap it in many0().

    The fixed parse_list_item() looks like this (many0 must additionally be imported from nom::multi):

    // Parses a list item `<L ...>` whose children are any mix of text items and
    // nested lists, each optionally preceded by whitespace.
    //
    // When no child is present (`<L>`), the result holds a single
    // `Item::default()` placeholder rather than an empty vector.
    fn parse_list_item(input: &str) -> IResult<&str, Item> {
        let (rest, _) = multispace0(input)?;
        let (rest, _) = tag("<L")(rest)?;
        // Zero or more children; recursion into `parse_list_item` handles nesting.
        let (rest, children) = many0(|child_input| {
            let (child_input, _) = multispace0(child_input)?;
            alt((parse_ascii_item, parse_list_item))(child_input)
        })(rest)?;
        let (rest, _) = multispace0(rest)?;
        let (rest, _) = tag(">")(rest)?;

        // Substitute the placeholder child when nothing was parsed.
        let sub_items = if children.is_empty() {
            vec![Item::default()]
        } else {
            children
        };

        Ok((
            rest,
            Item {
                item_type: ItemType::LIST,
                sub_items: Some(sub_items),
                ..Default::default()
            },
        ))
    }
    

    Additionally, your test_parse_nested_list_item() is wrong. At least your input doesn't reasonably match the expected_item. So I assume expected_item actually needs to look like this:

    // Outer list holding the text item "Test1" followed by a nested list
    // whose only child is the text item "Test2".
    let expected_item = Item {
        item_type: ItemType::LIST,
        sub_items: Some(vec![
            Item {
                item_type: ItemType::TEXT,
                ascii_data: Some("Test1".to_string()),
                ..Default::default()
            },
            Item {
                item_type: ItemType::LIST,
                sub_items: Some(vec![Item {
                    item_type: ItemType::TEXT,
                    ascii_data: Some("Test2".to_string()),
                    ..Default::default()
                }]),
                ..Default::default()
            },
        ]),
        ..Default::default()
    };
    

    Here's a complete example on Rust Playground.


    As an aside, in the future remember to sprinkle multispace0() around, since you allow for optional whitespace in various places:

    // Skip any optional whitespace before parsing the next token.
    let (input, _) = multispace0(input)?;
    

    Another aside: Rust's string literals allow for newlines. Additionally, instead of escaping \" you can also use raw string literals:

    // Before: newlines and quotes are written as escape sequences.
    let input = "<L \n    <A \"Test1\">\n    <L\n        <A \"Test2\">\n    >\n>";
    
    // After: a raw string literal spans several lines and needs no escaping.
    // NOTE: this literal starts with a newline and is indented more deeply than
    // the escaped form above, so the two strings differ in whitespace.
    let input = r#"
        <L
            <A "Test1">
            <L
                <A "Test2">
            >
        >"#;