rustsplit

rust `split` split before the pattern match


I'm trying to split a `String` at each pattern match, keeping the matched part as the beginning of the next substring rather than as the terminator of the previous one.

Current implementation:

// `split_inclusive` keeps each matched uppercase char at the END of a chunk,
// which is what produces the unwanted output shown below.
let s = String::from("AaaBbCc");
s.split_inclusive(char::is_uppercase)
    .for_each(|block| println!("{block}"));

Current Output

A
aaB
bC
c

Desired output

Aaa
Bb
Cc

How can this be achieved?


Solution

  • I don't think the standard library can help you here (though there are a million methods on `String`, so I could be wrong). But it's a simple enough iterator to write yourself, if you really need the laziness:

    /// Lazy iterator over the chunks of a string, where a new chunk begins at
    /// every uppercase character.
    ///
    /// Unlike `str::split_inclusive`, the matched uppercase character is kept at
    /// the *start* of the following chunk rather than at the end of the previous
    /// one, so `"AaaBbCc"` yields `"Aaa"`, `"Bb"`, `"Cc"`.
    #[derive(Debug, Clone)]
    struct CamelCaseSplit<'a> {
        /// Cursor over the characters of `s` that have not been scanned yet.
        char_indices: std::str::CharIndices<'a>,
        /// Byte offset in `s` at which the next yielded chunk begins.
        chunk_start: usize,
        /// The full input; every yielded chunk is a sub-slice of it.
        s: &'a str,
    }

    impl<'a> CamelCaseSplit<'a> {
        /// Creates a splitter over `s`. No allocation is performed; chunks
        /// borrow from `s`.
        pub fn new(s: &'a str) -> Self {
            let mut char_indices = s.char_indices();
            // We never want to split before the first char (that would yield an
            // empty leading chunk), so consume it up front.
            char_indices.next();
            Self {
                char_indices,
                chunk_start: 0,
                s,
            }
        }
    }

    impl<'a> Iterator for CamelCaseSplit<'a> {
        type Item = &'a str;

        /// Returns the next chunk, or `None` once the whole input has been
        /// yielded (an empty input yields nothing).
        fn next(&mut self) -> Option<Self::Item> {
            // The input is exhausted.
            if self.chunk_start == self.s.len() {
                return None;
            }
            // The chunk ends at the next uppercase char, or at the end of the
            // string when no further uppercase char exists. `find` consumes the
            // matched char, so the following call resumes scanning after it.
            let chunk_end = self
                .char_indices
                .find(|(_, c)| c.is_uppercase())
                .map_or(self.s.len(), |(idx, _)| idx);
            // `chunk_end` comes from `char_indices` (or is `s.len()`), so it is
            // always on a char boundary and the slice cannot panic.
            let chunk = &self.s[self.chunk_start..chunk_end];
            self.chunk_start = chunk_end;
            Some(chunk)
        }
    }

    // Once `chunk_start` reaches `s.len()`, `next` keeps returning `None`.
    impl std::iter::FusedIterator for CamelCaseSplit<'_> {}
    
    /// Demonstrates the splitter by printing each chunk on its own line.
    fn main() {
        // Prints:
        //   Aaa
        //   Bb
        //   Cc
        CamelCaseSplit::new("AaaBbCc").for_each(|chunk| println!("{chunk}"));
    }