rust stdin chars

How can I create an efficient iterator of chars from stdin with Rust?


Now that the Read::chars iterator has been officially deprecated, what is the proper way to obtain an iterator over the chars coming from a Reader like stdin without reading the entire stream into memory?


Solution

  • As a couple others have mentioned, it is possible to copy the deprecated implementation of Read::chars for use in your own code. Whether this is truly ideal or not will depend on your use case. For me, this proved to be good enough for now, although it is likely that my application will outgrow this approach in the near future.

    To illustrate how this can be done, let's look at a concrete example:

    use std::io::{self, Error, ErrorKind, Read};
    use std::result;
    use std::str;
    
    /// Wrapper around a reader `R` that is iterated over to obtain the `char`s
    /// decoded from the bytes that reader produces.
    struct MyReader<R> {
        /// The wrapped reader that bytes are pulled from.
        inner: R,
    }
    
    impl<R: Read> MyReader<R> {
        fn new(inner: R) -> MyReader<R> {
            MyReader {
                inner,
            }
        }
    
    /// Errors yielded while iterating over a `MyReader`.
    #[derive(Debug)]
    enum MyReaderError {
        /// The bytes read did not form valid UTF-8, or the stream ended in the
        /// middle of a multi-byte sequence.
        NotUtf8,
        /// The underlying reader returned an I/O error.
        Other(Error),
    }
    
    impl<R: Read> Iterator for MyReader<R> {
        type Item = result::Result<char, MyReaderError>;

        /// Decodes and returns the next `char` from the wrapped reader.
        ///
        /// Returns `None` once the reader is exhausted at a character boundary.
        /// Yields `Err(MyReaderError::NotUtf8)` when the lead byte is invalid,
        /// when the stream ends in the middle of a multi-byte sequence, or when
        /// the collected bytes are not valid UTF-8. Yields
        /// `Err(MyReaderError::Other(_))` when the reader reports an I/O error.
        fn next(&mut self) -> Option<Self::Item> {
            // `?` ends the iteration when the reader has no more bytes.
            let first_byte = match read_one_byte(&mut self.inner)? {
                Ok(b) => b,
                Err(e) => return Some(Err(MyReaderError::Other(e))),
            };

            // The lead byte alone determines how many bytes the sequence spans.
            let width = match utf8_char_width(first_byte) {
                0 => return Some(Err(MyReaderError::NotUtf8)),
                1 => return Some(Ok(first_byte as char)),
                w => w,
            };

            // Collect the continuation bytes; `read` may deliver fewer bytes
            // than requested, so keep going until the sequence is complete.
            let mut buf = [first_byte, 0, 0, 0];
            let mut filled = 1;
            while filled < width {
                match self.inner.read(&mut buf[filled..width]) {
                    // End of stream in the middle of a character.
                    Ok(0) => return Some(Err(MyReaderError::NotUtf8)),
                    Ok(n) => filled += n,
                    Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
                    Err(e) => return Some(Err(MyReaderError::Other(e))),
                }
            }

            // Validate the whole sequence; a failure (or an empty result, which
            // cannot happen for width >= 2) is reported instead of panicking.
            Some(
                str::from_utf8(&buf[..width])
                    .ok()
                    .and_then(|s| s.chars().next())
                    .ok_or(MyReaderError::NotUtf8),
            )
        }
    }
    

    The above code also requires read_one_byte and utf8_char_width to be implemented. Those should look something like:

    /// Lookup table indexed by the first byte of a UTF-8 sequence, giving the
    /// total number of bytes in that sequence. An entry of 0 marks a byte that
    /// cannot start a sequence. The trailing comment on a row is the value of
    /// the last byte covered up to that point.
    static UTF8_CHAR_WIDTH: [u8; 256] = [
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
    ];
    
    /// Returns the `UTF8_CHAR_WIDTH` table entry for byte `b`, widened to `usize`.
    fn utf8_char_width(b: u8) -> usize {
        let index = b as usize;
        let width = UTF8_CHAR_WIDTH[index];
        width as usize
    }
    
    /// Reads a single byte from `reader`.
    ///
    /// Returns `None` when the reader reports end of stream (a zero-length
    /// read), `Some(Ok(byte))` when a byte was read, and `Some(Err(e))` when the
    /// read failed. Reads that fail with `ErrorKind::Interrupted` are retried.
    fn read_one_byte(reader: &mut dyn Read) -> Option<io::Result<u8>> {
        let mut buf = [0];
        loop {
            match reader.read(&mut buf) {
                Ok(0) => return None,
                Ok(_) => return Some(Ok(buf[0])),
                // An interrupted read is not a real failure; try again.
                Err(ref e) if e.kind() == ErrorKind::Interrupted => {}
                Err(e) => return Some(Err(e)),
            }
        }
    }
    

    Now we can use the MyReader implementation to produce an iterator of chars over some reader, like io::Stdin:

    /// Reads standard input one `char` at a time and prints each on its own line.
    ///
    /// Stops at the first decoding or I/O error, after reporting it on stderr.
    fn main() {
        let stdin = io::stdin();
        let reader = MyReader::new(stdin.lock());
        for c in reader {
            // Each item is a `Result`, which has no `Display` impl, so it has
            // to be unpacked before printing.
            match c {
                Ok(ch) => println!("{}", ch),
                Err(e) => {
                    eprintln!("error reading stdin: {:?}", e);
                    break;
                }
            }
        }
    }
    

    The limitations of this approach are discussed at length in the original issue thread. One particular concern worth pointing out however is that this iterator will not handle non-UTF-8 encoded streams correctly.