javagzipinputstream

Strange behavior in GZIPOutputStream/GZIPInputStream


I have reduced the strange issue in this code to the minimum. This program writes 128,000 times the bytes for (int)90000 into a file and then tries to read it back in.

set zipped=false and everything works like a charm set zipped=true and everything works like a charm until the 496th chunk of 1024 bytes. At that point a single byte is lost and everything is shifted to the left by one byte (see output)

...
0 1 95 -112- which is byte code for int 90,000
Counters: 496 126937
1 95 -112 0- which is byte code for int 23,040,000
...

this is the code i came up with. I just can't figure out why it suddenly breaks in the middle of doing the same thing over and over. Any help/insights/explainers much appreciated.

public class TestApp7 {

static final boolean    zipped = true;
static File             theFile = null;

private static void writeZipData() throws Exception {
    FileOutputStream fos = new FileOutputStream(theFile);
    BufferedOutputStream bos = null;
    if (zipped) {
        GZIPOutputStream gzout = new GZIPOutputStream(fos);
        bos = new BufferedOutputStream(gzout);
    } else 
        bos = new BufferedOutputStream(fos);
    byte[] bs9 = RHUtilities.toByteArray((int)90000);
    for (int i=0; i<128000; i++)
        bos.write(bs9);
    bos.flush();
    bos.close();
}

private static void readZipData() throws Exception {
    byte[] buf = new byte[1024];
    int chunkCounter = 0;
    int intCounter = 0;
    FileInputStream fin = new FileInputStream(theFile);
    int rdLen = 0;
    if (zipped) {
        GZIPInputStream gin = new GZIPInputStream(fin);
        while ((rdLen = gin.read(buf)) != -1) {
            System.out.println("Counters: " + chunkCounter + " " + intCounter);
            for (int i=0; i<rdLen/4; i++) {
                byte[] bs = Arrays.copyOfRange(buf,(i*4),((i+1)*4));
                intCounter++;
                System.out.print(bs[0] + " " + bs[1] + " " + bs[2] + " " + bs[3]);
            }
            chunkCounter++;
        }
        gin.close();
    } else {
        while ((rdLen = fin.read(buf)) != -1) {
            System.out.println("Counters: " + chunkCounter + " " + intCounter);
            for (int i=0; i<rdLen/4; i++) {
                byte[] bs = Arrays.copyOfRange(buf,(i*4),((i+1)*4));
                intCounter++;
                System.out.print(bs[0] + " " + bs[1] + " " + bs[2] + " " + bs[3]);
            }
            chunkCounter++;
        }
    }
    fin.close();
}

public static void main(String args[]) {
    try {
        if (zipped)
            theFile = new File("Test.gz");
        else
            theFile = new File("Test.dat");
        writeZipData();
        readZipData();
    } catch (Throwable e) { e.printStackTrace(); }
}
}

Solution

  • So based on Jon's wonderful comments ... you cannot rely on .read(buffer) filling the buffer even when there are more bytes in the stream - it stops at the boundary where the BufferedOutputStream-wrapped GZIPOutputStream saved a chunk of data. just add another read to go beyond the boundary and complete the chunk

            while ((rdLen = gin.read(buf)) != -1) {
                if (rdLen<chunksize) {
                    byte[] missBytes = new byte[chunksize-rdLen];
                    int rdLine_miss = 0;
                    if ((rdLine_miss = gin.read(missBytes)) > 0)
                        System.arraycopy(missBytes,0,buf,rdLen,rdLine_miss);
                    rdLen += rdLine_miss;
                }
                for (int i=0; i<rdLen/4; i++) {
                    byte[] bs = Arrays.copyOfRange(buf,(i*4),((i+1)*4));
                    intCounter++;
                    System.out.println(bs[0] + " " + bs[1] + " " + bs[2] + " " + bs[3] + " ");
                }
                chunkCounter++;
            }