Tags: java, io, bytebuffer, filechannel, parity

FileChannel and ByteBuffer writing extra data


I am creating a method that will take in a file and split it into shardCount pieces and generate a parity file.

When I run this method, it appears that I am writing out extra data into my parity file. This is my first time using FileChannel and ByteBuffers, so I'm not certain I completely understand how to use them despite staring at the documentation for about 8 hours.

This code is a simplified version of the parity section.

public static void splitAndGenerateParityFile(File file, int shardCount, String fileID) throws IOException {
    RandomAccessFile rin = new RandomAccessFile(file, "r");
    FileChannel fcin = rin.getChannel();

    //Create parity files
    File parity = new File(fileID + "_parity");
    if (parity.exists()) throw new FileAlreadyExistsException("Could not create parity file! File already exists!");
    RandomAccessFile parityRAF = new RandomAccessFile(parity, "rw");
    FileChannel parityOut = parityRAF.getChannel();

    long bytesPerFile = (long) Math.ceil(rin.length() / shardCount);

    //Make buffers for each section of the file we will be reading from
    for (int i = 0; i < shardCount; i++) {
        ByteBuffer bb = ByteBuffer.allocate(1024);
        shardBuffers.add(bb);
    }

    ByteBuffer parityBuffer = ByteBuffer.allocate(1024);

    //Generate parity
    boolean isParityBufferEmpty = true;
    for (long i = 0; i < bytesPerFile; i++) {
        isParityBufferEmpty = false;
        int pos = (int) (i % 1024);
        byte p = 0;

        if (pos == 0) {
            //Read chunk of file into each buffer
            for (int j = 0; j < shardCount; j++) {
                ByteBuffer bb = shardBuffers.get(j);
                bb.clear();
                fcin.read(bb, bytesPerFile * j + i);
                bb.rewind();
            }
            //Dump parity buffer
            if (i > 0) {
                parityBuffer.rewind();
                parityOut.write(parityBuffer);
                parityBuffer.clear();
                isParityBufferEmpty = true;
            }
        }

        //Get parity
        for (ByteBuffer bb : shardBuffers) {
            if (pos >= bb.limit()) break;
            p ^= bb.get(pos);
        }

        //Put parity in buffer
        parityBuffer.put(pos, p);
    }

    if (!isParityBufferEmpty) {
        parityBuffer.rewind();
        parityOut.write(parityBuffer);
        parityBuffer.clear();
    }

    fcin.close();
    rin.close();
    parityOut.close();
    parityRAF.close();
}

Please let me know if there is anything wrong with either the parity algorithm or the file IO, or if there's anything I can do to optimize this. I'm happy to hear about other (better) ways of doing file IO.


Solution

  • Here is the solution I found (though it may need more tuning):

    public static void splitAndGenerateParityFile(File file, int shardCount, String fileID) throws IOException {
        int BUFFER_SIZE = 4 * 1024 * 1024;
        RandomAccessFile rin = new RandomAccessFile(file, "r");
        FileChannel fcin = rin.getChannel();
    
        //Create parity files
        File parity = new File(fileID + "_parity");
        if (parity.exists()) throw new FileAlreadyExistsException("Could not create parity file! File already exists!");
        RandomAccessFile parityRAF = new RandomAccessFile(parity, "rw");
        FileChannel parityOut = parityRAF.getChannel();
    
        //Create shard files
        ArrayList<File> shards = new ArrayList<>(shardCount);
        for (int i = 0; i < shardCount; i++) {
            File f = new File(fileID + "_part_" + i);
            if (f.exists()) throw new FileAlreadyExistsException("Could not create shard file! File already exists!");
            shards.add(f);
        }
    
        long bytesPerFile = (long) Math.ceil(rin.length() / shardCount);
    
        ArrayList<ByteBuffer> shardBuffers = new ArrayList<>(shardCount);
    
        //Make buffers for each section of the file we will be reading from
        for (int i = 0; i < shardCount; i++) {
            ByteBuffer bb = ByteBuffer.allocate(BUFFER_SIZE);
               shardBuffers.add(bb);
        }
    
        ByteBuffer parityBuffer = ByteBuffer.allocate(BUFFER_SIZE);
    
        //Generate parity
        boolean isParityBufferEmpty = true;
        for (long i = 0; i < bytesPerFile; i++) {
            isParityBufferEmpty = false;
            int pos = (int) (i % BUFFER_SIZE);
            byte p = 0;
    
            if (pos == 0) {
                //Read chunk of file into each buffer
                for (int j = 0; j < shardCount; j++) {
                    ByteBuffer bb = shardBuffers.get(j);
                    bb.clear();
                    fcin.position(bytesPerFile * j + i);
                    fcin.read(bb);
                    bb.flip();
                }
    
                //Dump parity buffer
                if (i > 0) {
                    parityBuffer.flip();
                    while (parityBuffer.hasRemaining()) {
                        parityOut.write(parityBuffer);
                    }
                    parityBuffer.clear();
                    isParityBufferEmpty = true;
                }
            }
    
            //Get parity
            for (ByteBuffer bb : shardBuffers) {
                if (!bb.hasRemaining()) break;
                p ^= bb.get();
            }
    
            //Put parity in buffer
            parityBuffer.put(p);
        }
    
        if (!isParityBufferEmpty) {
            parityBuffer.flip();
            parityOut.write(parityBuffer);
            parityBuffer.clear();
        }
    
        fcin.close();
        rin.close();
        parityOut.close();
        parityRAF.close();
    }
    

    As suggested by VGR, I replaced rewind() with flip(). I also switched from absolute to relative buffer operations. The absolute get(int)/put(int, byte) methods do not adjust the buffer's position or limit, so after filling the buffer the limit was still at capacity — which is why the whole buffer, including stale bytes, was being written out. I also increased the buffer size to 4 MB, since I am mainly generating parity for large files.