c linux asynchronous aio

Why does io_prep_pwritev set nbytes to iovcnt?


In libaio, the io_prep_pwritev inline helper prepares the command by filling in the iocb struct. Isn't iocb.u.c.nbytes supposed to be the number of bytes? In io_prep_pwritev it is set to iovcnt, which doesn't seem correct. Why is that?

https://kernel.googlesource.com/pub/scm/fs/ext2/xfstests-bld/+/ac7997c9b0f905383e8675a766efc8c8305ce1f1/libaio/src/libaio.h#203


Solution

  • It's correct, but looks strange. The same ABI is shared by PWRITE and PWRITEV. For PWRITE, it works how you'd expect, with buf set to the data buffer and nbytes set to the number of bytes:

    /*
     * Fill in *iocb as a single-buffer positional write (IO_CMD_PWRITE).
     *
     * The control block is zeroed first, then populated with the target
     * descriptor `fd`, the data buffer `buf`, its size in bytes `count`,
     * and the file `offset` to write at.
     */
    static inline void io_prep_pwrite(struct iocb *iocb, int fd, void *buf, size_t count, long long offset)
    {
            /* Start from an all-zero control block so unset fields are 0. */
            memset(iocb, 0, sizeof *iocb);

            /* Request header: what to do, and on which descriptor. */
            iocb->aio_lio_opcode = IO_CMD_PWRITE;
            iocb->aio_fildes = fd;
            iocb->aio_reqprio = 0;

            /* Payload: here nbytes really is a byte count. */
            iocb->u.c.offset = offset;
            iocb->u.c.nbytes = count;
            iocb->u.c.buf = buf;
    }
    

    But for PWRITEV, it repurposes the ABI arguments to convey the iovec array and its length into the kernel.

    /*
     * Fill in *iocb as a vectored positional write (IO_CMD_PWRITEV).
     *
     * The control block is zeroed first. The same u.c.buf / u.c.nbytes
     * fields used by io_prep_pwrite are reused here with a different
     * meaning: buf holds the address of the iovec array `iov`, and nbytes
     * holds `iovcnt` rather than a byte count.
     */
    static inline void io_prep_pwritev(struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt, long long offset)
    {
            /* Start from an all-zero control block so unset fields are 0. */
            memset(iocb, 0, sizeof *iocb);

            /* Request header: what to do, and on which descriptor. */
            iocb->aio_lio_opcode = IO_CMD_PWRITEV;
            iocb->aio_fildes = fd;
            iocb->aio_reqprio = 0;

            /* Payload: iovec array pointer (const dropped by the cast) and iovcnt. */
            iocb->u.c.offset = offset;
            iocb->u.c.nbytes = iovcnt;
            iocb->u.c.buf = (void *)iov;
    }
    

    Then, inside the kernel in fs/aio.c, the code branches on whether the I/O operation is vectored. In the non-vectored case, it decodes the args as a single buffer pointer and a number of bytes. In the vectored case, it calls into lib/iov_iter.c:import_iovec to decode the buf and nbytes args as a pointer to an iovec array and the number of entries in that array.

    Here's where it decodes the opcode, note the booleans indicating vectored operations:

        /*
         * Dispatch on the opcode userspace stored in the iocb. Each case
         * returns the handler's result directly.
         */
        switch (iocb->aio_lio_opcode) {
        /* Single-buffer read/write: the third argument (vectored) is false. */
        case IOCB_CMD_PREAD:
            return aio_read(&req->rw, iocb, false, compat);
        case IOCB_CMD_PWRITE:
            return aio_write(&req->rw, iocb, false, compat);
        /* Vectored read/write: same handlers, but vectored is true. */
        case IOCB_CMD_PREADV:
            return aio_read(&req->rw, iocb, true, compat);
        case IOCB_CMD_PWRITEV:
            return aio_write(&req->rw, iocb, true, compat);
        /*
         * NOTE(review): the bool passed to aio_fsync differs between FSYNC
         * (false) and FDSYNC (true); presumably a data-sync-only flag, not a
         * vectored flag -- confirm against aio_fsync.
         */
        case IOCB_CMD_FSYNC:
            return aio_fsync(&req->fsync, iocb, false);
        case IOCB_CMD_FDSYNC:
            return aio_fsync(&req->fsync, iocb, true);
        case IOCB_CMD_POLL:
            return aio_poll(req, iocb);
        /* Unrecognized opcode: emit a debug message and fail with -EINVAL. */
        default:
            pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
            return -EINVAL;
        }
    

    And, here's where it handles decoding the args:

    static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
            struct iovec **iovec, bool vectored, bool compat,
            struct iov_iter *iter)
    {
        void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
        size_t len = iocb->aio_nbytes;
    
        if (!vectored) {
            ssize_t ret = import_single_range(rw, buf, len, *iovec, iter);
            *iovec = NULL;
            return ret;
        }
    #ifdef CONFIG_COMPAT
        if (compat)
            return compat_import_iovec(rw, buf, len, UIO_FASTIOV, iovec,
                    iter);
    #endif
        return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);
    }