(TL;DR) On NVMe SSDs (an Intel P3600 as well as an Avant), I am seeing a drop in IOPS when I issue random reads over a small subset of the disk instead of over the entire disk.
While reading the same offset over and over, IOPS are about 36-40K for a 4k block size. IOPS gradually increase as I grow the region over which the random reads are issued. The program (shown below) uses asynchronous I/O on Linux to submit the read requests.
Region size (4k blocks)      IOPS
  0                         38833
  1                         68596
 10                         76100
 30                         80381
 40                        113647
 50                        148205
100                        170374
200                        239798
400                        270197
800                        334767
OS : Linux 4.2.0-35-generic
SSD : Intel P3600 NVMe Flash
What could be causing this problem?
The program can be run as follows:
$ for i in 0 1 10 30 40 50 100 200 400 800
do
<program_name> /dev/nvme0n1 10 $i
done
to check whether you also see the increasing IOPS pattern shown above.
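If you have fio installed, a roughly equivalent workload can be expressed with it to cross-check the numbers. This is a sketch, not an exact reproduction of the program: --offset=40k corresponds to a start block of 10, --size selects the region width (here 100 blocks), and --norandommap allows repeated offsets like the program's uniform distribution does:

$ fio --name=smallrange --filename=/dev/nvme0n1 --direct=1 \
      --rw=randread --bs=4k --ioengine=libaio --iodepth=200 \
      --offset=40k --size=400k --norandommap \
      --runtime=30 --time_based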
/**
* $ g++ <progname.cpp> -o progname -std=c++11 -lpthread -laio -O3
* $ progname /dev/nvme0n1 10 100
*/
#include <random>
#include <libaio.h>
#include <stdlib.h> // malloc, exit
#include <strings.h> // bzero
#include <future> //async
#include <unistd.h> //usleep
#include <iostream>
#include <sys/time.h> // gettimeofday
#include <vector>
#include <fcntl.h> // open
#include <errno.h>
#include <sys/types.h> // open
#include <sys/stat.h> // open
#include <cassert>
#include <semaphore.h>
io_context_t ioctx;
std::vector<char*> buffers;
int fd = -1;
sem_t sem; // counts free in-flight slots; initialised to QDEPTH
constexpr int numPerRound = 20; // reads submitted per io_submit() batch
constexpr int numRounds = 100000; // number of batches
constexpr int MAXEVENT = 10; // max completions reaped per io_getevents() call
constexpr size_t BLKSIZE = 4096; // read size in bytes
constexpr int QDEPTH = 200; // max reads in flight
off_t startBlock = 0;
off_t numBlocks = 100;
const int numSubmitted = numRounds * numPerRound;
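// Completion reaper: runs on a separate thread, draining events with
// io_getevents() until every submitted read has completed. Each reaped
// event releases one semaphore slot so the submitter can post more IO.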
void DoGet()
{
io_event eventsArray[MAXEVENT];
int numCompleted = 0;
while (numCompleted != numSubmitted)
{
bzero(eventsArray, MAXEVENT * sizeof(io_event));
int numEvents;
do {
numEvents = io_getevents(ioctx, 1, MAXEVENT, eventsArray, nullptr);
} while (numEvents == -EINTR);
assert(numEvents >= 1); // min_nr is 1, so any non-error return has at least one event
for (int i = 0; i < numEvents; i++)
{
io_event* ev = &eventsArray[i];
// ev->data carries the buffer pointer stashed in iocb::data at submit time
assert(ev->res2 == 0); // no error
assert(ev->res == BLKSIZE); // a full block was read
sem_post(&sem); // release one in-flight slot
}
numCompleted += numEvents;
}
std::cout << "completed=" << numCompleted << std::endl;
}
int main(int argc, char* argv[])
{
if (argc < 4) {
std::cout << "usage: " << argv[0] << " <nvme_device_name> <start_4k_block> <num_4k_blocks>" << std::endl;
exit(1);
}
char* deviceName = argv[1];
startBlock = atoll(argv[2]);
numBlocks = atoll(argv[3]);
int ret = 0;
ret = io_queue_init(QDEPTH, &ioctx);
assert(ret == 0);
ret = sem_init(&sem, 0, QDEPTH);
assert(ret == 0);
auto DoGetFut = std::async(std::launch::async, DoGet); // start the completion reaper first
// preallocate buffers
for (int i = 0; i < QDEPTH; i++)
{
char* buf ;
ret = posix_memalign((void**)&buf, 4096, BLKSIZE);
assert(ret == 0);
buffers.push_back(buf);
}
fd = open(deviceName, O_DIRECT | O_RDONLY); // O_DIRECT bypasses the page cache
assert(fd >= 0);
off_t offset = 0;
struct timeval start;
gettimeofday(&start, 0);
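// the timer spans both submission and completion: 'end' is taken only
// after DoGetFut.wait() confirms that every read has finished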
std::mt19937 generator (getpid());
// generate random block numbers in [startBlock, startBlock + numBlocks] (inclusive);
// numBlocks == 0 degenerates to re-reading the same block
std::uniform_int_distribution<off_t> offsetgen(startBlock, startBlock + numBlocks);
for (int j = 0; j < numRounds; j++)
{
iocb mycb[numPerRound];
iocb* posted[numPerRound];
bzero(mycb, sizeof(iocb) * numPerRound);
for (int i = 0; i < numPerRound; i++)
{
// the same buffer may be in flight for two different reads;
// that's fine - this program does not validate the data read
char* iobuf = buffers[i];
iocb* cb = &mycb[i];
offset = offsetgen(generator) * BLKSIZE;
io_prep_pread(cb, fd, iobuf, BLKSIZE, offset);
cb->data = iobuf;
posted[i] = cb;
sem_wait(&sem); // block until an in-flight slot is free
}
int ret = 0;
do {
ret = io_submit(ioctx, numPerRound, posted);
} while (ret == -EINTR);
assert(ret == numPerRound); // partial submission is treated as fatal here
}
DoGetFut.wait();
struct timeval end;
gettimeofday(&end, 0);
uint64_t diff = ((end.tv_sec - start.tv_sec) * 1000000) + (end.tv_usec - start.tv_usec);
io_queue_release(ioctx);
std::cout
<< "ops=" << numRounds * numPerRound
<< " iops=" << (numRounds * numPerRound *(uint64_t)1000000)/diff
<< " region-size=" << (numBlocks * BLKSIZE)
<< std::endl;
}
Almost certainly this is down to the internal structure of the drive. Internally the drive is built from many flash chips and may have multiple memory buses (channels). If your requests fall within a small range, they all resolve to a single chip (or a few chips) and have to be queued one behind another. If you access across the whole device, the requests are spread over many internal chips and buses and can run in parallel, so the drive delivers more throughput.
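To make that intuition concrete, here is a minimal toy simulation (my own sketch, not a model of the P3600 specifically). It assumes blocks are striped round-robin across a hypothetical NUM_CHANNELS independent channels and that aggregate throughput scales with the number of distinct channels a workload keeps busy:

/**
 * $ g++ channel_sim.cpp -o channel_sim -std=c++11 -O2
 */
#include <iostream>
#include <random>
#include <set>

int main()
{
    constexpr int NUM_CHANNELS = 16;  // assumed internal parallelism, purely illustrative
    constexpr int NUM_SAMPLES = 100000;
    std::mt19937 gen(42);
    for (long region : {1, 10, 50, 100, 800})
    {
        // random block numbers within [0, region), as in the test program
        std::uniform_int_distribution<long> blockgen(0, region - 1);
        std::set<long> channelsHit;
        for (int i = 0; i < NUM_SAMPLES; i++)
            channelsHit.insert(blockgen(gen) % NUM_CHANNELS); // round-robin striping
        std::cout << "region=" << region
                  << " channels-busy=" << channelsHit.size()
                  << " relative-throughput~" << channelsHit.size() << "x"
                  << std::endl;
    }
}

The toy model saturates once the region covers all NUM_CHANNELS channels; a real drive keeps scaling beyond that point because each channel also fans out to multiple dies and planes, which would be consistent with the IOPS in the question continuing to climb up to an 800-block region.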