Suppose I use NSight Systems to profile my program, and create an SQLite 3 database, as follows:
nsys profile -o /path/to/db --export=sqlite /path/to/executable --arg1=val1 --arg2
What exactly do I do now to obtain the execution times of my various kernel invocations?
The CUPTI documentation (for CUDA 11.2) says:
3.29. CUpti_ActivityKernel4 Struct Reference [CUPTI Activity API]
This activity record represents a kernel execution (CUPTI_ACTIVITY_KIND_KERNEL and CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
And these are two names of tables in the SQLite3 output DB. Here's how to query them:
sqlite3 -csv /path/to/db.sqlite 'SELECT end-start AS duration FROM CUPTI_ACTIVITY_KIND_KERNEL;'
sqlite3 -csv /path/to/db.sqlite 'SELECT names.value AS name, end - start FROM CUPTI_ACTIVITY_KIND_KERNEL AS k JOIN StringIds AS names ON k.demangledName = names.id;'
It is also educational to run:
sqlite3 /path/to/db.sqlite
and then enter
.schema
to get the SQL creation command for all table in the schema. That would typically look like the following (with CUDA 11.2 and nsys 2020.4.3):
sqlite> .schema
CREATE TABLE StringIds (id INTEGER NOT NULL PRIMARY KEY, value TEXT NOT NULL);
CREATE TABLE ProcessStreams (globalPid INTEGER NOT NULL, filenameId INTEGER NOT NULL, contentId INTEGER NOT NULL);
CREATE TABLE SCHED_EVENTS (start INTEGER NOT NULL, cpu INTEGER NOT NULL, isSchedIn INTEGER NOT NULL, globalTid INTEGER);
CREATE TABLE COMPOSITE_EVENTS (id INTEGER NOT NULL PRIMARY KEY, start INTEGER NOT NULL, cpu INTEGER, threadState INTEGER, globalTid INTEGER, cpuCycles INTEGER NOT NULL);
CREATE TABLE UnwindMethodType (number INTEGER PRIMARY KEY, name TEXT NOT NULL);
CREATE TABLE SAMPLING_CALLCHAINS (id INTEGER NOT NULL REFERENCES COMPOSITE_EVENTS, symbol INTEGER NOT NULL, module INTEGER NOT NULL, kernelMode INTEGER, thumbCode INTEGER, unresolved INTEGER, specialEntry INTEGER, originalIP INTEGER, unwindMethod INTEGER REFERENCES UnwindMethodType(number), stackDepth INTEGER NOT NULL, PRIMARY KEY (id, stackDepth));
CREATE TABLE PROFILER_OVERHEAD (start INTEGER NOT NULL, end INTEGER NOT NULL, globalTid INTEGER, correlationId INTEGER, nameId INTEGER NOT NULL, returnValue INTEGER NOT NULL);
CREATE TABLE OSRT_API (start INTEGER NOT NULL, end INTEGER NOT NULL, eventClass INTEGER NOT NULL, globalTid INTEGER, correlationId INTEGER, nameId INTEGER NOT NULL, returnValue INTEGER NOT NULL, nestingLevel INTEGER, callchainId INTEGER NOT NULL);
CREATE TABLE OSRT_CALLCHAINS (id INTEGER NOT NULL, symbol INTEGER NOT NULL, module INTEGER NOT NULL, kernelMode INTEGER, thumbCode INTEGER, unresolved INTEGER, specialEntry INTEGER, originalIP INTEGER, unwindMethod INTEGER REFERENCES UnwindMethodType(number), stackDepth INTEGER NOT NULL, PRIMARY KEY (id, stackDepth));
CREATE TABLE CUPTI_ACTIVITY_KIND_RUNTIME (start INTEGER NOT NULL, end INTEGER NOT NULL, eventClass INTEGER NOT NULL, globalTid INTEGER, correlationId INTEGER, nameId INTEGER NOT NULL, returnValue INTEGER NOT NULL, callchainId INTEGER REFERENCES CUDA_CALLCHAINS(id));
CREATE TABLE CUPTI_ACTIVITY_KIND_MEMCPY (start INTEGER NOT NULL, end INTEGER NOT NULL, deviceId INTEGER NOT NULL, contextId INTEGER NOT NULL, streamId INTEGER NOT NULL, correlationId INTEGER, globalPid INTEGER, bytes INTEGER NOT NULL, copyKind INTEGER NOT NULL, deprecatedSrcId INTEGER, srcKind INTEGER, dstKind INTEGER, srcDeviceId INTEGER, srcContextId INTEGER, dstDeviceId INTEGER, dstContextId INTEGER, graphNodeId INTEGER);
CREATE TABLE CUPTI_ACTIVITY_KIND_SYNCHRONIZATION (start INTEGER NOT NULL, end INTEGER NOT NULL, deviceId INTEGER NOT NULL, contextId INTEGER NOT NULL, streamId INTEGER NOT NULL, correlationId INTEGER, globalPid INTEGER, syncType INTEGER NOT NULL, eventId INTEGER NOT NULL);
CREATE TABLE CUPTI_ACTIVITY_KIND_KERNEL (start INTEGER NOT NULL, end INTEGER NOT NULL, deviceId INTEGER NOT NULL, contextId INTEGER NOT NULL, streamId INTEGER NOT NULL, correlationId INTEGER, globalPid INTEGER, demangledName INTEGER NOT NULL, shortName INTEGER NOT NULL, launchType INTEGER, cacheConfig INTEGER, registersPerThread INTEGER NOT NULL, gridX INTEGER NOT NULL, gridY INTEGER NOT NULL, gridZ INTEGER NOT NULL, blockX INTEGER NOT NULL, blockY INTEGER NOT NULL, blockZ INTEGER NOT NULL, staticSharedMemory INTEGER NOT NULL, dynamicSharedMemory INTEGER NOT NULL, localMemoryPerThread INTEGER NOT NULL, localMemoryTotal INTEGER NOT NULL, gridId INTEGER NOT NULL, sharedMemoryExecuted INTEGER, graphNodeId INTEGER);
CREATE TABLE ThreadNames (nameId INTEGER NOT NULL, priority INTEGER, globalTid INTEGER);
CREATE TABLE TARGET_INFO_CUDA_GPU (vmId INTEGER NOT NULL, name TEXT NOT NULL, pciBusId TEXT, globalMemoryBandwidth INTEGER NOT NULL, globalMemorySize INTEGER NOT NULL, constantMemorySize INTEGER NOT NULL, l2CacheSize INTEGER NOT NULL, numThreadsPerWarp INTEGER NOT NULL, coreClockRate INTEGER NOT NULL, numMemcpyEngines INTEGER NOT NULL, numMultiprocessors INTEGER NOT NULL, maxIPC INTEGER NOT NULL, maxWarpsPerMultiprocessor INTEGER NOT NULL, maxBlocksPerMultiprocessor INTEGER NOT NULL, maxRegistersPerBlock INTEGER NOT NULL, maxSharedMemoryPerBlock INTEGER NOT NULL, maxThreadsPerBlock INTEGER NOT NULL, maxBlockDimX INTEGER NOT NULL, maxBlockDimY INTEGER NOT NULL, maxBlockDimZ INTEGER NOT NULL, maxGridDimX INTEGER NOT NULL, maxGridDimY INTEGER NOT NULL, maxGridDimZ INTEGER NOT NULL, computeCapabilityMajor INTEGER NOT NULL, computeCapabilityMinor INTEGER NOT NULL, deviceId INTEGER NOT NULL, pid INTEGER, maxSharedMemoryPerMultiprocessor INTEGER, maxRegistersPerMultiprocessor INTEGER);
CREATE TABLE TARGET_INFO_GPU (vmId INTEGER NOT NULL, deviceId INTEGER NOT NULL, name TEXT, busLocation TEXT, isDiscrete INTEGER);
CREATE TABLE TARGET_INFO_CUDA_NULL_STREAM (streamId INTEGER NOT NULL, hwId INTEGER NOT NULL, vmId INTEGER NOT NULL, processId INTEGER NOT NULL, deviceId INTEGER NOT NULL, contextId INTEGER NOT NULL);
CREATE TABLE TARGET_INFO_CUDA_STREAM (streamId INTEGER NOT NULL, hwId INTEGER NOT NULL, vmId INTEGER NOT NULL, processId INTEGER NOT NULL, contextId INTEGER NOT NULL, priority INTEGER NOT NULL, flag INTEGER NOT NULL);
And you can apply any SQL query to this (in SQLite's dialect of course).