When I want to measure how much CPU time a thread of my program spends (both user and system), I use an API like this:
struct timespec start, end;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
// do something here
// ...
// ...
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
const double elapsed = (end.tv_sec-start.tv_sec)*1e9 + (end.tv_nsec-start.tv_nsec);
I was wondering what the minimum elapsed time between two consecutive calls is, so I tried the following code:
#include <iostream>
#include <time.h>

int main() {
    const int c_type[] = {CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_THREAD_CPUTIME_ID};
    struct timespec start, end;
    const size_t n_iter = 1024*1024;
    for (size_t i = 0; i < sizeof(c_type)/sizeof(int); ++i) {
        double accum = 0.0;
        for (size_t j = 0; j < n_iter; ++j) {
            clock_gettime(c_type[i], &start);
            clock_gettime(c_type[i], &end);
            const double elapsed = (end.tv_sec-start.tv_sec)*1e9 + (end.tv_nsec-start.tv_nsec);
            accum += elapsed;
        }
        std::cout << "[" << i << "] elapsed: " << accum/n_iter << std::endl;
    }
}
To my surprise I get the following timings:
[0] elapsed: 19.8536 // CLOCK_REALTIME
[1] elapsed: 19.8697 // CLOCK_MONOTONIC
[2] elapsed: 88.3246 // CLOCK_THREAD_CPUTIME_ID
This means that the minimum time between two consecutive CLOCK_THREAD_CPUTIME_ID calls is approximately 88 ns (these numbers come from a 9950X3D running Ubuntu 24.04).
Why is that the case? Why is it slower than CLOCK_REALTIME, for example?
I was under the impression that CLOCK_THREAD_CPUTIME_ID was implemented roughly by reading the TSC via the rdtsc instruction and applying some adjustments?
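For reference, this is the kind of cost I had in mind, i.e. reading the TSC directly with the __rdtsc() intrinsic (x86-only sketch; the 3.0 GHz used for the conversion below is just a placeholder, not my machine's actual TSC frequency):

#include <x86intrin.h>
#include <cstdint>
#include <iostream>

int main() {
    // Two back-to-back TSC reads; on a modern x86 core this is only a
    // handful of cycles, well below the ~20 ns measured above for CLOCK_REALTIME.
    const uint64_t t0 = __rdtsc();
    const uint64_t t1 = __rdtsc();
    // Converting cycles to nanoseconds needs the TSC frequency; 3.0 GHz is a placeholder.
    const double tsc_ghz = 3.0;
    std::cout << "delta: " << (t1 - t0) << " cycles (~"
              << (t1 - t0) / tsc_ghz << " ns)" << std::endl;
}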
Looking at the VDSO implementation, the relevant function is __cvdso_clock_gettime_common, defined in lib/vdso/gettimeofday.c:
static __always_inline int
__cvdso_clock_gettime_common(const struct vdso_data *vd, clockid_t clock,
                             struct __kernel_timespec *ts)
{
    u32 msk;

    /* Check for negative values or invalid clocks */
    if (unlikely((u32) clock >= MAX_CLOCKS))
        return -1;

    /*
     * Convert the clockid to a bitmask and use it to check which
     * clocks are handled in the VDSO directly.
     */
    msk = 1U << clock;
    if (likely(msk & VDSO_HRES))
        vd = &vd[CS_HRES_COARSE];
    else if (msk & VDSO_COARSE)
        return do_coarse(&vd[CS_HRES_COARSE], clock, ts);
    else if (msk & VDSO_RAW)
        vd = &vd[CS_RAW];
    else
        return -1;

    return do_hres(vd, clock, ts);
}
If -1 is returned, the VDSO can't handle that clock, and a real syscall has to be performed instead, which is much slower.
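For context, the wrapper in the same file that performs that fallback looks roughly like this (simplified; exact names can differ between kernel versions):

static __maybe_unused int
__cvdso_clock_gettime_data(const struct vdso_data *vd, clockid_t clock,
                           struct __kernel_timespec *ts)
{
    int ret = __cvdso_clock_gettime_common(vd, clock, ts);

    /* Anything the VDSO cannot resolve goes through a real syscall. */
    if (unlikely(ret))
        return clock_gettime_fallback(clock, ts);
    return 0;
}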
Elsewhere, in datapage.h, we see the exact list of clocks that those masks cover:
#define VDSO_HRES (BIT(CLOCK_REALTIME) | \
BIT(CLOCK_MONOTONIC) | \
BIT(CLOCK_BOOTTIME) | \
BIT(CLOCK_TAI))
#define VDSO_COARSE (BIT(CLOCK_REALTIME_COARSE) | \
BIT(CLOCK_MONOTONIC_COARSE))
#define VDSO_RAW (BIT(CLOCK_MONOTONIC_RAW))
This does not include BIT(CLOCK_THREAD_CPUTIME_ID), so that clock is clearly going to be slow: every call has to go through a real syscall.
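One way to check that the syscall transition really is the extra cost: force the same clock through syscall(2), bypassing the VDSO, and compare it with the normal path. A sketch (not part of the question's benchmark; numbers will vary by machine), where the forced-syscall figure should land much closer to the CLOCK_THREAD_CPUTIME_ID number than to the VDSO one:

#include <iostream>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>

// Times n_iter back-to-back pairs of the given call and returns the mean gap in ns.
template <typename F>
double bench(F f, size_t n_iter) {
    struct timespec start, end;
    double accum = 0.0;
    for (size_t j = 0; j < n_iter; ++j) {
        f(&start);
        f(&end);
        accum += (end.tv_sec - start.tv_sec) * 1e9 + (end.tv_nsec - start.tv_nsec);
    }
    return accum / n_iter;
}

int main() {
    const size_t n_iter = 1024 * 1024;
    // Fast path: glibc resolves CLOCK_MONOTONIC through the VDSO.
    std::cout << "vdso:    "
              << bench([](timespec* ts) { clock_gettime(CLOCK_MONOTONIC, ts); }, n_iter)
              << " ns" << std::endl;
    // Slow path: same clock, but forced through a real syscall.
    std::cout << "syscall: "
              << bench([](timespec* ts) { syscall(SYS_clock_gettime, CLOCK_MONOTONIC, ts); }, n_iter)
              << " ns" << std::endl;
}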
Now, would it be possible to implement it in the VDSO? Probably, but nobody has bothered yet (and perhaps it would imply extra overhead every time the scheduler is invoked?).