Tags: c, linux, setthreadaffinitymask

How to set affinity on multiple CPUs with sched_setaffinity


I want to set affinity on multiple CPUs with sched_setaffinity as follows.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

void
pin(pid_t t, int cpu)
{
  cpu_set_t cpuset;
  CPU_ZERO(&cpuset);
  CPU_SET(cpu, &cpuset);  /* allow only this one CPU */
  if (sched_setaffinity(t, sizeof(cpu_set_t), &cpuset) != 0)
    perror("sched_setaffinity");
}

My environment is a 32-core machine: there are 4 CPUs (sockets), and each CPU has 8 cores.
I want threads 0 ~ 7 to run on the same CPU, threads 8 ~ 15 on the same CPU, and so on.
What I am unsure about is which value to pass as cpu to CPU_SET.
If the core numbers are allocated naively, i.e. cpu0 owns cores 0, 1, 2, ..., 7 and cpu1 owns cores 8, 9, ..., then cpu should simply be the thread id.
If, on the other hand, the cores are numbered round-robin, i.e. cpu0 owns cores 0, 4, 8, ... and cpu1 owns cores 1, 5, 9, ..., then cpu has to be chosen by the round-robin rule.

Which rule should I use to choose the variable cpu: the naive rule or the round-robin rule?


Solution

  • Under Linux (and other OSes) the programmer may set the CPU affinity, i.e. the set of CPUs the kernel is allowed to schedule this process on. Upon fork(), processes inherit the parent's CPU affinity. This comes in very handy if one wants to limit CPUs for whatever reason.
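
    As a small illustration of that inheritance, here is a minimal sketch (the program and its wording are mine, not part of the original question) in which the parent restricts itself to CPU 0 and the child then reads back the mask it inherited:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
      cpu_set_t set;

      /* Restrict the parent (pid 0 == calling process) to CPU 0 only */
      CPU_ZERO(&set);
      CPU_SET(0, &set);
      if (sched_setaffinity(0, sizeof(set), &set) != 0)
        perror("sched_setaffinity");

      if (fork() == 0) {
        /* Child: the inherited mask still contains only CPU 0 */
        CPU_ZERO(&set);
        sched_getaffinity(0, sizeof(set), &set);
        printf("child: %d CPU(s) allowed, CPU0 in mask: %d\n",
               CPU_COUNT(&set), CPU_ISSET(0, &set) ? 1 : 0);
        _exit(0);
      }
      wait(NULL);
      return 0;
    }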

    For example, it is generally beneficial to limit a process/thread to certain cores or to a socket so that the OS does not schedule it away -- maximising the benefits of the L1/L2 cache (when pinning to cores) or of the shared L3/LLC cache (when pinning to a socket).
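
    As a sketch of the socket case (the helper name is mine, and it assumes, as in the question, that logical CPUs 0..7 all belong to socket 0), one simply sets every core of that socket in the mask instead of a single one:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    /* Allow the calling thread to run on any core of "socket 0",
     * assuming logical CPUs 0..7 all belong to that socket. */
    static int pin_to_first_socket(void)
    {
      cpu_set_t cpuset;
      int cpu;

      CPU_ZERO(&cpuset);
      for (cpu = 0; cpu < 8; cpu++)   /* all 8 cores of the socket */
        CPU_SET(cpu, &cpuset);

      return sched_setaffinity(0, sizeof(cpuset), &cpuset);  /* pid 0 == calling thread */
    }

    int main(void)
    {
      if (pin_to_first_socket() != 0)
        perror("sched_setaffinity");
      return 0;
    }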

    Regarding your question on thread distribution: processor development has introduced Simultaneous Multithreading (SMT), called Hyper-Threading by Intel, which provides 2 logical cores (e.g. Intel Xeon) or even 4 logical cores (e.g. Intel Knights Landing, IBM POWER) per physical core. These logical cores are also represented as a "CPU" in the cpuset above. Moreover, some processors are organised into NUMA domains, where a core's access to its "own" memory is fast, while access to memory belonging to a core in another NUMA domain is slower.
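
    Whether your machine numbers its cores contiguously or round-robin is therefore not something to guess: the kernel exports the mapping in sysfs. The following sketch (it reads the standard Linux topology files; error handling is kept terse) prints which physical package (socket) each logical CPU belongs to:

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
      const long ncpus = sysconf(_SC_NPROCESSORS_CONF);
      long cpu;

      for (cpu = 0; cpu < ncpus; cpu++) {
        char path[128];
        int package = -1;
        FILE *f;

        /* Which socket does this logical CPU live on? */
        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu%ld/topology/physical_package_id",
                 cpu);
        f = fopen(path, "r");
        if (f != NULL) {
          if (fscanf(f, "%d", &package) != 1)
            package = -1;
          fclose(f);
        }
        printf("cpu %ld -> socket %d\n", cpu, package);
      }
      return 0;
    }

    If this prints 0,0,...,0,1,1,... you have the "naive" numbering from the question; 0,1,2,3,0,1,2,3,... indicates round-robin numbering. The same directory also contains core_id and thread_siblings_list, which reveal hyper-thread siblings.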

    So, as some of the comments above suggest: it depends! If your threads communicate with each other (via shared memory), they should be kept close, within the same cache hierarchy. If your threads exercise the same functional units (e.g. the FPU), scheduling two of them on the same physical core (i.e. on its 2 logical cores / hyper-threads) may be detrimental to performance.

    To play around, please find enclosed the following code:

    #define _GNU_SOURCE
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>
    #include <string.h>
    #include <errno.h>
    #include <unistd.h>
    #include <sys/param.h>
    #include <pthread.h>
    
    // The following is Linux-specific
    #include <syscall.h>            // For syscall to gettid()
    #include <sched.h>      // sched_[gs]etaffinity require _GNU_SOURCE
    
    #define ERROR(t, e) do { \
        const int __error = (e); \
        fprintf (stderr, "ERROR: %s error:%d [%s] errno:%d [%s]\n", \
                 (t), __error, strerror(__error), errno, strerror(errno)); \
        exit(__error); \
      } while(0)
    
    #ifndef MAX
    #define MAX(a,b)  ((a) > (b) ? (a) : (b))
    #endif
    #ifndef MIN
    #define MIN(a,b)  ((a) < (b) ? (a) : (b))
    #endif
    
    
    
    /* Local function declarations */
    void print_schedaffinity(const char * text, const cpu_set_t cpuset, const int max_cpus);
    void * thread_func(void * arg);
    
    /* Local type definitions */
    struct thread_data {
      pthread_t thread;
      int max_cpu;
      int thread_num;
      void * thread_work;
    };
    
    /* The number of CPUs this process may run on (assuming the allowed CPUs
     * are contiguous from 0).  Specifying a CPU beyond the available ones in a
     * call to sched_setaffinity gets us -1 and errno==EINVAL.
     */
    static int max_cpu_available = 0;
    
    
    /* Local function definitions */
    void print_schedaffinity(const char * text, const cpu_set_t cpuset, const int max_cpus) {
      const int max = MIN(8*sizeof(cpu_set_t), max_cpus);
      int i;
    
      printf("PRINT CPU AFFINITY %s:\n", text);
      printf("cpus:\t");
      for (i = 0; i < max; i++) {
        printf (" %3d", i);
        if (i % 8 == 7)
          printf(" | ");
      }
    
      printf("\nmask:\t");
      for (i = 0; i < max; i++) {
        if (CPU_ISSET(i, &cpuset))
          printf ("   X");
        else
          printf ("    ");
    
        if (i % 8 == 7)
          printf(" | ");
      }
      printf("\n");
    }
    
    
    void * thread_func(void * arg) {
      struct thread_data * thread_data = (struct thread_data *)arg;
      const size_t sizeof_cpuset = sizeof(cpu_set_t);
      char print_buffer[64];
      cpu_set_t cpuset;
      long tid;
      int rc;
    
      CPU_ZERO(&cpuset);
      CPU_SET(thread_data->thread_num % max_cpu_available, &cpuset);
    
      /* We set the affinity of the CALLING thread, aka 0 */
      tid = syscall(SYS_gettid);
      printf("PID:%ld tid:%ld thread_num:%d\n",
             getpid(), tid, thread_data->thread_num);
      rc = sched_setaffinity(0, sizeof_cpuset, &cpuset);
      if (0 != rc)
        ERROR("sched_setaffinity", rc);
    
    
      /* Dooo SCHTUF now */
    
      /* Somewhat sort the output... */
      sleep (thread_data->thread_num);
    
      snprintf (print_buffer, sizeof(print_buffer),
                "in thread %d after sched_setaffinity", thread_data->thread_num);
    
      print_schedaffinity(print_buffer, cpuset, 8);
    
      return NULL;
    }
    
    
    int main (int argc, char * argv[])
    {
      const int NUM = 8;
      const pid_t pid = getpid();
      const size_t size_cpu_set = sizeof(cpu_set_t);
      cpu_set_t cpuset;
      int rc;
      int i;
    
      /* Get, and print the original CPU affinity setting (scheduling is not limited, i.e. all cores may run this PID) */
      CPU_ZERO (&cpuset);
      rc = sched_getaffinity(pid, size_cpu_set, &cpuset);
      if (0 != rc)
        ERROR("sched_getaffinity", rc);
      print_schedaffinity("in main", cpuset, 8);
    
      /* Find the first CPU that is NOT in our affinity mask -- assuming the
       * allowed CPUs are contiguous from 0, this is the number of CPUs we may use */
      for (i = 0; i < 8 * size_cpu_set; i++) {
        if (!CPU_ISSET(i, &cpuset)) {
          max_cpu_available = i;
          break;
        }
      }
    
    
      /* Limit the process to the first core, only */
      CPU_ZERO (&cpuset);
      CPU_SET (0, &cpuset);
      rc = sched_setaffinity (pid, size_cpu_set, &cpuset);
      if (0 != rc)
        ERROR("sched_setaffinity", rc);
      print_schedaffinity("in main after sched_setaffinity", cpuset, 8);
    
    
      /* Let's start NUM threads and have them limit their scheduling */
      sleep(1);
      struct thread_data * thread_data = (struct thread_data*)malloc(sizeof(struct thread_data) * NUM);
      for (i = 0; i < NUM; i++) {
        thread_data[i].thread_num = i;
        pthread_create (&thread_data[i].thread, NULL, thread_func, &thread_data[i]);
      }
    
      /* And wait for them to finish... */
      for (i = 0; i < NUM; i++) {
        pthread_join (thread_data[i].thread, NULL);
      }
      return 0;
    }
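
    To build and run it (a typical invocation; the source file name here is just an example):

    gcc -O2 -Wall -pthread -o affinity_demo affinity_demo.c
    ./affinity_demo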
    

    Edit: I should clarify that Apple's OS X since 10.5 (Leopard) also offers thread affinity, as described in https://developer.apple.com/library/mac/releasenotes/Performance/RN-AffinityAPI/
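
    A minimal sketch of that Apple API, assuming OS X 10.5 or later (note that it sets an affinity tag, a hint that threads sharing the tag should share caches, rather than a hard pin to a CPU; the helper name is mine):

    #include <mach/mach.h>
    #include <mach/thread_policy.h>
    #include <pthread.h>
    #include <stdio.h>

    /* Hint that the calling thread belongs to affinity group `tag`. */
    static int set_affinity_tag(int tag)
    {
      thread_affinity_policy_data_t policy = { tag };

      return thread_policy_set(pthread_mach_thread_np(pthread_self()),
                               THREAD_AFFINITY_POLICY,
                               (thread_policy_t)&policy,
                               THREAD_AFFINITY_POLICY_COUNT);
    }

    int main(void)
    {
      if (set_affinity_tag(1) != KERN_SUCCESS)
        fprintf(stderr, "thread_policy_set failed\n");
      return 0;
    }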