c++multithreadinglinux-kernelnetlink

cannot understand this "message sequence mismatch error"


I've used the program answered in this link with some modifications. Below is my modified code:

#include <linux/netlink.h>

#include <netlink/netlink.h>
#include <netlink/route/qdisc.h>
#include <netlink/route/qdisc/plug.h>
#include <netlink/socket.h>

#include <atomic>
#include <csignal>
#include <iostream>
#include <stdexcept>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <thread>
#include <queue>
#include <chrono>

/**
 * Netlink route socket.
 */
struct Socket {
  Socket() : handle{nl_socket_alloc()} {

    if (handle == nullptr) {
      throw std::runtime_error{"Failed to allocate socket!"};
    }

    if (int err = nl_connect(handle, NETLINK_ROUTE); err < 0) {
      throw std::runtime_error{"Unable to connect netlink socket: " +
                               std::string{nl_geterror(err)}};
    }
  }

  Socket(const Socket &) = delete;
  Socket &operator=(const Socket &) = delete;
  Socket(Socket &&) = delete;
  Socket &operator=(Socket &&) = delete;

  ~Socket() { nl_socket_free(handle); }

  struct nl_sock *handle;
};

/**
 * Read all links from netlink socket.
 */
struct LinkCache {
  explicit LinkCache(Socket *socket) : handle{nullptr} {
    if (int err = rtnl_link_alloc_cache(socket->handle, AF_UNSPEC, &handle);
        err < 0) {
      throw std::runtime_error{"Unable to allocate link cache: " +
                               std::string{nl_geterror(err)}};
    }
  }

  LinkCache(const LinkCache &) = delete;
  LinkCache &operator=(const LinkCache &) = delete;
  LinkCache(LinkCache &&) = delete;
  LinkCache &operator=(LinkCache &&) = delete;

  ~LinkCache() { nl_cache_free(handle); }

  struct nl_cache *handle;
};

/**
 * Link (such as "eth0" or "wlan0").
 */
struct Link {
  Link(LinkCache *link_cache, const std::string &iface)
      : handle{rtnl_link_get_by_name(link_cache->handle, iface.c_str())} {

    if (handle == nullptr) {
      throw std::runtime_error{"Link does not exist:" + iface};
    }
  }

  Link(const Link &) = delete;
  Link &operator=(const Link &) = delete;
  Link(Link &&) = delete;
  Link &operator=(Link &&) = delete;

  ~Link() { rtnl_link_put(handle); }

  struct rtnl_link *handle;
};

/**
 * Queuing discipline.
 */
struct QDisc {
  QDisc(const std::string &iface, const std::string &kind)
      : handle{rtnl_qdisc_alloc()} {
    if (handle == nullptr) {
      throw std::runtime_error{"Failed to allocate qdisc!"};
    }

    struct rtnl_tc *tc = TC_CAST(handle);

    // Set link
    LinkCache link_cache{&socket};
    Link link{&link_cache, iface};
    rtnl_tc_set_link(tc, link.handle);

    // Set parent qdisc
    uint32_t parent = 0;

    if (int err = rtnl_tc_str2handle("root", &parent); err < 0) {
      throw std::runtime_error{"Unable to parse handle: " +
                               std::string{nl_geterror(err)}};
    }

    rtnl_tc_set_parent(tc, parent);

    // Set kind (e.g. "plug")
    if (int err = rtnl_tc_set_kind(tc, kind.c_str()); err < 0) {
      throw std::runtime_error{"Unable to set kind: " +
                               std::string{nl_geterror(err)}};
    }
  }

  QDisc(const QDisc &) = delete;
  QDisc &operator=(const QDisc &) = delete;
  QDisc(QDisc &&) = delete;
  QDisc &operator=(QDisc &&) = delete;

  ~QDisc() {
    if (int err = rtnl_qdisc_delete(socket.handle, handle); err < 0) {
      std::cerr << "Unable to delete qdisc: " << nl_geterror(err) << std::endl;
    }

    rtnl_qdisc_put(handle);
  }

  void send_msg() {
    int flags = NLM_F_CREATE;

    if (int err = rtnl_qdisc_add(socket.handle, handle, flags); err < 0) {
      throw std::runtime_error{"Unable to add qdisc: " +
                               std::string{nl_geterror(err)}};
    }
  }

  Socket socket;
  struct rtnl_qdisc *handle;
};

/**
 * Queuing discipline for plugging traffic.
 */
class Plug {
public:
  Plug(const std::string &iface, uint32_t limit, std::string msg)
      : qdisc_{iface, "plug"} {

    rtnl_qdisc_plug_set_limit(qdisc_.handle, limit);
    qdisc_.send_msg();

    // set_enabled(enabled_);
    set_msg(msg);
  }

  // void set_enabled(bool enabled) {
  //   if (enabled) {
  //     rtnl_qdisc_plug_buffer(qdisc_.handle);
  //   } else {
  //     rtnl_qdisc_plug_release_one(qdisc_.handle);
  //   }

  //   qdisc_.send_msg();
  //   enabled_ = enabled;
  // }

  void set_msg(std::string msg) {
    if (msg == "buffer") {
      int ret = rtnl_qdisc_plug_buffer(qdisc_.handle);
      //std::cout<<strerror(ret);
    } else if(msg == "commit") {
      int ret = rtnl_qdisc_plug_release_one(qdisc_.handle);
      //std::cout<<strerror(ret);
    } else {
      int ret = rtnl_qdisc_plug_release_indefinite(qdisc_.handle);
      //std::cout<<strerror(ret);   
    }

    qdisc_.send_msg();
  }  

  // bool is_enabled() const { return enabled_; }

private:
  QDisc qdisc_;

  // bool enabled_;
};

std::atomic<bool> quit{false};

void exit_handler(int /*signal*/) { quit = true; }

// this function busy wait on job queue until there's something 
//and calls release operation i.e. unplug qdisc to release output packets 
//generated for a particular epoch
void transmit_ckpnt(std::queue<int> &job_queue, Plug &plug){

  while(true){

      while(!job_queue.empty()){

        int id = job_queue.front();
        job_queue.pop();
        std::string s = std::to_string(id);

        std::cout<<"called from parallel thread "<<s<<"\n"; 

        //release buffer
        plug.set_msg("commit");  
      }
  }

}

int main() {
  std::string iface{"veth-host"};
  constexpr uint32_t buffer_size = 10485760;
  // bool enabled = true;

  Plug plug{iface, buffer_size, "buffer"};

  /**
   * Set custom exit handler to ensure destructor runs to delete qdisc.
   */
  struct sigaction sa {};
  sa.sa_handler = exit_handler;
  sigfillset(&sa.sa_mask);
  sigaction(SIGINT, &sa, nullptr);

  pid_t wpid;
  int status = 0;
  std::queue<int> job_queue;
  int ckpnt_no  = 1;

  std::thread td(transmit_ckpnt, std::ref(job_queue), std::ref(plug));
  plug.set_msg("indefinite");

  while(true){
    //plug the buffer at start of the epoch
    plug.set_msg("buffer");

    //wait for completion of epoch
    sleep(4);   
    
    job_queue.push(ckpnt_no);
    ckpnt_no += 1;  
  }

  plug.set_msg("indefinite");
  td.join();
  
  // while (!quit) {
  //   std::cout << "Plug set to " << plug.is_enabled() << std::endl;
  //   std::cout << "Press <Enter> to continue.";
  //   std::cin.get();

  //   plug.set_enabled(!plug.is_enabled());
  // }

  return EXIT_SUCCESS;
}

Walkthrough of code: This program creates a plug/unplug type of qdiscs in which during plug operation, the network packets are buffered and during an unplug operation the network packets are released from the first plug(the front of the queuing discipline qdisc) to the second plug in the qdisc. The above program is working correctly if the plug and unplug operation are there alternately. But i want to use it in the way that it was built for, i.e like how this is mentioned in this link , i.e.

     TCQ_PLUG_BUFFER (epoch i)
         TCQ_PLUG_BUFFER (epoch i+1) 
             TCQ_PLUG_RELEASE_ONE (for epoch i)
                 TCQ_PLUG_BUFFER (epoch i+2)
                     ..............................so on

In my program, the main thread start buffering in the begining of every epoch, and continued execution. The job thread takes the job id from job queue and releases the buffered packets from the head of the queue to next plug. But this gives below error:

./a.out: /lib/x86_64-linux-gnu/libnl-3.so.200: no version information available (required by ./a.out)
./a.out /usr/lib/x86_64-linux-gnu/libnl-route-3.so.200: no version information available (required by ./a.out)
called from parallel thread 1
called from parallel thread 2
called from parallel thread 3
called from parallel thread 4
called from parallel thread 5
called from parallel thread 6
called from parallel thread 7
terminate called after throwing an instance of 'std::runtime_error'
 what(): Unable to add qdisc: Message sequence number mismatch
Aborted

Unable to understand what is this and why it is giving this error, when release was performed in sequentially in the main thread then it is working but now when there is another thread to perform the release operation which just checks whether the job_queue is empty or not and perform the release operation until there is something inside job queue and busy wait if job_queue is empty.


Solution

  • The expected sequence counter is stored by libnl as part of the nl_sock struct (reference). When multiple threads call libnl functions, this can cause inconsistencies, such as a data race (two threads writing to the sequence counter at same time) or a race condition (time-of-check-time-of-use problem, where one thread checks if the counter satisfies some condition, then performs some operation, but in between the other thread modifies the counter). See here for more details on data races and race conditions.

    Sidenote: Both g++ and clang++ support the -fsanitize=thread flag, which automatically inserts additional debug code into the binary that can help to detect this kind of data races (reference). Though in this case, it might not be as useful, since you would also have to get libnl compiled with this flag, which might not be easy.

    From the libnl documentation (reference):

    The next step is to check the sequence number of the message against
    the currently expected sequence number. The application may provide
    its own sequence number checking algorithm by setting the callback
    function NL_CB_SEQ_CHECK to its own implementation. In fact, calling
    nl_socket_disable_seq_check() to disable sequence number checking will
    do nothing more than set the NL_CB_SEQ_CHECK hook to a function which
    always returns NL_OK.
    

    This leaves us with the following options:

    1. Use a mutex to guard all access to libnl functions that might modify the sequence counter.

    2. Disable sequence counter checking using nl_socket_disable_seq_check.

    From my point of view, 1) is the more robust solution. If you care more about performance than robustness, then you could go with 2).

    Option 1: Use a mutex to guard access to libnl functions

    Include the mutex header from the standard library:

    #include <mutex>
    

    In the Plug class, add a std::mutex as a member:

    class Plug {
    ...
    private:
      std::mutex seq_counter_mutex_;
    ...
    };
    

    At the beginning of set_msg, use a std::lock_guard for acquiring the mutex for the duration of the function. This ensures that only one thread can enter the function at the same time:

      void set_msg(std::string msg) {
        std::lock_guard guard{seq_counter_mutex_};
        ...
      }
    

    Option 2: Disable sequence number checking

    In the Socket class, at the end of the constructor you can disable sequence counter checking with:

      nl_socket_disable_seq_check(handle);