c++linuxshared-memorymmapmemset

memset raises a bus error (SIGBUS) after mmap when initializing shared memory


The scenario: one process writes the program's metric stats data to a shared memory file, and another process reads the file and sends the data to a metric server. A bus error and core dump occur a few minutes after the writer process starts, when it tries to memset the shared memory. The simplified code is:

Init() {
    tpid_ = get_thread_pid();
    table_name_ = executable_name();
    max_counter_num_ = 1024;
    MI << "tpid=" << tpid_ << ", table_name=" << table_name_;

    shm_root_ = "/dev/shm/counters";
    shm_dir_ = shm_root_ + "/" + table_name_;
    char buf[1024];
    sprintf(buf, "/counters/%s/%d", table_name_.c_str(), tpid_);
    shm_file_ = buf;
    table_size_ = max_counter_num_ * sizeof(Entry) + sizeof(Table);
    shm_size_ = table_size_;
    MI << "shm_file_=" << shm_file_ << ", shm_size_=" << shm_size_;
    table_ = NULL;
    entries_ = NULL;
    err_ = kOk;
    errno_ = 0;

    Attach();
}

int Attach() {
    int err;
    // only one shm file for a given thread
    int oflag = O_CREAT | O_RDWR | O_EXCL;
    int dir_mode = S_IRWXO | S_IRWXU | S_IRWXG;
    int file_mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;

    umask(0);
    err = mkdir(shm_root_.c_str(), dir_mode);
    if (err && errno != EEXIST) {
        MW << "dir=" << shm_root_ << ", mkdir,err=" << errno << ":" << strerror(errno);
        err_ = kSysMkdir;
        errno_ = errno;
        return kSysMkdir;
    }

    err = mkdir(shm_dir_.c_str(), dir_mode);
    if (err && errno != EEXIST) {
        MW << "dir=" << shm_dir_ << ", mkdir,err=" << errno << ":" << strerror(errno);
        err_ = kSysMkdir;
        errno_ = errno;
        return kSysMkdir;
    }

    shm_fd_ = shm_open(shm_file_.c_str(), oflag, file_mode);
    if (shm_fd_ == -1) {
        MW << "file=" << shm_file_ << ", shm_open,err=" << errno << ":" << strerror(errno);
        err_ = kSysShmOpen;
        errno_ = errno;
        return kSysShmOpen;
    }

    err = ftruncate(shm_fd_, shm_size_);
    if (err == -1) {
        err_ = kSysFtruncate;
        errno_ = errno;
        MW << "file=" << shm_file_ << ", shm_fd=" << shm_fd_ << ", ftruncate,err=" << errno << ":" << strerror(errno);
        return kSysFtruncate;
    }
    MI << "ftruncate success, return:" << err;

    int proto = PROT_READ | PROT_WRITE;
    int flags = MAP_SHARED;

    void *addr = mmap(NULL, shm_size_, proto, flags, shm_fd_, 0);

    if (addr == MAP_FAILED) {
        MW << "file=" << shm_file_ << ", shm_fd=" << shm_fd_ << ", mmap,err=" << errno << ":" << strerror(errno);
        err_ = kSysMmap;
        errno_ = errno;
        return kSysMmap;
    }
    MI << "mmap success shm_file=" << shm_file_ << ", shm_fd=" << shm_fd_ << ", mmap_addr=" << addr << ", shm_size_=" << shm_size_;
    memset(addr, 0, shm_size_);
    MI << "mmemset success";
    table_ = (Table *)addr;
    entries_ = (Entry *)((char *)addr + sizeof(Table));

    table_->n_entry = 0;
    table_->max_entry_num = max_counter_num_;
    table_->last_timestamp = time(NULL);
    gethostname(table_->endpoint, MAX_HOSTNAME_LEN);
    MI << "Attach success shm_file=" << shm_file_ << ", shm_fd=" << shm_fd_ << ", mmap_addr=" << addr;
    return 0;
}

The memset in the Attach method triggers the bus error after Attach has run several times. The log is:

I0509 10:34:27.081708   920 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:27.081895   920 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:27.081923   920 shm_counter.cpp:70] tpid=920, table_name=knn_realtime
I0509 10:34:27.081974   920 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/920, shm_size_=8340560
I0509 10:34:27.082068   920 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:27.082094   920 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/920, shm_fd=12, mmap_addr=0x7fb4d700b000, shm_size_=8340560
I0509 10:34:27.087743   920 shm_counter.cpp:153] mmemset success
I0509 10:34:27.087836   920 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/920, shm_fd=12, mmap_addr=0x7fb4d700b000
I0509 10:34:29.168845   867 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:29.189066   867 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:29.192279   867 shm_counter.cpp:70] tpid=867, table_name=knn_realtime
I0509 10:34:29.192375   867 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/867, shm_size_=8340560
I0509 10:34:29.192492   867 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:29.192528   867 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/867, shm_fd=14, mmap_addr=0x7fb4d660b000, shm_size_=8340560
I0509 10:34:29.198365   867 shm_counter.cpp:153] mmemset success
I0509 10:34:29.198501   867 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/867, shm_fd=14, mmap_addr=0x7fb4d660b000
I0509 10:34:33.079138   923 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:33.079949   923 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:33.080663   923 shm_counter.cpp:70] tpid=923, table_name=knn_realtime
I0509 10:34:33.080742   923 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/923, shm_size_=8340560
I0509 10:34:33.080799   923 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:33.080823   923 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/923, shm_fd=17, mmap_addr=0x7fb4d5c0b000, shm_size_=8340560
I0509 10:34:33.086556   923 shm_counter.cpp:153] mmemset success
I0509 10:34:33.086634   923 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/923, shm_fd=17, mmap_addr=0x7fb4d5c0b000
I0509 10:34:35.251169   853 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:35.251964   853 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:35.252004   853 shm_counter.cpp:70] tpid=853, table_name=knn_realtime
I0509 10:34:35.252027   853 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/853, shm_size_=8340560
I0509 10:34:35.252079   853 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:35.252110   853 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/853, shm_fd=19, mmap_addr=0x7fb4d520b000, shm_size_=8340560
I0509 10:34:35.257800   853 shm_counter.cpp:153] mmemset success
I0509 10:34:35.257881   853 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/853, shm_fd=19, mmap_addr=0x7fb4d520b000
I0509 10:34:35.505298   868 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:35.513530   868 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:35.513626   868 shm_counter.cpp:70] tpid=868, table_name=knn_realtime
I0509 10:34:35.513657   868 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/868, shm_size_=8340560
I0509 10:34:35.513706   868 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:35.513723   868 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/868, shm_fd=21, mmap_addr=0x7fb4d480b000, shm_size_=8340560
I0509 10:34:35.519409   868 shm_counter.cpp:153] mmemset success
I0509 10:34:35.519487   868 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/868, shm_fd=21, mmap_addr=0x7fb4d480b000
I0509 10:34:37.302048   899 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:37.308457   899 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:37.308708   899 shm_counter.cpp:70] tpid=899, table_name=knn_realtime
I0509 10:34:37.308753   899 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/899, shm_size_=8340560
I0509 10:34:37.308815   899 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:37.308836   899 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/899, shm_fd=22, mmap_addr=0x7fb4d3e0b000, shm_size_=8340560
I0509 10:34:37.314566   899 shm_counter.cpp:153] mmemset success
I0509 10:34:37.314673   899 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/899, shm_fd=22, mmap_addr=0x7fb4d3e0b000
I0509 10:34:38.097653   872 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:38.097880   872 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:38.097911   872 shm_counter.cpp:70] tpid=872, table_name=knn_realtime
I0509 10:34:38.097932   872 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/872, shm_size_=8340560
I0509 10:34:38.097973   872 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:38.097998   872 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/872, shm_fd=24, mmap_addr=0x7fb4d340b000, shm_size_=8340560
I0509 10:34:38.103730   872 shm_counter.cpp:153] mmemset success
I0509 10:34:38.103808   872 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/872, shm_fd=24, mmap_addr=0x7fb4d340b000
I0509 10:34:41.335608   905 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:41.337450   905 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:41.337491   905 shm_counter.cpp:70] tpid=905, table_name=knn_realtime
I0509 10:34:41.337515   905 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/905, shm_size_=8340560
I0509 10:34:41.337570   905 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:41.337599   905 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/905, shm_fd=26, mmap_addr=0x7fb4d2a0b000, shm_size_=8340560
I0509 10:34:41.343400   905 shm_counter.cpp:153] mmemset success
I0509 10:34:41.343483   905 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/905, shm_fd=26, mmap_addr=0x7fb4d2a0b000
I0509 10:34:41.964365   906 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:41.965782   906 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:41.965902   906 shm_counter.cpp:70] tpid=906, table_name=knn_realtime
I0509 10:34:41.965938   906 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/906, shm_size_=8340560
I0509 10:34:41.965989   906 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:41.966127   906 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/906, shm_fd=28, mmap_addr=0x7fb4d200b000, shm_size_=8340560

The stderr shows:

*** Aborted at 1715222081 (unix time) try "date -d @1715222081" if you are using GNU date ***
PC: @     0x7fb55cc12720 __memset_sse2
*** SIGBUS (@0x7fb4d2062000) received by PID 35 (TID 0x7fb5049ff700) from PID 18446744072938201088; stack trace: ***
    @     0x7fb561e9d6d0 (unknown)
    @     0x7fb55cc12720 __memset_sse2
    @     0x7fb56ffd60b1 falcon::ShmCounterImpl::Attach()
    @     0x7fb56ffd6a17 falcon::ShmCounterImpl::ShmCounterImpl()
    @     0x7fb56ffd7382 __tls_init
    @     0x7fb56ffd74b2 falcon::ShmCounter::Set()
    @           0x58b2f7 KnnRecallServiceHandler::ProcessThriftFramedRequest()
    @     0x7fb56d946b31 brpc::policy::ProcessThriftFramedRequestNoExcept()
    @     0x7fb56d942849 brpc::policy::ProcessThriftRequest()
    @     0x7fb56d99f36a brpc::ProcessInputMessage()
    @     0x7fb56d9a03a8 brpc::InputMessenger::OnNewMessages()
    @     0x7fb56d9a7e6d brpc::Socket::ProcessEvent()
    @     0x7fb56d87fadf bthread::TaskGroup::task_runner()
    @     0x7fb56d88d761 bthread_make_fcontext

The kernel is 4.19.91-2048.3.miks7.x86_64. The processes run in containers within a single pod.


Solution

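  • It seems that /dev/shm runs out of space. Its size is 64M: shm 64M 0 64M 0% /dev/shm. The shared memory is initialized successfully 8 times, and the ninth attempt crashes: 8340560 * 8 / 1024 / 1024 ≈ 63.6M, so the next initialization exceeds the 64M limit. ftruncate only sets the file size and does not reserve tmpfs pages, so the shortage is not reported until memset touches a page that cannot be allocated, which raises SIGBUS.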