The scenario is one process writes the program metric stats data to shared memory file.And another process will read the file and send the data to metric server. The bus error and core dump occurs in several minutes after the writer process started trying to memset the shared memroy. The simpilfied code is:
Init() {
tpid_ = get_thread_pid();
table_name_ = executable_name();
max_counter_num_ = 1024;
MI << "tpid=" << tpid_ << ", table_name=" << table_name_;
shm_root_ = "/dev/shm/counters";
shm_dir_ = shm_root_ + "/" + table_name_;
char buf[1024];
sprintf(buf, "/counters/%s/%d", table_name_.c_str(), tpid_);
shm_file_ = buf;
table_size_ = max_counter_num_ * sizeof(Entry) + sizeof(Table);
shm_size_ = table_size_;
MI << "shm_file_=" << shm_file_ << ", shm_size_=" << shm_size_;
table_ = NULL;
entries_ = NULL;
err_ = kOk;
errno_ = 0;
Attach();
}
int Attach() {
int err;
// only one shm file for a given thread
int oflag = O_CREAT | O_RDWR | O_EXCL;
int dir_mode = S_IRWXO | S_IRWXU | S_IRWXG;
int file_mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
umask(0);
err = mkdir(shm_root_.c_str(), dir_mode);
if (err && errno != EEXIST) {
MW << "dir=" << shm_root_ << ", mkdir,err=" << errno << ":" << strerror(errno);
err_ = kSysMkdir;
errno_ = errno;
return kSysMkdir;
}
err = mkdir(shm_dir_.c_str(), dir_mode);
if (err && errno != EEXIST) {
MW << "dir=" << shm_dir_ << ", mkdir,err=" << errno << ":" << strerror(errno);
err_ = kSysMkdir;
errno_ = errno;
return kSysMkdir;
}
shm_fd_ = shm_open(shm_file_.c_str(), oflag, file_mode);
if (shm_fd_ == -1) {
MW << "file=" << shm_file_ << ", shm_open,err=" << errno << ":" << strerror(errno);
err_ = kSysShmOpen;
errno_ = errno;
return kSysShmOpen;
}
err = ftruncate(shm_fd_, shm_size_);
if (err == -1) {
err_ = kSysFtruncate;
errno_ = errno;
MW << "file=" << shm_file_ << ", shm_fd=" << shm_fd_ << ", ftruncate,err=" << errno << ":" << strerror(errno);
return kSysFtruncate;
}
MI << "ftruncate success, return:" << err;
int proto = PROT_READ | PROT_WRITE;
int flags = MAP_SHARED;
void *addr = mmap(NULL, shm_size_, proto, flags, shm_fd_, 0);
if (addr == MAP_FAILED) {
MW << "file=" << shm_file_ << ", shm_fd=" << shm_fd_ << ", mmap,err=" << errno << ":" << strerror(errno);
err_ = kSysMmap;
errno_ = errno;
return kSysMmap;
}
MI << "mmap success shm_file=" << shm_file_ << ", shm_fd=" << shm_fd_ << ", mmap_addr=" << addr << ", shm_size_=" << shm_size_;
memset(addr, 0, shm_size_);
MI << "mmemset success";
table_ = (Table *)addr;
entries_ = (Entry *)((char *)addr + sizeof(Table));
table_->n_entry = 0;
table_->max_entry_num = max_counter_num_;
table_->last_timestamp = time(NULL);
gethostname(table_->endpoint, MAX_HOSTNAME_LEN);
MI << "Attach success shm_file=" << shm_file_ << ", shm_fd=" << shm_fd_ << ", mmap_addr=" << addr;
return 0;
}
memset in Attach method trigger the bus error after running several times. The log is:
I0509 10:34:27.081708 920 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:27.081895 920 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:27.081923 920 shm_counter.cpp:70] tpid=920, table_name=knn_realtime
I0509 10:34:27.081974 920 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/920, shm_size_=8340560
I0509 10:34:27.082068 920 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:27.082094 920 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/920, shm_fd=12, mmap_addr=0x7fb4d700b000, shm_size_=8340560
I0509 10:34:27.087743 920 shm_counter.cpp:153] mmemset success
I0509 10:34:27.087836 920 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/920, shm_fd=12, mmap_addr=0x7fb4d700b000
I0509 10:34:29.168845 867 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:29.189066 867 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:29.192279 867 shm_counter.cpp:70] tpid=867, table_name=knn_realtime
I0509 10:34:29.192375 867 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/867, shm_size_=8340560
I0509 10:34:29.192492 867 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:29.192528 867 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/867, shm_fd=14, mmap_addr=0x7fb4d660b000, shm_size_=8340560
I0509 10:34:29.198365 867 shm_counter.cpp:153] mmemset success
I0509 10:34:29.198501 867 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/867, shm_fd=14, mmap_addr=0x7fb4d660b000
I0509 10:34:33.079138 923 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:33.079949 923 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:33.080663 923 shm_counter.cpp:70] tpid=923, table_name=knn_realtime
I0509 10:34:33.080742 923 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/923, shm_size_=8340560
I0509 10:34:33.080799 923 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:33.080823 923 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/923, shm_fd=17, mmap_addr=0x7fb4d5c0b000, shm_size_=8340560
I0509 10:34:33.086556 923 shm_counter.cpp:153] mmemset success
I0509 10:34:33.086634 923 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/923, shm_fd=17, mmap_addr=0x7fb4d5c0b000
I0509 10:34:35.251169 853 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:35.251964 853 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:35.252004 853 shm_counter.cpp:70] tpid=853, table_name=knn_realtime
I0509 10:34:35.252027 853 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/853, shm_size_=8340560
I0509 10:34:35.252079 853 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:35.252110 853 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/853, shm_fd=19, mmap_addr=0x7fb4d520b000, shm_size_=8340560
I0509 10:34:35.257800 853 shm_counter.cpp:153] mmemset success
I0509 10:34:35.257881 853 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/853, shm_fd=19, mmap_addr=0x7fb4d520b000
I0509 10:34:35.505298 868 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:35.513530 868 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:35.513626 868 shm_counter.cpp:70] tpid=868, table_name=knn_realtime
I0509 10:34:35.513657 868 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/868, shm_size_=8340560
I0509 10:34:35.513706 868 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:35.513723 868 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/868, shm_fd=21, mmap_addr=0x7fb4d480b000, shm_size_=8340560
I0509 10:34:35.519409 868 shm_counter.cpp:153] mmemset success
I0509 10:34:35.519487 868 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/868, shm_fd=21, mmap_addr=0x7fb4d480b000
I0509 10:34:37.302048 899 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:37.308457 899 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:37.308708 899 shm_counter.cpp:70] tpid=899, table_name=knn_realtime
I0509 10:34:37.308753 899 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/899, shm_size_=8340560
I0509 10:34:37.308815 899 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:37.308836 899 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/899, shm_fd=22, mmap_addr=0x7fb4d3e0b000, shm_size_=8340560
I0509 10:34:37.314566 899 shm_counter.cpp:153] mmemset success
I0509 10:34:37.314673 899 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/899, shm_fd=22, mmap_addr=0x7fb4d3e0b000
I0509 10:34:38.097653 872 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:38.097880 872 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:38.097911 872 shm_counter.cpp:70] tpid=872, table_name=knn_realtime
I0509 10:34:38.097932 872 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/872, shm_size_=8340560
I0509 10:34:38.097973 872 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:38.097998 872 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/872, shm_fd=24, mmap_addr=0x7fb4d340b000, shm_size_=8340560
I0509 10:34:38.103730 872 shm_counter.cpp:153] mmemset success
I0509 10:34:38.103808 872 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/872, shm_fd=24, mmap_addr=0x7fb4d340b000
I0509 10:34:41.335608 905 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:41.337450 905 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:41.337491 905 shm_counter.cpp:70] tpid=905, table_name=knn_realtime
I0509 10:34:41.337515 905 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/905, shm_size_=8340560
I0509 10:34:41.337570 905 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:41.337599 905 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/905, shm_fd=26, mmap_addr=0x7fb4d2a0b000, shm_size_=8340560
I0509 10:34:41.343400 905 shm_counter.cpp:153] mmemset success
I0509 10:34:41.343483 905 shm_counter.cpp:161] Attach success shm_file=/counters/knn_realtime/905, shm_fd=26, mmap_addr=0x7fb4d2a0b000
I0509 10:34:41.964365 906 shm_counter.cpp:41] readlink(/proc/self/exe)=/home/work/bin/knn-realtime/bin/knn_realtime
I0509 10:34:41.965782 906 shm_counter.cpp:52] executable_name=knn_realtime
I0509 10:34:41.965902 906 shm_counter.cpp:70] tpid=906, table_name=knn_realtime
I0509 10:34:41.965938 906 shm_counter.cpp:79] shm_file_=/counters/knn_realtime/906, shm_size_=8340560
I0509 10:34:41.965989 906 shm_counter.cpp:138] ftruncate success, return:0
I0509 10:34:41.966127 906 shm_counter.cpp:151] mmap success shm_file=/counters/knn_realtime/906, shm_fd=28, mmap_addr=0x7fb4d200b000, shm_size_=8340560
The stderr shows:
*** Aborted at 1715222081 (unix time) try "date -d @1715222081" if you are using GNU date ***
PC: @ 0x7fb55cc12720 __memset_sse2
*** SIGBUS (@0x7fb4d2062000) received by PID 35 (TID 0x7fb5049ff700) from PID 18446744072938201088; stack trace: ***
@ 0x7fb561e9d6d0 (unknown)
@ 0x7fb55cc12720 __memset_sse2
@ 0x7fb56ffd60b1 falcon::ShmCounterImpl::Attach()
@ 0x7fb56ffd6a17 falcon::ShmCounterImpl::ShmCounterImpl()
@ 0x7fb56ffd7382 __tls_init
@ 0x7fb56ffd74b2 falcon::ShmCounter::Set()
@ 0x58b2f7 KnnRecallServiceHandler::ProcessThriftFramedRequest()
@ 0x7fb56d946b31 brpc::policy::ProcessThriftFramedRequestNoExcept()
@ 0x7fb56d942849 brpc::policy::ProcessThriftRequest()
@ 0x7fb56d99f36a brpc::ProcessInputMessage()
@ 0x7fb56d9a03a8 brpc::InputMessenger::OnNewMessages()
@ 0x7fb56d9a7e6d brpc::Socket::ProcessEvent()
@ 0x7fb56d87fadf bthread::TaskGroup::task_runner()
@ 0x7fb56d88d761 bthread_make_fcontext
The system is 4.19.91-2048.3.miks7.x86_64. The processes running in containers and in one pod.
It seem that the /dev/shm runs out of space: shm 64M 0 64M 0% /dev/shm The shm init 8 times, and the ninth crashes. 8340560 * 8 /1024/1024=63M.The next init will exceed 64M limit.