For some reason GDB stops running while stepping through the daemon I have developed, and I can't find what is wrong. The problem started after some code changes had been made and a Segmentation Fault error appeared. In order to track down the error, I ran the daemon under gdb, like I always do in such cases, but this time, my bad code seems to be crashing the GDB itself. What could be the problem, and how should I proceed in this case?
I attach the trace of the execution from the function that has been changed:
Breakpoint 1, serve_outlinks_stage1 (conn_id=31, job_idx=241) at dependency.c:320
320 job_data=jobs[job_idx].data;
(gdb) step
322 fl_iocbs_top--;
(gdb)
323 if (fl_iocbs_top==0) {
(gdb)
327 iocb_idx=fl_iocbs[fl_iocbs_top];
(gdb)
328 memset(&iocbs[iocb_idx],0,sizeof(struct iocb));
(gdb)
329 iocb_ptrs[num_iocb_submits]=&iocbs[iocb_idx];
(gdb)
330 num_iocb_submits++;
(gdb)
331 io_prep_pread(&iocbs[iocb_idx],company_infos[cliconns[conn_id].company_idx].fd_dependencies,&jobs[job_idx].aux.outlinks_id,sizeof(uint),sizeof(t_dependency_t)*job_data->dep_id+offsetof(t_dependency_t,tail_outlinks_id));
(gdb)
io_prep_pread (iocb=0x620020 <iocbs+1216>, fd=20, buf=0x68f300 <jobs+13536>, count=4, offset=274) at /usr/include/libaio.h:173
173 memset(iocb, 0, sizeof(*iocb));
(gdb)
174 iocb->aio_fildes = fd;
(gdb)
175 iocb->aio_lio_opcode = IO_CMD_PREAD;
(gdb)
176 iocb->aio_reqprio = 0;
(gdb)
177 iocb->u.c.buf = buf;
(gdb)
178 iocb->u.c.nbytes = count;
(gdb)
179 iocb->u.c.offset = offset;
(gdb)
180 }
(gdb)
serve_outlinks_stage1 (conn_id=31, job_idx=241) at dependency.c:332
332 callback=iocb_idx;
(gdb)
333 io_set_callback(&iocbs[iocb_idx], (io_callback_t) callback);
(gdb)
io_set_callback (iocb=0x620020 <iocbs+1216>, cb=0x13) at /usr/include/libaio.h:168
168 iocb->data = (void *)cb;
(gdb)
169 }
(gdb)
serve_outlinks_stage1 (conn_id=31, job_idx=241) at dependency.c:334
334 aio_infos[iocb_idx].job_idx=job_idx;
(gdb)
335 aio_infos[iocb_idx].conn_id=conn_id;
(gdb)
336 aio_infos[iocb_idx].op_code=AIO_OP_READ_DEPENDENCY_OUTLINKS_ID;
(gdb)
337 jobs[job_idx].pending_ops++;
(gdb)
338 }
(gdb)
process_command (conn_id=31, job_idx=241) at depserv.c:493
493 break;
(gdb)
565 }
(gdb)
main_event_loop () at depserv.c:1087
1087 _assign_job(job_idx);
(gdb)
jobs_top=240, job_idx[jobs_top]=240
1088 memset(&jobs[job_idx],0,sizeof(jobs[job_idx]));
(gdb)
1089 jobs[job_idx].conn_id=conn_id;
(gdb)
1090 jobs[job_idx].company_idx=cliconns[conn_id].company_idx;
(gdb)
1075 while(count>0) {
(gdb)
1078 count = read (infd, &jobs[job_idx].dscmd, sizeof(dscmd_t));
(gdb)
1079 printf("count=%zd\n",count);
(gdb)
count=-1
1080 if (count>0) {
(gdb)
1098 if (count==0) {
(gdb)
1101 break;
(gdb)
1049 for (i = 0; i < n; i++) {
(gdb)
1108 } // while
(gdb)
982 usleep(2000); // for debugging because we need pending_aio_submits to be valid, should be removed for production
(gdb)
983 check_aio();
(gdb)
check_aio () at depserv.c:807
807 num_events = io_getevents(io_ctx, 0, MAX_IO_EVENTS, aio_events, NULL);
(gdb)
808 if (num_events<0) {
(gdb)
811 for(i=0;i<num_events;i++) {
(gdb)
843 if (num_iocb_submits>0) {
(gdb)
845 write_cycle();
(gdb)
write_cycle () at depserv.c:225
225 for(i=0,j=0;i<num_iocb_submits;i++) {
(gdb)
226 callback=(long) iocb_ptrs[i]->data;
(gdb)
227 iocb_idx=callback;
(gdb)
228 switch(aio_infos[iocb_idx].op_code) {
(gdb)
225 for(i=0,j=0;i<num_iocb_submits;i++) {
(gdb)
245 if (!j) return;
(gdb)
269 }
(gdb)
check_aio () at depserv.c:846
846 for(i=0;i<num_iocb_submits;i++) {
(gdb)
847 ret = io_submit(io_ctx, 1, &iocb_ptrs[i]);
(gdb)
848 if (ret<0) printf("bad iosubmit ret=%d\n",ret);
(gdb)
846 for(i=0;i<num_iocb_submits;i++) {
(gdb)
850 pending_aio_submits=pending_aio_submits+1;
(gdb)
856 num_iocb_submits=0;
(gdb)
858 }
(gdb)
main_event_loop () at depserv.c:985
985 n=MAX_IOCBS-fl_iocbs_top;
(gdb)
986 if (pending_aio_submits>n) {
(gdb)
990 if (recvfrom(identityd_socket, udp_buf, MAX_UDP_PACKET_SIZE, 0, (struct sockaddr*) &remote_addr, &slen)==-1) {
(gdb)
995 if (!announced) continue;
(gdb)
997 n = epoll_wait(listening_efd, listening_events, MAX_EPOLL_EVENTS_LISTEN, 1);
(gdb)
1000 for (i = 0; i < n; i++) {
(gdb)
1048 n = epoll_wait(accepted_efd, accepted_events, MAX_EPOLL_EVENTS_ACCEPTED, 1);
(gdb)
1049 for (i = 0; i < n; i++) {
(gdb)
1108 } // while
(gdb)
982 usleep(2000); // for debugging because we need pending_aio_submits to be valid, should be removed for production
(gdb)
983 check_aio();
(gdb)
check_aio () at depserv.c:807
807 num_events = io_getevents(io_ctx, 0, MAX_IO_EVENTS, aio_events, NULL);
(gdb)
808 if (num_events<0) {
(gdb)
811 for(i=0;i<num_events;i++) {
(gdb)
812 pending_aio_submits--;
(gdb)
813 iocb_ptr=aio_events[i].obj;
(gdb)
814 callback=(long) iocb_ptr->data;
(gdb)
815 iocb_idx=callback;
(gdb)
816 op_code=aio_infos[iocb_idx].op_code;
(gdb)
817 job_idx=aio_infos[iocb_idx].job_idx;
(gdb)
818 if (job_idx!=MAX_JOBS) {
(gdb)
819 jobs[job_idx].pending_ops--;
(gdb)
821 func=aio_op_funcs[op_code];
(gdb)
822 if (func==0) {
(gdb)
826 func(iocb_ptr,aio_events[i].res);
(gdb)
serve_outlinks_stage2 (iocb_ptr=0x620020 <iocbs+1216>, res=4) at dependency.c:346
346 if (res!=iocb_ptr->u.c.nbytes) { /// error
(gdb)
350 callback=(long) iocb_ptr->data;
(gdb)
351 iocb_idx=callback;
(gdb)
352 job_idx=aio_infos[iocb_idx].job_idx;
(gdb)
353 conn_id=aio_infos[iocb_idx].conn_id;
(gdb)
355 outlinks_id=jobs[job_idx].aux.outlinks_id;
(gdb)
356 job_data=(job_read_outlinks_t*) malloc(sizeof(job_read_outlinks_t));
(gdb)
357 if (!job_data) {
(gdb)
361 memset(&job_data,0,sizeof(job_read_outlinks_t));
(gdb) print outlinks_id
$1 = 2
(gdb) step
362 jobs[job_idx].data=job_data;
(gdb)
364 fl_iocbs_top--;
(gdb)
365 if (fl_iocbs_top==0) {
(gdb)
369 iocb_idx=fl_iocbs[fl_iocbs_top];
(gdb)
370 memset(&iocbs[iocb_idx],0,sizeof(struct iocb));
(gdb)
371 iocb_ptrs[num_iocb_submits]=&iocbs[iocb_idx];
(gdb)
372 num_iocb_submits++;
(gdb)
373 io_prep_pread(&iocbs[iocb_idx],company_infos[cliconns[conn_id].company_idx].fd_outlinks,&job_data->r_outlinks,sizeof(t_outlinks_t),sizeof(t_outlinks_t)*outlinks_id);
(gdb)
io_prep_pread (iocb=0x61ffe0 <iocbs+1152>, fd=0, buf=0x10, count=32, offset=0) at /usr/include/libaio.h:173
173 memset(iocb, 0, sizeof(*iocb));
(gdb)
174 iocb->aio_fildes = fd;
(gdb)
175 iocb->aio_lio_opcode = IO_CMD_PREAD;
(gdb)
176 iocb->aio_reqprio = 0;
(gdb)
177 iocb->u.c.buf = buf;
(gdb)
178 iocb->u.c.nbytes = count;
(gdb)
179 iocb->u.c.offset = offset;
(gdb)
180 }
(gdb)
serve_outlinks_stage2 (iocb_ptr=0x620020 <iocbs+1216>, res=4) at dependency.c:374
374 callback=iocb_idx;
(gdb)
375 io_set_callback(&iocbs[iocb_idx], (io_callback_t) callback);
(gdb)
io_set_callback (iocb=0x61ffe0 <iocbs+1152>, cb=0x12) at /usr/include/libaio.h:168
168 iocb->data = (void *)cb;
(gdb)
169 }
(gdb)
serve_outlinks_stage2 (iocb_ptr=0x620020 <iocbs+1216>, res=4) at dependency.c:376
376 aio_infos[iocb_idx].job_idx=job_idx;
(gdb)
377 aio_infos[iocb_idx].conn_id=conn_id;
(gdb)
378 aio_infos[iocb_idx].op_code=AIO_OP_READ_OUTLINKS;
(gdb)
379 jobs[job_idx].pending_ops++;
(gdb)
380 return(0);
(gdb)
381 }
(gdb)
Warning:
Cannot insert breakpoint 0.
Cannot access memory at address 0x0
0x0000000000000000 in ?? ()
(gdb)
Cannot find bounds of current function
(gdb)
On line 826 I am calling a function dynamically (through a function pointer); maybe that has something to do with it?
my bad code seems to be crashing the GDB itself.
No, it does not. This:
381 }
(gdb)
Warning:
Cannot insert breakpoint 0.
Cannot access memory at address 0x0
0x0000000000000000 in ?? ()
(gdb)
Cannot find bounds of current function
usually means that your program has jumped to location 0, and GDB can't set an internal breakpoint for the `step` command.
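The most probable cause of such a "return to 0" is stack corruption: you've overwritten your return address with 0. In your trace, a likely culprit is line 361, `memset(&job_data,0,sizeof(job_read_outlinks_t));`: it zeroes memory starting at the local pointer variable `job_data` itself rather than at the object it points to, and the very next io_prep_pread call is entered with buf=0x10 and offset=0 even though outlinks_id had just printed as 2.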
You can verify this by using `run` instead of stepping through the program. If `run` also terminates like this:
Program received signal SIGSEGV, Segmentation fault.
0x0000000000000000 in ?? ()
then my guess is confirmed. So what can you do to catch this bug?
Let's use an example:
#include <string.h>
int foo() /* Demo only: deliberately corrupts its own stack frame so the crash can be reproduced. */
{
char buf[1];
memset(buf, 0, 1024); /* intentional overflow: 1024 zero bytes written into a 1-byte buffer */
} /* note: no return statement even though foo is declared to return int */
int main()
{
return foo(); /* call site whose return address (main+14 in the transcripts) gets zeroed by foo */
}
First we step into `foo`:
(gdb) b foo
Breakpoint 1 at 0x400535: file t.c, line 6.
(gdb) r
Starting program: /tmp/a.out
Breakpoint 1, foo () at t.c:6
6 memset(buf, 0, 1024);
Next we confirm that our (return) stack is still intact:
(gdb) bt
#0 foo () at t.c:6
#1 0x000000000040055b in main () at t.c:11
Now we need to find the location on the stack where the return address is stored:
(gdb) disas
Dump of assembler code for function foo:
0x000000000040052d <+0>: push %rbp
0x000000000040052e <+1>: mov %rsp,%rbp
0x0000000000400531 <+4>: sub $0x10,%rsp
=> 0x0000000000400535 <+8>: lea -0x1(%rbp),%rax
0x0000000000400539 <+12>: mov $0x400,%edx
0x000000000040053e <+17>: mov $0x0,%esi
0x0000000000400543 <+22>: mov %rax,%rdi
0x0000000000400546 <+25>: callq 0x400410 <memset@plt>
0x000000000040054b <+30>: leaveq
0x000000000040054c <+31>: retq
End of assembler dump.
This tells us that the return address will be at `$rbp+8`, and indeed we find it there:
(gdb) x/a $rbp+8
0x7fffffffe2b8: 0x40055b <main+14>
Next we set a watchpoint on location `0x7fffffffe2b8`, so GDB will stop when that location is overwritten:
(gdb) watch *(int**)0x7fffffffe2b8
Hardware watchpoint 2: *(int**)0x7fffffffe2b8
Finally we continue:
(gdb) c
Continuing.
Hardware watchpoint 2: *(int**)0x7fffffffe2b8
Old value = (int *) 0x40055b <main+14>
New value = (int *) 0x0
memset () at ../sysdeps/x86_64/memset.S:79
79 ../sysdeps/x86_64/memset.S: No such file or directory.
And now we are stopped at the exact place where the stack buffer overflow caused us to "forget" the return address. Using `bt` confirms that the stack is now damaged (`main` no longer appears in the backtrace):
(gdb) bt
#0 memset () at ../sysdeps/x86_64/memset.S:79
#1 0x000000000040054b in foo () at t.c:6
#2 0x0000000000000000 in ?? ()
Finally, let's see if stepping through this program will produce the same result as your original does.
(gdb) r
Starting program: /tmp/a.out
Breakpoint 1, foo () at t.c:6
6 memset(buf, 0, 1024);
(gdb) n
7 }
(gdb) s
Warning:
Cannot insert breakpoint 0.
Cannot access memory at address 0x0
0x0000000000000000 in ?? ()
Yes, it does. QED.