cc-ares

What's the source of this enigmatic *** buffer overflow detected ***: terminated error


I have a program which reads from a file which is a list of domain names. It performs asynchronous DNS and then downloads the landing page for each domain using an asynchronous epoll loop.

The program runs fine for thousands of iterations and then bombs out with a *** buffer overflow detected ***: terminated error. Here is the backtrace:

Program received signal SIGABRT, Aborted.
__pthread_kill_implementation (no_tid=0, signo=6, threadid=140737351415616) at pthread_kill.c:44
44  pthread_kill.c: No such file or directory.
(gdb) bt
#0  __pthread_kill_implementation (no_tid=0, signo=6, threadid=140737351415616) at pthread_kill.c:44
#1  __pthread_kill_internal (signo=6, threadid=140737351415616) at pthread_kill.c:80
#2  __GI___pthread_kill (threadid=140737351415616, signo=signo@entry=6) at pthread_kill.c:91
#3  0x00007ffff7db0476 in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26
#4  0x00007ffff7d967b7 in __GI_abort () at abort.c:79
#5  0x00007ffff7df75e6 in __libc_message (action=action@entry=do_abort, fmt=fmt@entry=0x7ffff7f48ef4 "*** %s ***: terminated\n") at ../sysdeps/posix/libc_fatal.c:155
#6  0x00007ffff7ea322a in __GI___fortify_fail (msg=msg@entry=0x7ffff7f48e9a "buffer overflow detected") at fortify_fail.c:26
#7  0x00007ffff7ea1b46 in __GI___chk_fail () at chk_fail.c:28
#8  0x00007ffff7ea316b in __fdelt_chk (d=<optimised out>) at fdelt_chk.c:25
#9  0x00007ffff7f97362 in ares_fds () from /lib/x86_64-linux-gnu/libcares.so.2
#10 0x000055555555682d in wait_ares (channel=0x555556bb32a0) at epoll_recv_with_async_dns.c:80
#11 0x000055555555773c in main (argc=2, argv=0x7fffffffe0a8) at epoll_recv_with_async_dns.c:303

As you can see the backtrace points to a call to ares_fds. The offending line of code is:

nfds = ares_fds(channel, &read_fds, &write_fds);

I fail to see how there is a buffer overflow in that line of code. Any ideas what I can do further to debug this and find and fix the problem. For those interested a minimal reproducer is here below:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/socket.h>
#include <resolv.h>
#include <sys/epoll.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <time.h>
#include <ares.h>
#include <netinet/in.h>
#include <netdb.h>
#include <stdarg.h>
#include <string.h>
#include <ctype.h>

#define MAXWAITING 1000 /* Max. number of parallel DNS queries */
#define MAXTRIES      3 /* Max. number of tries per domain */
#define DNSTIMEOUT    3000 /* Max. number of ms for first try */
#define SERVERS    "1.0.0.1,8.8.8.8" /* DNS server to use (Cloudflare & Google) */
#define MAXDOMAINS 8192
#define MAX_CONNECTIONS 8192
#define TIMEOUT 10000
int epfd;
int sockfd[MAX_CONNECTIONS];
struct epoll_event event[MAX_CONNECTIONS];
struct sockaddr_in dest[MAX_CONNECTIONS];
char resolved[MAXDOMAINS][254];
char ips[MAXDOMAINS][128];
int current = 0, active = 0, next = 0;
char servers[MAX_CONNECTIONS][128];
char domains[MAX_CONNECTIONS][254];
int i, num_ready, connections = 0, done = 0, total_bytes = 0, total_domains = 0, iterations = 0, count = 0;
static int nwaiting;

static void state_cb(void *data, int s, int read, int write)
{
    //printf("Change state fd %d read:%d write:%d\n", s, read, write);
}

static void callback(void *arg, int status, int timeouts, struct hostent *host)
{
    nwaiting--;

    if(!host || status != ARES_SUCCESS){
        //fprintf(stderr, "Failed to lookup %s\n", ares_strerror(status));
        return;
    }

    char ip[INET6_ADDRSTRLEN];

    if (host->h_addr_list[0] != NULL){
        inet_ntop(host->h_addrtype, host->h_addr_list[0], ip, sizeof(ip));
        strcpy(resolved[current], host->h_name);
        strcpy(ips[current], ip);
        if (current < MAXDOMAINS - 1) current++; else current = 0;
        active++;
        printf("active %d\r", active);
    }
}

static void wait_ares(ares_channel channel)
{
    struct timeval *tvp, tv;
    fd_set read_fds, write_fds;
    int nfds = 0;

    FD_ZERO(&read_fds);
    FD_ZERO(&write_fds);

    nfds = ares_fds(channel, &read_fds, &write_fds);
    
    if (nfds > 0) {
    tvp = ares_timeout(channel, NULL, &tv);
        select(nfds, &read_fds, &write_fds, NULL, tvp);
        ares_process(channel, &read_fds, &write_fds);
    }     
}
                
int main(int argc, char *argv[]) {
        
    sigaction(SIGPIPE, &(struct sigaction){SIG_IGN}, NULL);
    FILE * fp;
    char domain[128];
    size_t len = 0;
    ssize_t read;
    ares_channel channel;
    int status, dns_done = 0;
    int optmask;
    
    status = ares_library_init(ARES_LIB_INIT_ALL);
    if (status != ARES_SUCCESS) {
        printf("ares_library_init: %s\n", ares_strerror(status));
        return 1;
    }

    struct ares_options options = {
        .timeout = DNSTIMEOUT,     /* set first query timeout */
        .tries = MAXTRIES       /* set max. number of tries */
    };
    optmask = ARES_OPT_TIMEOUTMS | ARES_OPT_TRIES;

    status = ares_init_options(&channel, &options, optmask);
    if (status != ARES_SUCCESS) {
        printf("ares_init_options: %s\n", ares_strerror(status));
        return 1;
    }

    status = ares_set_servers_csv(channel, SERVERS);
    if (status != ARES_SUCCESS) {
        printf("ares_set_servers_csv: %s\n", ares_strerror(status));
        return 1;
    }
    
    fp = fopen(argv[1], "r");
    if (!fp)
        exit(EXIT_FAILURE);

    do{
        if (nwaiting >= MAXWAITING || dns_done) {
            do {
                wait_ares(channel);
                
            } while (nwaiting > MAXWAITING);
        }
        if (!dns_done) {
            if (fscanf(fp, "%128s", domain) == 1) {
                ares_gethostbyname(channel, domain, AF_INET, callback, NULL);
                nwaiting++;
            } else {
                dns_done = 1;
            }
        }
    } while (active < MAX_CONNECTIONS);
    
    /*---Open sockets for streaming---*/
    for (i = 0; i < MAX_CONNECTIONS; i++)
    { 
        if ( (sockfd[i] = socket(AF_INET, SOCK_STREAM|SOCK_NONBLOCK, 0)) < 0 ) {
            perror("Socket");
            exit(errno);
        }
        count++;
    }

    while (1)
    {
        /*---Do async DNS---*/
        while (/*active < MAXDOMAINS &&*/ nwaiting > 0) {
            //printf("active = %d MAXDOMAINS = %d nwaiting = %d MAXWAITING = %d\n", active, MAXDOMAINS, nwaiting, MAXWAITING);
            if (nwaiting >= MAXWAITING || dns_done) {
                do {
                    wait_ares(channel);
                } while (nwaiting > MAXWAITING);
            }
            if (!dns_done) {
                if (fscanf(fp, "%127s", domain) == 1) {
                    ares_gethostbyname(channel, domain, AF_INET, callback, NULL);
                    nwaiting++;
                } else {
                    dns_done = 1;
                }
            }
        } //while (active < MAXDOMAINS);
        
        if (done && count == 0) break;
    }
    ares_destroy(channel);
    ares_library_cleanup();
    fclose(fp);
    printf("\nFinished without errors\n");
    return 0;
}

The abort doesn't happen if I comment out the section which creates the sockets:

 /*---Open sockets for streaming---*/
    for (i = 0; i < MAX_CONNECTIONS; i++)
    { 
        if ( (sockfd[i] = socket(AF_INET, SOCK_STREAM|SOCK_NONBLOCK, 0)) < 0 ) {
            perror("Socket");
            exit(errno);
        }
        count++;
    }

So whatever the problem is it is related to the fact that I have a number of sockets file descriptors. Any ideas?

FURTHER EDIT:

Further debugging seems to show the problem is related to the number of sockets opened. If I reduce the number of sockets created to 1017 the abort no longer happens. While if I create 1018 sockets the program aborts.


Solution

  • It looks like this might be the root cause:

    https://c-ares.org/mail/c-ares-archive-2017-08/0002.shtml

    >>> The stack trace is shown as above.
    >>>
    >>> /(gdb) bt/
    >>> /#0 0x00007f959c01ac37 in __GI_raise (sig=sig_at_entry=6) at
    >>> ../nptl/sysdeps/unix/sysv/linux/raise.c:56/
    >>> /#1 0x00007f959c01e028 in __GI_abort () at abort.c:89/
    >>> /#2 0x00007f959c0572a4 in __libc_message
    >>> (do_abort=do_abort_at_entry=2, fmt=fmt_at_entry=0x7f959c166d70 "*** %s
    >>> ***: %s terminated\n")/
    >>> / at ../sysdeps/posix/libc_fatal.c:175/
    >>> /#3 0x00007f959c0f283c in __GI___fortify_fail (msg=<optimized out>,
    >>> msg_at_entry=0x7f959c166d07 "buffer overflow detected") at
    >>> fortify_fail.c:38/
    >>> /#4 0x00007f959c0f1710 in __GI___chk_fail () at chk_fail.c:28/
    >>> /#5 0x00007f959c0f2787 in __fdelt_chk (d=<optimized out>) at
    >>> fdelt_chk.c:25/
    >>> /#6 0x00007f959c6b69ad in ares_fds () from
    >>> /usr/local/multiplier/system/libs/libcares.so.2/
    >>> /#7 0x000000000040b448 in rec_c_ares_execute () at
    >>> /home/necs/dev/apat/source/recorder/recdns.c:157/
    >>> /#8 0x00000000004052f2 in rec_main_thread (data=0x0) at
    >>> /home/necs/dev/apat/source/recorder/rec.c:772/
    >>> /#9 0x0000000000403de1 in main (argc=7, argv=0x7fff58cde398) at
    >>> /home/necs/dev/apat/source/recorder/main.c:129/
    >> ...
    

    You are either crossing FD_SETSIZE limit, or have negative number of fds. Glibc checks this internally and causes crash if check will fail: https://github.com/lattera/glibc/blob/master/debug/fdelt_chk.c

    Daniel Received on 2017-08-01

    Since I'm not sure what platform you're on, I can't recommend a good way to inspect the value before calling ares_fds(), besides keeping track of the previous nfds (the return value immediately before the failure).