cparent-childwaitpid

Why does wait(NULL) return the PID of a child that has not finished?


I want my loop to detect either that a child process has finished or that a child has sent a message on the queue. It works if a child sends a message and terminates immediately, but what if I want the child to send a message and then keep doing other things? Below are the version of my code that works and the variant I am working on.

master.c (does not work: wait() returns the children's PIDs even though they have not finished their loop):

#include "config.h"


/*
 * master: forks SO_USERS_NUM children, each of which execs CHILD_NAME with a
 * sequential id (1, 2, ...) as argv[1], then reaps the children with wait()
 * and prints the pid of each one as it terminates.
 *
 * Returns 0 normally, EXIT_FAILURE if the shared counter cannot be set up.
 */
int main(int argc, char const *argv[]) {
    (void)argc;
    (void)argv;

    /* shared counter holding the id handed to the most recent child */
    int sh_id = shmget(IPC_PRIVATE, sizeof(int), 0644 | IPC_CREAT);
    if (sh_id == -1) {
        perror("shmget");
        return EXIT_FAILURE;
    }
    int *sh_value = shmat(sh_id, NULL, 0);
    /* mark the segment for removal right away, whether or not the attach worked */
    shmctl(sh_id, IPC_RMID, NULL);
    if (sh_value == (void *)-1) {
        perror("shmat");
        return EXIT_FAILURE;
    }

    *sh_value = 0;

#ifndef SO_USERS_NUM
    printf("Error in main:  SO_USERS_NUM not defined");
#endif
    /*
        Create the users: main forks SO_USERS_NUM times, as set in config.h.
        Each child execs CHILD_NAME with an argument vector carrying its id.
    */
    /* make sure the queue exists before any child tries to use it */
    if (msgget(KEY_QUEUE, IPC_CREAT | 0600) == -1) {
        perror("msgget");
    }

    for (int i = 0; i < SO_USERS_NUM; i++)
    {
        /* bump the id in the parent, before forking, so that concurrently
           running children never race on the counter */
        *sh_value += 1;

        switch (fork())
        {
        case 0: {
#ifdef CHILD_NAME
            /* real storage for the id text: formatting through an
               uninitialized pointer is undefined behavior and typically
               kills the child before it can exec */
            char id_text[16];
            char *args[] = { CHILD_NAME, id_text, NULL };
            char *envp[] = { NULL };    /* empty environment, spelled portably */

            snprintf(id_text, sizeof id_text, "%d", *sh_value);
            execve(CHILD_NAME, args, envp);
            perror("execve");           /* only reached when the exec failed */
#endif
            /* _exit: do not flush stdio buffers inherited from the parent */
            _exit(EXIT_FAILURE);
        }
        case -1:
            perror("Error in main: fork");
            break;
        default:
            break;
        }
    }

    /* wait() returns a pid only for a child that has actually terminated */
    pid_t k_pid;
    while ((k_pid = wait(NULL)) > 0) {
        printf("\nfiglio: %d", (int)k_pid);
    }

    return 0;
}

master.c (works: it waits forever, as expected, when I put the infinite loop in the user):

#include "config.h"


/*
 * master (blocking variant): forks SO_USERS_NUM children that exec CHILD_NAME,
 * then loops forever, alternating a blocking wait() for a terminated child
 * with a blocking msgrcv() for a type-3 message on the KEY_QUEUE queue.
 *
 * Never returns normally; returns EXIT_FAILURE if IPC setup fails.
 */
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    struct msgUser mt_msg;

    /* shared counter holding the id handed to the most recent child */
    int sh_id = shmget(IPC_PRIVATE, sizeof(int), 0644 | IPC_CREAT);
    if (sh_id == -1) {
        perror("shmget");
        return EXIT_FAILURE;
    }
    int *sh_value = shmat(sh_id, NULL, 0);
    /* mark the segment for removal right away, whether or not the attach worked */
    shmctl(sh_id, IPC_RMID, NULL);
    if (sh_value == (void *)-1) {
        perror("shmat");
        return EXIT_FAILURE;
    }

    *sh_value = 0;

#ifndef SO_USERS_NUM
    printf("Error in main:  SO_USERS_NUM not defined");
#endif
    /*
        Create the users: main forks SO_USERS_NUM times, as set in config.h.
        Each child execs CHILD_NAME with an argument vector carrying its id.
    */
    int q_id = msgget(KEY_QUEUE, IPC_CREAT | 0600);
    if (q_id == -1) {
        perror("msgget");
        return EXIT_FAILURE;
    }

    for (int i = 0; i < SO_USERS_NUM; i++)
    {
        /* bump the id in the parent, before forking, so that concurrently
           running children never race on the counter */
        *sh_value += 1;

        switch (fork())
        {
        case 0: {
#ifdef CHILD_NAME
            /* real storage for the id text: formatting through an
               uninitialized pointer is undefined behavior */
            char id_text[16];
            char *args[] = { CHILD_NAME, id_text, NULL };
            char *envp[] = { NULL };    /* empty environment, spelled portably */

            snprintf(id_text, sizeof id_text, "%d", *sh_value);
            execve(CHILD_NAME, args, envp);
            perror("execve");           /* only reached when the exec failed */
#endif
            /* _exit: do not flush stdio buffers inherited from the parent */
            _exit(EXIT_FAILURE);
        }
        case -1:
            perror("Error in main: fork");
            break;
        default:
            break;
        }
    }

    /* the size given to msgrcv counts only the bytes after mtype; asking for
       more than the struct holds lets the kernel write past its end */
    const size_t payload = sizeof(mt_msg) - sizeof(mt_msg.mtype);

    while (1) {
        for (int i = 0; i < SO_USERS_NUM; i++)
        {
            /* blocks until some child terminates (-1 once none are left) */
            pid_t done = wait(NULL);
            printf("\nHo finito di aspettare:%d  sono il padre: %d\n",
                   (int)done, (int)getpid());

            /* now receiving the message; MSG_NOERROR truncates a message
               larger than the payload instead of failing with E2BIG */
            ssize_t num_bytes = msgrcv(q_id, &mt_msg, payload, 3, MSG_NOERROR);

            if (num_bytes >= 0 && (size_t)num_bytes >= sizeof(mt_msg.numero)) {
                /* received a message long enough to carry numero */
                printf("\nmessaggio ricevuto: %d \n", mt_msg.numero);
            }
        }
    }

    return 0;
}

user.c :

#include "user_manager.h"
#include "config.h"

/*
 * user: prints the id received as argv[1], sends one type-3 message with
 * numero = 33 on the KEY_QUEUE queue, then loops forever (a stand-in for
 * work the user keeps doing after sending the message).
 *
 * Returns EXIT_FAILURE if the queue cannot be opened or the send fails;
 * otherwise it never returns.
 */
int main(int argc, char const *argv[])
{
    struct msgUser my_msg;

    printf("\n user id: %s", (argc > 1) ? argv[1] : "(none)");
    /* no trailing newline above: flush now, or the text stays buffered for
       as long as the loop below runs */
    fflush(stdout);

    int q_id = msgget(KEY_QUEUE, IPC_CREAT | 0600);
    if (q_id == -1) {
        perror("msgget");
        return EXIT_FAILURE;
    }

    my_msg.mtype = 3;
    my_msg.numero = 33;

    /* the size counts only the bytes after mtype; a larger value makes the
       kernel read past the end of the struct */
    if (msgsnd(q_id, &my_msg, sizeof(my_msg) - sizeof(my_msg.mtype), 0) == -1) {
        perror("msgsnd");
        return EXIT_FAILURE;
    }

    while (1)
    {
        /* code */
    }

    return 0;
}

config.h:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <sys/msg.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/sem.h>
#include <sys/wait.h>

/* Message exchanged between user and master through the System V queue. */
struct msgUser {
    long mtype;             /* message type, must be > 0 */
    int numero;             /* integer payload carried by the message */
};
/*
    KEY_QUEUE is the System V key that both master and user pass to msgget().
    NOTE(review): the original note says child nodes increase this value to
    create their own queues; no code shown here does that -- confirm.
*/
#define KEY_QUEUE 0x200800                     
/*
   CHILD_NAME is the path of the user program that master launches via execve
*/
#define CHILD_NAME "user"
/*
   SO_USERS_NUM is the number of user processes that master forks
*/
#define SO_USERS_NUM 3

Solution

  • You can do both wait and msgrcv in the same loop, but you'll have to keep a count of the number of children terminated. And, you'll have to make the calls non-blocking:

    /*
     * Poll both event sources without blocking: drain one type-3 message per
     * pass and reap one terminated child per pass, until SO_USERS_NUM
     * children have been reaped (or no children remain at all).
     */
    int pid_done = 0;
    while (pid_done < SO_USERS_NUM) {
        /* the size given to msgrcv counts only the bytes after mtype */
        num_bytes = sizeof(mt_msg) - sizeof(mt_msg.mtype);

        /* now receiving the message; MSG_NOERROR truncates an oversized
           message instead of failing with E2BIG and leaving it stuck at the
           head of the queue */
        num_bytes = msgrcv(q_id, &mt_msg, num_bytes, 3, IPC_NOWAIT | MSG_NOERROR);
        if (num_bytes >= 0) {
            printf("received message");

            /* received a good message (possibly of zero length) */
            if (num_bytes >= sizeof(mt_msg.numero))
                printf(" numero: %d", mt_msg.numero);

            printf("\n");
        }

        pid_t pidnow = waitpid(-1,NULL,WNOHANG);
        if (pidnow > 0) {
            printf("reaped: pid %d\n",pidnow);
            ++pid_done;
        }
        else if ((pidnow < 0) && (errno == ECHILD)) {
            // no children exist any more (e.g. a fork failed earlier), so the
            // count can never be reached -- stop instead of looping forever
            break;
        }

        // optional: sleep a bit if no change in state occurred to prevent us from
        // "hammering" the system
        // NOTE: with WNOHANG, waitpid returns 0 (not -1) while children are
        // still running, so "nothing happened" is pidnow <= 0
        if ((num_bytes < 0) && (pidnow <= 0))
            usleep(100);
    }
    

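    Note: As I mentioned in my top comments, using 120 as the message size isn't just bad form; it is UB (undefined behavior). The size passed to msgsnd/msgrcv counts only the bytes after mtype, so with 120 the kernel reads or writes far past the end of the struct. The fix is therefore required, and the sender and the receiver must both use the corrected size.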


    The above is the simplest way. But, if we establish a signal handler for SIGCHLD, we can use blocking calls:

    // NOTE: chld_sig must be an _Atomic object: the atomic_* generic functions
    // are only defined for atomic types, and the counter is written from the
    // signal handler while the main loop reads it
    _Atomic int chld_sig = 0;               // # of children terminated (SIGCHLD)
    int chld_reap = 0;                      // # of children reaped (parent loop)

    // sigchld -- signal handler for SIGCHLD
    //
    // signo is the delivered signal number (unused); the handler only counts
    void
    sigchld(int signo)
    {
        (void) signo;

        // NOTE: we can _not_ do printf inside a signal handler

        // increment number of completed pids
        atomic_fetch_add(&chld_sig,1);
    }
    
    void
    parent_loop(void)
    {
    
        // using sigaction/sigprocmask, enable signal handler for SIGCHLD ...
    
        while (chld_reap < SO_USERS_NUM) {
            /* now receiving the message */
            do {
                num_bytes = sizeof(mt_msg) - sizeof(mt_msg.mtype);
    
                num_bytes = msgrcv(q_id, &mt_msg, num_bytes, 3, 0);
    
                // probably got EINTR
                if (num_bytes < 0)
                    break;
    
                printf("received message");
    
                /* received a good message (possibly of zero length) */
                if (num_bytes >= sizeof(mt_msg.numero))
                    printf(" numero: %d", mt_msg.numero);
    
                printf("\n");
            } while (0)
    
            // check for completed children
            while (1) {
                // get number of signals seen
                int count = atomic_load(&chld_sig);
    
                // no new child reaped
                if (count <= chld_reap)
                    break;
    
                // reap a child
                pid_t pidnow = waitpid(-1,NULL,0);
    
                // show the pid of the reaped child
                if (pidnow > 0) {
                    printf("reaped: pid %d\n",pidnow);
                    ++chld_reap;
                }
            }
        }
    }
    

    UPDATE:

    I tried the first implementation. It waits for the children, but it doesn't print anything, as if the children never wrote their messages to the queue.

    But if I start a child by itself, it adds a message to the queue; if I then start the master, it reads that earlier message, but no newly generated messages, and the children terminate with their PIDs printed. – Matteo Pagliarello 8 hours ago

    Because the queue persists across executions of master, if it fails to process all messages on the first invocation, the subsequent invocation will start with "stale" messages.

    Thus, it will become further "desynchronized".

    You didn't provide the full user code, but I suspect it sent several messages.

    There was a bug in the loop I provided:

    1. It would terminate as soon as it saw SO_USERS_NUM number of children complete.
    2. But, there could still be pending messages from various children
    3. These pending messages would remain after the master terminated
    4. They would appear as "stale" in the next invocation.
    5. The loop should only terminate when all children have completed AND there are no more messages.

    Here is the full and complete refactored code. I've combined all files into a single .c file:

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <errno.h>
    #include <string.h>
    #include <time.h>
    
    #include <sys/types.h>
    #include <sys/msg.h>
    #include <sys/ipc.h>
    #include <sys/shm.h>
    #include <sys/sem.h>
    #include <sys/wait.h>
    
    // message sent by each user (child) to the master through the queue
    struct msgUser {
        long mtype;                         /* message type, must be > 0 */
        int xid;                            // sequential user id (taken from argv[1])
        pid_t pid;                          // real process id of the sender
        int numero;                         // per-sender message sequence number
    };
    // payload size handed to msgsnd/msgrcv: everything after the mtype field
    #define MSGUSER_SIZE        (sizeof(struct msgUser) - sizeof(long))
    
    /*
        KEY_QUEUE is the System V key that both master and users pass to msgget().
        NOTE(review): the original note says child nodes increase this value to
        create their own queues; no code shown here does that -- confirm.
    */
    #define KEY_QUEUE 0x200800
    
    /*
       CHILD_NAME is the name of the user program.
       NOTE: domaster overwrites args[0] with argv[0] before calling execvp, so
       this value only serves as the initial placeholder in the args array.
    */
    #define CHILD_NAME "./user"
    
    /*
       SO_USERS_NUM is the number of user processes that the master forks
       (it can be overridden at compile time)
    */
    #ifndef SO_USERS_NUM
    #define SO_USERS_NUM    3
    #endif
    
    // max number of messages a single user sends (each user picks a random
    // count in the range 1..MAXMSG)
    #ifndef MAXMSG
    #define MAXMSG          50
    #endif
    
    // set this to 1 to force old [broken] behavior in wait_loop
    // (stop as soon as all children are reaped, even with messages pending)
    #ifndef FORCE_STALE
    #define FORCE_STALE     0
    #endif
    
    // message queue id, set by domaster and read by wait_loop
    int q_id;
    // pid of each launched user, indexed by its sequential id
    pid_t pidlist[SO_USERS_NUM];
    
    // wait_loop -- master's event loop
    //
    // Polls, without blocking, for type-3 messages on the global queue q_id
    // and for terminated children. Prints each message and each reaped child
    // (mapping its pid back to the sequential id through pidlist).
    //
    // Returns when all SO_USERS_NUM children are accounted for AND no message
    // is pending (or, with FORCE_STALE, as soon as the children are done).
    void
    wait_loop(void)
    {
        int chld_done = 0;
        struct msgUser mt_msg;

        while (1) {
            // NOTE: we can get messages from a given child even after the child
            // has terminated
            int num_bytes = msgrcv(q_id, &mt_msg, MSGUSER_SIZE, 3, IPC_NOWAIT);

            /* now receiving the message */
            if (num_bytes >= 0) {
                printf("wait_loop: received message (%d)",num_bytes);

                /* only a full-sized payload carries all three fields */
                if ((size_t) num_bytes >= MSGUSER_SIZE) {
                    printf(" xid:%d",mt_msg.xid);
                    printf(" numero:%d", mt_msg.numero);
                    printf(" pid:%d",(int) mt_msg.pid);
                }

                printf("\n");
            }

            // stop loop if:
            //   (1) all users reaped
            //   (2) no pending messages (from any child)
            if (chld_done >= SO_USERS_NUM) {
                if (FORCE_STALE || (num_bytes < 0))
                    break;
            }

            // has a child just terminated?
            pid_t pidnow = waitpid(-1, NULL, WNOHANG);

            // yes, remember the count
            if (pidnow > 0) {
                ++chld_done;

                // get the sequential id number for this pid
                int xid = -1;
                for (int idx = 0;  idx < SO_USERS_NUM;  ++idx) {
                    if (pidnow == pidlist[idx]) {
                        xid = idx;
                        break;
                    }
                }

                printf("reaped: xid:%d done:%d pid:%d\n",
                    xid, chld_done, (int) pidnow);
            }
            else if ((pidnow < 0) && (errno == ECHILD)) {
                // no children exist any more (e.g. a fork failed, so fewer
                // than SO_USERS_NUM were ever started): nothing is left to
                // reap, so treat the children as done and only drain the
                // queue from here on
                chld_done = SO_USERS_NUM;
            }

            // optional: sleep a bit if no change in state occurred to prevent us
            // from "hammering" the system
            // NOTE: with WNOHANG, waitpid returns 0 (not -1) while children are
            // still running, so "nothing happened" is pidnow <= 0
            if ((num_bytes < 0) && (pidnow <= 0))
                usleep(100);
        }
    }
    
    // douser -- do user/child
    //
    // argv[1] is the sequential id assigned by the master. The user picks a
    // random count in 1..MAXMSG, waits for the monotonic clock's seconds
    // counter to advance, then sends that many type-3 messages (numero = 0,
    // 1, ...) on the KEY_QUEUE queue.
    //
    // Returns 0 on success, EXIT_FAILURE if the id argument is missing, the
    // queue cannot be opened, or a send fails.
    int
    douser(int argc, char **argv)
    {
        if (argc < 2) {
            fprintf(stderr,"user: missing id argument\n");
            return EXIT_FAILURE;
        }

        // local id: named differently so it does not shadow the global q_id
        int qid = msgget(KEY_QUEUE, IPC_CREAT | 0600);
        if (qid < 0) {
            perror("user: msgget");
            return EXIT_FAILURE;
        }

        // zero the whole struct so no uninitialized bytes are sent
        struct msgUser my_msg;
        memset(&my_msg, 0, sizeof(my_msg));
        my_msg.mtype = 3;
        my_msg.xid = atoi(argv[1]);
        my_msg.pid = getpid();
        my_msg.numero = 0;

        // seed from the nanosecond clock so that users started at nearly the
        // same time still pick different counts
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC,&ts);
        srand(ts.tv_nsec);

        int count = (rand() % MAXMSG) + 1;

        printf("user:%d pid:%d (%d messages to send)\n",
            my_msg.xid, (int) my_msg.pid, count);
        fflush(stdout);

        // hold off until the seconds counter has advanced by at least one
        time_t osec = ts.tv_sec;
        while (1) {
            clock_gettime(CLOCK_MONOTONIC,&ts);
            if ((ts.tv_sec - osec) >= 1)
                break;
            usleep(1000);
        }

        for (int mno = 0;  mno < count;  ++mno) {
            my_msg.numero = mno;
            if (msgsnd(qid, &my_msg, MSGUSER_SIZE, 0) < 0) {
                // a lost message would otherwise go unnoticed
                perror("user: msgsnd");
                return EXIT_FAILURE;
            }
        }

        return 0;
    }
    
    // domaster -- do master/parent
    //
    // Opens the KEY_QUEUE queue, discards any messages left over from an
    // earlier run, launches SO_USERS_NUM users by re-executing this same
    // program (argv[0]) with the sequential id as its only argument, records
    // their pids in pidlist, then runs wait_loop.
    //
    // Returns 0 on success, EXIT_FAILURE if the queue cannot be opened.
    int
    domaster(int argc, char **argv)
    {
        (void) argc;

        q_id = msgget(KEY_QUEUE, IPC_CREAT | 0600);
        if (q_id < 0) {
            perror("master: msgget");
            return EXIT_FAILURE;
        }

        // drain all "stale" messages
        // NOTE: if we're working correctly, this should never happen
        int drain = 0;
        while (1) {
            struct msgUser mt_msg;

            // type 0 = first message of any type; MSG_NOERROR truncates a
            // stale message larger than our struct instead of failing with
            // E2BIG, which would end the drain with messages still queued
            if (msgrcv(q_id, &mt_msg, MSGUSER_SIZE, 0,
                IPC_NOWAIT | MSG_NOERROR) < 0)
                break;

            ++drain;
        }
        if (drain > 0)
            printf("master: DRAIN %d\n",drain);

        /*
           Create the users: fork SO_USERS_NUM times; each child re-executes
           this program with its sequential id, which makes main take the
           douser path. */
        for (int i = 0; i < SO_USERS_NUM; i++) {
            // NOTE: the id is the loop index and is formatted in the parent
            // (before the fork), so it does _not_ need shared memory
            char test[100];
            snprintf(test, sizeof(test), "%d", i);

            pid_t pid = fork();
            switch (pid) {
            case 0: {
                char *args[] = { argv[0], test, NULL };
                execvp(args[0], args);

                // only reached when the exec failed
                perror("master: execvp");
                // _exit: do not flush stdio buffers inherited from the parent
                _exit(EXIT_FAILURE);
            }

            case -1:
                perror("master: fork");
                break;

            default:
                printf("master: launch %d (pid %d)\n",i,(int) pid);
                pidlist[i] = pid;
                break;
            }
        }

        wait_loop();

        return 0;
    }
    
    // main -- entry point shared by master and users
    //
    // With at least one command-line argument the process acts as a user
    // (douser); with none it acts as the master (domaster). The exit status
    // is whatever the selected role returns.
    int
    main(int argc, char **argv)
    {
        // line-buffer stdout so that each complete line is written out as
        // soon as it is printed
        setvbuf(stdout, NULL, _IOLBF, 0);

        return (argc > 1) ? douser(argc,argv) : domaster(argc,argv);
    }
    

    Here is the program output:

    master: launch 0 (pid 3321553)
    master: launch 1 (pid 3321554)
    master: launch 2 (pid 3321555)
    user:0 pid:3321553 (40 messages to send)
    user:1 pid:3321554 (44 messages to send)
    user:2 pid:3321555 (11 messages to send)
    wait_loop: received message (16) xid:0 numero:0 pid:3321553
    wait_loop: received message (16) xid:2 numero:0 pid:3321555
    wait_loop: received message (16) xid:0 numero:1 pid:3321553
    wait_loop: received message (16) xid:2 numero:1 pid:3321555
    wait_loop: received message (16) xid:0 numero:2 pid:3321553
    wait_loop: received message (16) xid:2 numero:2 pid:3321555
    wait_loop: received message (16) xid:0 numero:3 pid:3321553
    wait_loop: received message (16) xid:2 numero:3 pid:3321555
    wait_loop: received message (16) xid:0 numero:4 pid:3321553
    wait_loop: received message (16) xid:2 numero:4 pid:3321555
    wait_loop: received message (16) xid:0 numero:5 pid:3321553
    wait_loop: received message (16) xid:2 numero:5 pid:3321555
    wait_loop: received message (16) xid:0 numero:6 pid:3321553
    wait_loop: received message (16) xid:1 numero:0 pid:3321554
    wait_loop: received message (16) xid:2 numero:6 pid:3321555
    wait_loop: received message (16) xid:0 numero:7 pid:3321553
    wait_loop: received message (16) xid:1 numero:1 pid:3321554
    wait_loop: received message (16) xid:2 numero:7 pid:3321555
    wait_loop: received message (16) xid:0 numero:8 pid:3321553
    reaped: xid:2 done:1 pid:3321555
    wait_loop: received message (16) xid:1 numero:2 pid:3321554
    wait_loop: received message (16) xid:2 numero:8 pid:3321555
    wait_loop: received message (16) xid:0 numero:9 pid:3321553
    wait_loop: received message (16) xid:1 numero:3 pid:3321554
    wait_loop: received message (16) xid:2 numero:9 pid:3321555
    wait_loop: received message (16) xid:0 numero:10 pid:3321553
    wait_loop: received message (16) xid:1 numero:4 pid:3321554
    wait_loop: received message (16) xid:2 numero:10 pid:3321555
    wait_loop: received message (16) xid:0 numero:11 pid:3321553
    wait_loop: received message (16) xid:1 numero:5 pid:3321554
    wait_loop: received message (16) xid:0 numero:12 pid:3321553
    wait_loop: received message (16) xid:1 numero:6 pid:3321554
    wait_loop: received message (16) xid:0 numero:13 pid:3321553
    wait_loop: received message (16) xid:1 numero:7 pid:3321554
    wait_loop: received message (16) xid:0 numero:14 pid:3321553
    wait_loop: received message (16) xid:1 numero:8 pid:3321554
    wait_loop: received message (16) xid:0 numero:15 pid:3321553
    wait_loop: received message (16) xid:1 numero:9 pid:3321554
    reaped: xid:0 done:2 pid:3321553
    wait_loop: received message (16) xid:0 numero:16 pid:3321553
    wait_loop: received message (16) xid:1 numero:10 pid:3321554
    wait_loop: received message (16) xid:0 numero:17 pid:3321553
    wait_loop: received message (16) xid:1 numero:11 pid:3321554
    wait_loop: received message (16) xid:0 numero:18 pid:3321553
    wait_loop: received message (16) xid:1 numero:12 pid:3321554
    wait_loop: received message (16) xid:0 numero:19 pid:3321553
    reaped: xid:1 done:3 pid:3321554
    wait_loop: received message (16) xid:1 numero:13 pid:3321554
    wait_loop: received message (16) xid:0 numero:20 pid:3321553
    wait_loop: received message (16) xid:1 numero:14 pid:3321554
    wait_loop: received message (16) xid:0 numero:21 pid:3321553
    wait_loop: received message (16) xid:1 numero:15 pid:3321554
    wait_loop: received message (16) xid:0 numero:22 pid:3321553
    wait_loop: received message (16) xid:1 numero:16 pid:3321554
    wait_loop: received message (16) xid:0 numero:23 pid:3321553
    wait_loop: received message (16) xid:1 numero:17 pid:3321554
    wait_loop: received message (16) xid:0 numero:24 pid:3321553
    wait_loop: received message (16) xid:1 numero:18 pid:3321554
    wait_loop: received message (16) xid:0 numero:25 pid:3321553
    wait_loop: received message (16) xid:1 numero:19 pid:3321554
    wait_loop: received message (16) xid:0 numero:26 pid:3321553
    wait_loop: received message (16) xid:1 numero:20 pid:3321554
    wait_loop: received message (16) xid:0 numero:27 pid:3321553
    wait_loop: received message (16) xid:1 numero:21 pid:3321554
    wait_loop: received message (16) xid:0 numero:28 pid:3321553
    wait_loop: received message (16) xid:1 numero:22 pid:3321554
    wait_loop: received message (16) xid:0 numero:29 pid:3321553
    wait_loop: received message (16) xid:1 numero:23 pid:3321554
    wait_loop: received message (16) xid:0 numero:30 pid:3321553
    wait_loop: received message (16) xid:1 numero:24 pid:3321554
    wait_loop: received message (16) xid:0 numero:31 pid:3321553
    wait_loop: received message (16) xid:1 numero:25 pid:3321554
    wait_loop: received message (16) xid:0 numero:32 pid:3321553
    wait_loop: received message (16) xid:1 numero:26 pid:3321554
    wait_loop: received message (16) xid:0 numero:33 pid:3321553
    wait_loop: received message (16) xid:1 numero:27 pid:3321554
    wait_loop: received message (16) xid:0 numero:34 pid:3321553
    wait_loop: received message (16) xid:1 numero:28 pid:3321554
    wait_loop: received message (16) xid:0 numero:35 pid:3321553
    wait_loop: received message (16) xid:1 numero:29 pid:3321554
    wait_loop: received message (16) xid:0 numero:36 pid:3321553
    wait_loop: received message (16) xid:1 numero:30 pid:3321554
    wait_loop: received message (16) xid:0 numero:37 pid:3321553
    wait_loop: received message (16) xid:1 numero:31 pid:3321554
    wait_loop: received message (16) xid:0 numero:38 pid:3321553
    wait_loop: received message (16) xid:1 numero:32 pid:3321554
    wait_loop: received message (16) xid:0 numero:39 pid:3321553
    wait_loop: received message (16) xid:1 numero:33 pid:3321554
    wait_loop: received message (16) xid:1 numero:34 pid:3321554
    wait_loop: received message (16) xid:1 numero:35 pid:3321554
    wait_loop: received message (16) xid:1 numero:36 pid:3321554
    wait_loop: received message (16) xid:1 numero:37 pid:3321554
    wait_loop: received message (16) xid:1 numero:38 pid:3321554
    wait_loop: received message (16) xid:1 numero:39 pid:3321554
    wait_loop: received message (16) xid:1 numero:40 pid:3321554
    wait_loop: received message (16) xid:1 numero:41 pid:3321554
    wait_loop: received message (16) xid:1 numero:42 pid:3321554
    wait_loop: received message (16) xid:1 numero:43 pid:3321554