arrayscpermutationarray-comparison

How to effectively code to check if two arrays with the same number of elements has the same elements even if in different positions?


I was trying to solve this problem: Given a number N and two arrays of A, B of N number of elements determine if B contains the same elements as A (not necessarily at the same position).

There will be three inputs:

Examples:

input:

4
4 2 3 7
2 3 4 9

output: no

input:

5
5 1 1 9 3
1 9 1 5 3

output: yes

My Code:

#include <stdio.h>
#include <stdlib.h>

int main()
{
    int n;
    scanf("%d", &n);
    long int a[n], b[n];
    for (int i = 0; i < n; i++)
    {
        scanf("%ld", &a[i]);
    }
    for (int i = 0; i < n; i++)
    {
        scanf("%ld", &b[i]);
    }
    
    int same;
    for (int j = 0; j < n; j++)
    {
        same = 0;
        for (int k = 0; k < n; k++)
        {
            if (a[j] == b[k])
            {
                same = 1;
            }
        }
        if (same == 0)
        {
            break;
        }
    }
    if (same == 0)
    {
        printf("no\n");
    }
    else
    {
        printf("yes\n");
    }
    return 0;
}

My code will take the first element of the array A and compare it with all the elements of array B and then take the following elements of A and do the same. This approach could successfully give correct output for the first 3 test cases (only 2 were shown in the question) but it shows wrong answer on test 4 and I don't know why as the input of that test case is not shown. So, please help me find the problem in my code.


Solution

  • Your approach, which has quadratic time complexity, does not work if the arrays have the same numbers but in differing amounts: eg: 1 1 2 and 1 2 2 will produce yes eventhough the arrays do not strictly contain the same numbers in a possibly different order.

    You could avoid this problem using an extra array of booleans storing whether b[k] has already been used.

    A more efficient approach is to make copies of both arrays, sort them and compare the sorted arrays (complexity O(N.log(N))).

    Here is a modified version using the extra array of booleans:

    #include <stdio.h>
    #include <stdbool.h>
    
    int main(void)
    {
        int n;
    
        if (scanf("%d", &n) != 1 || n <= 0)
            return 1;
    
        long int a[n], b[n];
        bool seen[n];
    
        for (int i = 0; i < n; i++) {
            if (scanf("%ld", &a[i]) != 1)
                return 1;
        }
        for (int i = 0; i < n; i++) {
            if (scanf("%ld", &b[i]) != 1)
                return 1;
            seen[i] = false;
        }
        
        int same = 1;
        for (int j = 0; j < n; j++) {
            same = 0;
            for (int k = 0; k < n; k++) {
                if (a[j] == b[k] && seen[k] == false) {
                    same = 1;
                    seen[k] = true;
                    break;
                }
            }
            if (same == 0)
                break;
        }
        if (same == 0) {
            printf("no\n");
        } else {
            printf("yes\n");
        }
        return 0;
    }
    

    Here is a more efficient version (for large sets), that sorts the arrays using qsort:

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdbool.h>
    
    int compare_longs(const void *aa, const void *bb) {
        const long int *a = aa;
        const long int *b = bb;
        return (*a > *b) - (*a < *b);
    }
    
    long int *read_array(int n) {
        long int *a = calloc(sizeof(*a), n);
        if (a == NULL) {
            fprintf(stderr, "cannot allocate array\n");
            exit(1);
        }
        for (int i = 0; i < n; i++) {
            if (scanf("%ld", &a[i]) != 1) {
                fprintf(stderr, "input error\n");
                free(a);
                exit(1);
            }
        }
        return a;
    }
    
    int main(void) {
        int n;
    
        if (scanf("%d", &n) != 1 || n <= 0)
            return 1;
    
        long int *a = read_array(n);
        long int *b = read_array(n);
    
        qsort(a, n, sizeof(*a), compare_longs);
        qsort(b, n, sizeof(*b), compare_longs);
    
        int same = 1;
        for (int i = 0; i < n; i++) {
            if (a[i] != b[i]) {
                same = 0;
                break;
            }
        }
        if (same == 0) {
            printf("no\n");
        } else {
            printf("yes\n");
        }
        free(a);
        free(b);
        return 0;
    }
    

    Finally, here is an alternative using a hash table that should have quasi linear time complexity at the cost of extra memory:

    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>
    
    enum hash_state { FREE = 0, USED, DELETED };
    
    typedef struct hash_entry {
        long int key;
        enum hash_state state;
        int count;
    } hash_entry;
    
    typedef struct hash_table {
        size_t size;
        hash_entry *arr;
    } hash_table;
    
    size_t hash_hash(hash_table *tab, long int key) {
        unsigned long int v = key;
        // rotate left one bit
        v = (v << 1) | (v / (ULONG_MAX ^ (ULONG_MAX >> 1)));
        return v % tab->size;
    }
    
    hash_table *hash_init(hash_table *tab, size_t n) {
        if (n > SIZE_MAX / 2)
            return NULL;
        tab->size = (n * 2) | 1;
        tab->arr = calloc(tab->size, sizeof(hash_entry));
        if (!tab->arr)
            return NULL;
        return tab;
    }
    
    void hash_free(hash_table *tab) {
        free(tab->arr);
    }
    
    hash_entry *hash_add(hash_table *tab, long int key) {
        for (size_t i = hash_hash(tab, key);; i = (i + 1) % tab->size) {
            if (tab->arr[i].state == FREE) {
                tab->arr[i].state = USED;
                tab->arr[i].key = key;
                return &tab->arr[i];
            }
            if (tab->arr[i].state == USED && tab->arr[i].key == key) {
                return &tab->arr[i];
            }
        }
    }
    
    hash_entry *hash_find(hash_table *tab, long int key) {
        for (size_t i = hash_hash(tab, key);; i = (i + 1) % tab->size) {
            if (tab->arr[i].state == FREE)
                return NULL;
            if (tab->arr[i].state == USED && tab->arr[i].key == key)
                return &tab->arr[i];
        }
    }
    
    long int *read_array(size_t n) {
        long int *a = calloc(sizeof(*a), n);
        if (a == NULL) {
            fprintf(stderr, "cannot allocate arrays\n");
            exit(1);
        }
        for (size_t i = 0; i < n; i++) {
            if (scanf("%ld", &a[i]) != 1) {
                fprintf(stderr, "input error\n");
                free(a);
                exit(1);
            }
        }
        return a;
    }
    
    int main(void) {
        int n;
    
        if (scanf("%d", &n) != 1 || n <= 0)
            return 1;
    
        long int *a = read_array(n);
        long int *b = read_array(n);
    
        hash_table tab[1];
        if (!hash_init(tab, n)) {
            fprintf(stderr, "cannot initialize hash table\n");
            return 1;
        }
        for (int i = 0; i < n; i++) {
            hash_add(tab, a[i])->count++;
        }
        int same = 1;
        for (int i = 0; i < n; i++) {
            hash_entry *e = hash_find(tab, b[i]);
            if (!e || !e->count--) {
                same = 0;
                break;
            }
        }
        hash_free(tab);
        if (same == 0) {
            printf("no\n");
        } else {
            printf("yes\n");
        }
        free(a);
        free(b);
        return 0;
    }
    

    In practice, the hash version is only slightly faster than the qsort version for large random sets (100k or 1 million random values) because the overhead of hashing is large, the hash function is simplistic and the log factor in sorting is small.

    Here is a benchmark for 100000 random numbers (and a shuffled array):

    chqrlie ~/dev/stackoverflow > time ./samesets-simplistic < samesets.100000
    yes
    
    real    0m2.212s
    user    0m2.182s
    sys     0m0.023s
    chqrlie ~/dev/stackoverflow > time ./samesets-qsort < samesets.100000
    yes
    
    real    0m0.096s
    user    0m0.080s
    sys     0m0.013s
    chqrlie ~/dev/stackoverflow > time ./samesets-hash < samesets.100000
    yes
    
    real    0m0.072s
    user    0m0.052s
    sys     0m0.014s