unixuniq

How sort piped into uniq works in unix


In the following:

cat file | cut -f 1,5,6 | sort | uniq

It seems intuitive to me that uniq needs to know the whole dataset before proceeding.
From HERE I understand that sort writes temporary files to disk for large sets of data.

Does uniq also write temporary files to disk for large datasets? If so, where?

Thank you!


Solution

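  • uniq only needs to read a line at a time and compare the current line to the previous one; it can start working as soon as it starts getting lines, so there is no need to read all input before producing any output. This is also why the input is sorted first: uniq only collapses adjacent duplicate lines, and sorting is what makes equal lines adjacent. Consequently uniq never needs temporary files.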

    Basically, it just needs to read a line and compare it to the previous line. If they're the same, increment a counter. If not, print the previous line (with its count, if requested) and reset the counter. Then save the current line as the previous one, and repeat. After the last line has been read, print the final saved line.

    Here's a bare bones version written in C you can use as an example:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    /* Length of a line's content, not counting one trailing '\n' if present. */
    static size_t content_len(const char *line, size_t len) {
      return (len > 0 && line[len - 1] == '\n') ? len - 1 : len;
    }
    
    /* Write one finished group: the optional count prefix, then the line bytes. */
    static void emit_group(const char *line, size_t len, unsigned long long count,
                           _Bool show_count) {
      if (show_count) {
        printf("%7llu ", count);
      }
      fwrite(line, 1, len, stdout);
    }
    
    /*
     * Bare-bones uniq: copies stdin to stdout, collapsing each run of adjacent
     * identical lines into a single line.
     *
     * Usage: prog [-c]
     *   -c  prefix every output line with the number of times it occurred.
     *
     * Only two line buffers are kept at any time (the current line and the
     * previous one), so memory use depends on the longest line, not on the
     * size of the input.
     *
     * Returns EXIT_SUCCESS on success, EXIT_FAILURE on bad arguments, on a read
     * error, or when writing to stdout failed.
     */
    int main(int argc, char **argv) {
      _Bool show_count = 0;
    
      if (argc == 2 && strcmp(argv[1], "-c") == 0) {
        show_count = 1;
      } else if (argc > 1) {
        fprintf(stderr, "Usage: %s [-c]\n", argv[0]);
        return EXIT_FAILURE;
      }
    
      /* prev holds the first line of the current run; line is scratch space that
         getline() reuses (and grows when needed) on every iteration. */
      char *prev = NULL;
      size_t prev_cap = 0;
      size_t prev_len = 0;
      char *line = NULL;
      size_t line_cap = 0;
      unsigned long long count = 0;
      int status = EXIT_SUCCESS;
    
      for (;;) {
        ssize_t got = getline(&line, &line_cap, stdin);
        if (got < 0) {
          if (!feof(stdin)) {
            fprintf(stderr, "Couldn't read input: %s\n", strerror(errno));
            status = EXIT_FAILURE;
          }
          break;
        }
    
        size_t len = (size_t)got;
        size_t body = content_len(line, len);
    
        /* Compare by length + memcmp so embedded NUL bytes are significant, and
           ignore the trailing newline so an unterminated final line still
           matches an identical terminated line before it. */
        if (prev != NULL && body == content_len(prev, prev_len) &&
            memcmp(line, prev, body) == 0) {
          count++;
          continue;
        }
    
        if (prev != NULL) {
          emit_group(prev, prev_len, count, show_count);
        }
    
        /* The current line starts a new run: swap the buffers instead of
           allocating a fresh one for every input line. */
        char *tmp = prev;
        prev = line;
        line = tmp;
        size_t tmp_cap = prev_cap;
        prev_cap = line_cap;
        line_cap = tmp_cap;
    
        prev_len = len;
        count = 1;
      }
    
      /* Flush the last run, unless reading failed. */
      if (status == EXIT_SUCCESS && prev != NULL) {
        emit_group(prev, prev_len, count, show_count);
      }
    
      /* getline() may allocate a buffer even on the call that fails or hits
         EOF, so both buffers are released here on every path. */
      free(line);
      free(prev);
    
      if (status == EXIT_SUCCESS && (ferror(stdout) || fflush(stdout) != 0)) {
        fprintf(stderr, "Couldn't write output\n");
        status = EXIT_FAILURE;
      }
    
      return status;
    }