In the following:
cat file | cut -f 1,5,6 | sort | uniq
It seems intuitive to me that uniq
needs to know the whole dataset before proceeding.
From HERE I understand that sort
writes temporary files to disk for large sets of data.
Does uniq
also write temporary files to disk for large datasets? If so, where?
Thank you!
uniq
only needs to read a line at a time and compare the current line to the previous one; it can start working as soon as it starts getting lines; no need to read all input before producing any output.
Basically, it just needs to read a line and compare it to the previous line. If they're the same, increment a counter. If not, print the previous line (with its count, if requested). Then save the current line as the previous one, and repeat.
Here's a bare bones version written in C you can use as an example:
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Length of a getline() record excluding its trailing newline, if present. */
static size_t content_len(const char *text, size_t len) {
    return (len > 0 && text[len - 1] == '\n') ? len - 1 : len;
}

/*
 * Write one finished run to stdout: the optional count prefix followed by
 * the stored line bytes exactly as they were read.
 * Returns 0 on success, -1 if either write failed.
 */
static int emit_run(const char *text, size_t len, unsigned long long count,
                    _Bool show_count) {
    if (show_count && printf("%7llu ", count) < 0) {
        return -1;
    }
    if (fwrite(text, 1, len, stdout) != len) {
        return -1;
    }
    return 0;
}

/*
 * Bare-bones uniq: collapse runs of identical adjacent lines read from stdin
 * and write one copy of each run to stdout. With the single option -c, each
 * output line is prefixed by the number of lines in its run.
 *
 * Only the first line of the current run and the line just read are held in
 * memory, so output starts as soon as a run ends; the whole input is never
 * buffered.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE on a usage error, a read
 * error, or a write error.
 */
int main(int argc, char **argv) {
    _Bool show_count = 0;
    if (argc == 2 && strcmp(argv[1], "-c") == 0) {
        show_count = 1;
    } else if (argc > 1) {
        fprintf(stderr, "Usage: %s [-c]\n", argv[0]);
        return EXIT_FAILURE;
    }

    /*
     * Two getline() buffers that are swapped whenever a new run starts, so
     * memory is reused instead of being allocated and freed per line.
     */
    char *prev = NULL;   /* first line of the run currently being counted */
    size_t prev_cap = 0;
    size_t prev_len = 0;
    char *line = NULL;   /* scratch buffer for the line just read */
    size_t line_cap = 0;
    unsigned long long count = 0; /* lines in the current run; 0 = no run yet */
    int status = EXIT_FAILURE;

    for (;;) {
        errno = 0;
        ssize_t got = getline(&line, &line_cap, stdin);
        if (got < 0) {
            if (!feof(stdin)) {
                fprintf(stderr, "Couldn't read input: %s\n", strerror(errno));
                goto cleanup;
            }
            break;
        }
        size_t len = (size_t)got;

        /*
         * Compare by length + memcmp rather than strcmp so that lines with
         * embedded NUL bytes are compared in full. The trailing newline is
         * excluded so an unterminated final line still joins a matching run.
         */
        size_t body = content_len(line, len);
        if (count > 0 && body == content_len(prev, prev_len) &&
            memcmp(line, prev, body) == 0) {
            count++;
            continue;
        }

        /* A different line ends the pending run: flush it before moving on. */
        if (count > 0 && emit_run(prev, prev_len, count, show_count) != 0) {
            fprintf(stderr, "Couldn't write output: %s\n", strerror(errno));
            goto cleanup;
        }

        /* The line just read becomes the head of the next run. */
        char *swap_buf = prev;
        prev = line;
        line = swap_buf;
        size_t swap_cap = prev_cap;
        prev_cap = line_cap;
        line_cap = swap_cap;
        prev_len = len;
        count = 1;
    }

    /* End of input: the last run has not been written yet. */
    if (count > 0 && emit_run(prev, prev_len, count, show_count) != 0) {
        fprintf(stderr, "Couldn't write output: %s\n", strerror(errno));
        goto cleanup;
    }
    /* Buffered output may only fail here (e.g. full disk, closed pipe). */
    if (fflush(stdout) != 0) {
        fprintf(stderr, "Couldn't write output: %s\n", strerror(errno));
        goto cleanup;
    }
    status = EXIT_SUCCESS;

cleanup:
    /* getline() buffers must be released even when the last call failed. */
    free(line);
    free(prev);
    return status;
}