algorithm compression 7zip lzma xz

Where in a .xz file is the uncompressed file size stored?


I read the XZ specification and I want to get the uncompressed size. I could not find any such field in the stream header or footer.

Am I supposed to:


Solution

  • Yes, what you described is correct, except you read the backward size from the footer of the .xz file, not the header. There can be zero padding, so you need to search backwards from the end of the file for the footer. You also need to continue to search for additional xz streams, since concatenated xz streams are valid .xz files.

    Example in C:

    // Show the uncompressed size that would result from decompressing a .xz file.
    // All concatenated xz streams in the file are included. Keep it simple, with
    // limited format error checking.
    
    #include <stdio.h>
    #include <setjmp.h>
    #ifdef _WIN32
    #  define off_t __int64
    #  define fseeko _fseeki64
    #  define ftello _ftelli64
    #endif
    
    // Error "throw": env is the jump target armed by the caller's setjmp(), and
    // bad() is the only place that jumps to it.
    static jmp_buf env;
    // Print msg (after whatever is already on the current output line) and unwind
    // to the most recent setjmp(env), which then returns 1. Never returns to its
    // caller. msg is only read, so it is const-qualified to accept string
    // literals and other read-only text without a cast.
    static void bad(const char *msg) {
        printf(" -- %s\n", msg);
        longjmp(env, 1);
    }
    
    // Fetch the next byte from in. End of file is reported as "invalid xz" and a
    // stream error as "read error"; either way bad() does not come back here.
    static unsigned get(FILE *in) {
        int c = getc(in);
        if (c != EOF)
            return (unsigned)c;
        bad(ferror(in) ? "read error" : "invalid xz");
        return (unsigned)c;
    }
    
    // Read n bytes from in and assemble them as a little-endian unsigned integer
    // (first byte is least significant). n must not exceed 8.
    static unsigned long long little(FILE *in, int n) {
        unsigned long long acc = 0;
        int shift = 0;
        while (n-- > 0) {
            unsigned long long byte = get(in);
            acc |= byte << shift;
            shift += 8;
        }
        return acc;
    }
    
    // Decode a variable-length unsigned integer from in: each byte supplies seven
    // value bits, least significant group first, and a set high bit means another
    // byte follows. An encoding that would need a tenth byte is rejected.
    static unsigned long long varint(FILE *in) {
        unsigned long long result = 0;
        for (int bits = 0; ; bits += 7) {
            if (bits == 63)                     // nine bytes already consumed
                bad("invalid xz");
            unsigned long long byte = get(in);
            result |= (byte & 0x7f) << bits;
            if ((byte & 0x80) == 0)
                break;
        }
        return result;
    }
    
    // Position xz at absolute offset pos, or report an error. Every read in
    // xzsz() assumes it comes from the offset just requested, so a failed seek
    // must stop the parse rather than be ignored.
    static void xzseek(FILE *xz, off_t pos) {
        if (fseeko(xz, pos, SEEK_SET))
            bad("could not seek");
    }
    
    // Return the total uncompressed size of the .xz streams in xz.
    //
    // xz must be seekable. The file is walked from its end towards its start:
    // for each stream the footer is located, the index it points back to is
    // read, and the uncompressed sizes recorded in that index are summed. Any
    // seek failure, read failure or format inconsistency is reported through
    // bad(), which does not return.
    static unsigned long long xzsz(FILE *xz) {
        if (fseeko(xz, 0, SEEK_END))
            bad("could not seek");
        off_t off = ftello(xz);
        if (off & 3)                            // length must be a multiple of four
            bad("invalid xz");
        unsigned long long usize = 0;
        do {                                    // process streams in reverse order
            // Step back four bytes at a time over any zero padding until the
            // two-byte footer signature is found at the end of a word.
            do {
                if (off < 32)                   // minimum .xz stream length
                    bad("invalid xz");
                off -= 4;
                xzseek(xz, off + 2);
            } while (get(xz) != 'Y' || get(xz) != 'Z');     // footer signature
            // The four bytes before this word hold the backward size, which
            // encodes the index length as (value + 1) * 4 bytes.
            xzseek(xz, off - 4);
            off -= ((little(xz, 4) + 1) << 2) + 8;          // offset of index
            if (off < 0)
                bad("invalid xz");
            xzseek(xz, off + 1);                // skip the first byte of the index
            unsigned long long recs = varint(xz);
            // Each record is a pair of varints: the block's stored size, which
            // is rounded up to a multiple of four to step back over the block,
            // and the block's uncompressed size, which is accumulated.
            while (recs--) {
                off -= (varint(xz) + 3) & ~3ULL;
                if (off < 0)
                    bad("invalid xz");
                usize += varint(xz);
            }
            off -= 12;                          // offset of stream header
            if (off < 0)
                bad("invalid xz");
            xzseek(xz, off);
            if (little(xz, 6) != 0x5a587a37fd)  // .xz header signature
                bad("invalid xz");
        } while (off);                          // more streams precede this one
        return usize;
    }
    
    // For every path given on the command line, print the path followed by the
    // total uncompressed size of its .xz contents, or by an error message.
    // Returns 0 if every file was processed, 1 if any of them failed.
    int main(int argc, char **argv) {
        int status = 0;
        int k = 1;
        while (k < argc) {
            const char *name = argv[k++];
            fputs(name, stdout);
            FILE *fp = fopen(name, "rb");
            // No local is modified between this setjmp() and a longjmp() from
            // bad(), so name, fp and k keep well-defined values on the error path.
            if (setjmp(env) == 0) {
                if (fp == NULL)
                    bad("could not open");
                printf(": %llu\n", xzsz(fp));
                fclose(fp);
            } else {
                // Landed here from bad(): release the file and note the failure.
                if (fp != NULL)
                    fclose(fp);
                status = 1;
            }
        }
        return status;
    }