Tags: string, algorithm, sorting

Return a new string that sorts between two given strings


Given two strings a and b, where a is lexicographically < b, I'd like to return a string c such that a < c < b. The use case is inserting a node in a database sorted by such keys. You can specify the format for a, b, and c if you like, as long as it is possible to generate initial values as well as new values on insert.

Is there a practical algorithm for this?


Solution

  • Minimising string length

    If you want to keep the string lengths to a minimum, you could create a string that is lexicographically halfway between the left and right strings, so that there is room to insert additional strings, and only create a longer string if absolutely necessary.

    I will assume an alphabet [a-z], and a lexicographical ordering where the end of a string (i.e. no character at all) sorts before 'a', so that e.g. "ab" comes before "abc".

    Basic case

    You start by copying the characters from the beginning of the strings, until you encounter the first difference, which could be either two different characters, or the end of the left string:

    abcde ~ abchi  ->  abc  +  d ~ h  
    abc   ~ abchi  ->  abc  +  _ ~ h  
    

    The new string is then created by appending the character that is halfway in the alphabet between the left character (or the beginning of the alphabet) and the right character:

    abcde ~ abchi  ->  abc  +  d ~ h  ->  abcf  
    abc   ~ abchi  ->  abc  +  _ ~ h  ->  abcd  
    

    Consecutive characters

    If the two different characters are lexicographically consecutive, first copy the left character, and then append the character halfway between the next character from the left string and the end of the alphabet:

    abhs ~ abit  ->  ab  +  h ~ i  ->  abh  +  s ~ _  ->  abhw
    abh  ~ abit  ->  ab  +  h ~ i  ->  abh  +  _ ~ _  ->  abhn
    

    If the next character(s) in the left string are one or more z's, then copy them and append the character halfway between the first non-z character and the end of the alphabet:

    abhz   ~ abit  ->  ab  +  h ~ i  ->  abh  +  z ~ _  ->  abhz  +  _ ~ _  ->  abhzn  
    abhzs  ~ abit  ->  ab  +  h ~ i  ->  abh  +  z ~ _  ->  abhz  +  s ~ _  ->  abhzw  
    abhzz  ~ abit  ->  ab  +  h ~ i  ->  abh  +  z ~ _  ->  ... ->  abhzz  +  _ ~ _  ->  abhzzn
    

    Right character is a or b

    You should never create a string by appending an 'a' to the left string, because that would create two lexicographically consecutive strings, between which no further strings could be inserted. The solution is to always append an additional character, halfway between the beginning of the alphabet and the next character from the right string:

    abc  ~ abcah   ->  abc  +  _ ~ a  ->  abca  +  _ ~ h  ->  abcad  
    abc  ~ abcab   ->  abc  +  _ ~ a  ->  abca  +  _ ~ b  ->  abcaa  +  _ ~ _  ->  abcaan  
    abc  ~ abcaah  ->  abc  +  _ ~ a  ->  abca  +  _ ~ a  ->  abcaa  +  _ ~ h  ->  abcaad  
    abc  ~ abcb    ->  abc  +  _ ~ b  ->  abca  +  _ ~ _  ->  abcan
    

    Code examples

    Below is a code snippet which demonstrates the method. It's a bit fiddly because JavaScript, but not actually complicated. To generate a first string, call the function with two empty strings; this will generate the string "n". To insert a string before the leftmost or after the rightmost string, call the function with that string and an empty string.

    /**
     * Build a key that sorts strictly between two keys made of the letters a-z.
     *
     * An empty `prev` means "no lower bound" and an empty `next` means "no upper
     * bound", so midString("", "") yields the first key ("n").
     *
     * The inputs are not validated: the caller must pass prev < next, and `next`
     * must not be `prev` followed only by 'a' characters (no key fits in that gap).
     *
     * @param {string} prev - key that must sort before the result ("" = open start)
     * @param {string} next - key that must sort after the result ("" = open end)
     * @returns {string} a key c with prev < c < next for valid input
     */
    function midString(prev, next) {
        var BELOW_A = 96;   // char code just before 'a': stands in for "prev has ended"
        var ABOVE_Z = 123;  // char code just after 'z': stands in for "next has ended"

        // Char code at index i, or the matching sentinel once the string is exhausted.
        function prevCode(i) {
            return i < prev.length ? prev.charCodeAt(i) : BELOW_A;
        }
        function nextCode(i) {
            return i < next.length ? next.charCodeAt(i) : ABOVE_Z;
        }

        // Locate the leftmost position where the two keys differ.
        var diff = 0;
        while (prevCode(diff) === nextCode(diff)) {
            diff++;
        }
        var low = prevCode(diff);
        var high = nextCode(diff);
        var result = prev.slice(0, diff);   // shared prefix is copied verbatim
        var cursor = diff + 1;              // next index to read from prev or next

        if (low === BELOW_A) {
            // prev is a prefix of next: never end on 'a', so mirror next's leading a's.
            while (high === 97) {
                result += 'a';
                high = nextCode(cursor++);
            }
            if (high === 98) {
                // Only 'a' fits below 'b'; emit it and pick the middle of the full range.
                result += 'a';
                high = ABOVE_Z;
            }
        } else if (low + 1 === high) {
            // Adjacent letters: keep prev's letter and extend past the rest of prev.
            result += String.fromCharCode(low);
            high = ABOVE_Z;
            low = prevCode(cursor++);
            while (low === 122) {
                result += 'z';              // nothing fits above 'z', so copy it and move on
                low = prevCode(cursor++);
            }
        }

        // Append the letter halfway between the two bounds, rounding up.
        return result + String.fromCharCode(Math.ceil((low + high) / 2));
    }
    
    // Demo: start with two empty sentinels (open start / open end) and keep
    // inserting a new key between a randomly chosen pair of neighbours,
    // printing the whole list after every insertion until it holds 100 entries.
    var strings = ["", ""];
    do {
        // Index of the left neighbour; the right neighbour is always rnd + 1.
        var rnd = Math.floor((strings.length - 1) * Math.random());
        strings.splice(
            rnd + 1,
            0,
            midString(strings[rnd], strings[rnd + 1])
        );
        document.write(strings + "<br>");
    } while (strings.length < 100);

    Below is a straightforward translation into C. Call the function with two empty null-terminated strings to generate the first string, or with one empty string to insert before the leftmost or after the rightmost string. The string buffer buf should be large enough to hold the longer of the two input strings plus one extra character and the terminating null.

    /* Returns s[i], or `fallback` when s[i] is the terminating null. */
    static char midstring_char_or(const char *s, int i, char fallback) {
        return s[i] ? s[i] : fallback;
    }

    /*
     * Writes into buf a null-terminated key that sorts between prev and next
     * (both null-terminated, letters a-z; an empty string means "no bound on
     * that side"). Inputs are not validated and buf is not bounds-checked.
     * Returns the length of the generated key, excluding the terminator.
     */
    int midstring(const char *prev, const char *next, char *buf) {
        const char below_a = 'a' - 1;   // stands in for "prev has ended"
        const char above_z = 'z' + 1;   // stands in for "next has ended"
        char lo, hi;
        int out = 0;

        // Copy the shared prefix and stop at the first differing position.
        for (;;) {
            lo = midstring_char_or(prev, out, below_a);
            hi = midstring_char_or(next, out, above_z);
            if (lo != hi) break;
            buf[out++] = lo;
        }

        if (lo == below_a) {
            // prev ended first: mirror the run of a's found in next.
            for (; hi == 'a'; hi = midstring_char_or(next, out, above_z)) {
                buf[out++] = 'a';
            }
            if (hi == 'b') {
                // Only 'a' fits below 'b'; emit it and use the full range.
                buf[out++] = 'a';
                hi = above_z;
            }
        } else if (lo + 1 == hi) {
            // Adjacent letters: keep prev's letter, then skip over prev's z's.
            buf[out++] = lo;
            hi = above_z;
            for (lo = midstring_char_or(prev, out, below_a);
                 lo == 'z';
                 lo = midstring_char_or(prev, out, below_a)) {
                buf[out++] = 'z';
            }
        }

        // Append the letter halfway between the bounds, rounding up.
        buf[out++] = (char)(hi - (hi - lo) / 2);
        buf[out] = '\0';
        return out;
    }
    

    Average string length

    The best case is when the elements are inserted in random order. In practice, when generating 65,536 strings in pseudo-random order, the average string length is around 4.74 characters (the theoretical minimum, using every combination before moving to longer strings, would be 3.71).

    The worst case is when inserting the elements in order, and always generating a new rightmost or leftmost string; this will lead to a recurring pattern:

    n, u, x, z, zn, zu, zx, zz, zzn, zzu, zzx, zzz, zzzn, zzzu, zzzx, zzzz...  
    n, g, d, b, an, ag, ad, ab, aan, aag, aad, aab, aaan, aaag, aaad, aaab...  
    

    with an extra character being added after every fourth string.


    If you have an existing ordered list for which you want to generate keys, generate lexicographically equally-spaced keys with an algorithm like the one below, and then use the algorithm described above to generate a new key when inserting a new element.

    The code checks how many characters are needed, how many different characters are needed for the least significant digit, and then switches between two selections from the alphabet to get the right number of keys. E.g. keys with two characters can have 676 different values, so if you ask for 1600 keys, that is 1.37 extra keys per two-character combination, so after each two-character key one ('n') or two ('j', 'r') additional keys are generated by appending a character, i.e.: aan ab abj abr ac acn ad adn ae aej aer af afn ... (skipping the initial 'aa').

    /**
     * Generate `num` keys over the letters a-z, in ascending lexicographic
     * order and roughly evenly spaced, for seeding an existing ordered list.
     *
     * Every combination of (chars - 1) letters is visited in order. Each one
     * contributes the combination itself (trailing a's stripped, and the very
     * first all-'a' combination skipped) followed by `part` or `part + 1` keys
     * formed by appending one letter from a spread-out selection of the alphabet.
     *
     * @param {number} num - number of keys wanted
     * @returns {string[]} the keys in ascending order; empty when num < 1
     */
    function seqString(num) {
        var strings = [];
        // Nothing to generate. Without this guard the recursion below receives a
        // negative (truthy) depth and never terminates.
        if (!(num >= 1)) return strings;

        // Key length, counted with integer arithmetic. A floating-point
        // log(num) / log(26) may round the wrong way at exact powers of 26.
        var chars = 1;
        var prev = 1;                       // 26^(chars - 1)
        while (prev * 26 <= num) {
            prev *= 26;
            chars++;
        }

        // Average number of extra keys to hang under each (chars - 1)-letter prefix.
        var ratio = chars > 1 ? (num + 1 - prev) / prev : num;
        var part = Math.floor(ratio);
        // alpha[0] is used for prefixes that get `part` extra keys, alpha[1] for
        // those that get `part + 1`.
        var alpha = [partialAlphabet(part), partialAlphabet(part + 1)];
        // The fractional part of ratio is accumulated; each time the running
        // total crosses 1 the current prefix receives one extra key.
        var leapStep = ratio % 1;
        var leapTotal = 0.5;
        var first = true;
        generateStrings(chars - 1, "");
        return strings;

        // Recursively enumerates every prefix of `full` more letters after `str`
        // and emits the keys for each complete prefix.
        function generateStrings(full, str) {
            var i;
            if (full) {
                for (i = 0; i < 26; i++) {
                    generateStrings(full - 1, str + String.fromCharCode(97 + i));
                }
                return;
            }
            // The first prefix consists only of a's and would strip to "", so skip it.
            if (first) first = false;
            else strings.push(stripTrailingAs(str));
            leapTotal += leapStep;
            var leap = Math.floor(leapTotal);
            leapTotal %= 1;
            for (i = 0; i < part + leap; i++) {
                strings.push(str + alpha[leap][i]);
            }
        }

        // Drops trailing 'a' characters from str.
        function stripTrailingAs(str) {
            var last = str.length - 1;
            while (str.charAt(last) == 'a') --last;
            return str.slice(0, last + 1);
        }

        // Returns the letters selected by a 25-bit mask over 'b'..'z'
        // (bit 0 = 'b', bit 24 = 'z'). 'a' is never selected.
        function partialAlphabet(num) {
            // Only 25 letters exist; asking for more selects none. This slot is
            // never indexed, because part + leap cannot exceed 25.
            if (num > 25) return [];
            // magic[k] is the mask used for k < 13 letters; for k >= 13 the mask
            // is the 25-bit complement of magic[25 - k].
            var magic = [0, 4096, 65792, 528416, 1081872, 2167048, 2376776, 4756004,
                         4794660, 5411476, 9775442, 11097386, 11184810, 22369621];
            var bits = num < 13 ? magic[num] : 33554431 - magic[25 - num];
            var chars = [];
            for (var i = 1; i < 26; i++, bits >>= 1) {
                if (bits & 1) chars.push(String.fromCharCode(97 + i));
            }
            return chars;
        }
    }
    // Demo: generate 1600 keys and write them to the page, separated by spaces.
    document.write(seqString(1600).join(' '));