javascriptnode.jsminhash

Node.js / javascript minhash module that outputs a similar hashstring for similar text


I am looking for a node.js / Javascript module that applies the minhash algorithm to a string or bigger text, and returns me an "identifying" or "characteristic" Bytestring or Hexstring for that text. If I apply the algorithm to another similar text string, the hash string should also be similar. Does a module like that exist already?

The modules I was examining so far had only the possibility to compare texts directly and calculating some kind of jaccard similarity in numbers directly to the compared texts, but I would like to store some kind of hash-string for each document, so I can later compare the strings for similarity if I have similar texts...

Essentially, what I am looking for is this code from here (Java): in Javascript: https://github.com/codelibs/elasticsearch-minhash

for example, for a string like: "The quick brown fox jumps over the lazy dog" and "The quick brown fox jumps over the lazy d" it would create a hash for the first sentence like:

"KV5rsUfZpcZdVojpG8mHLA=="

and for the second string something like:

KV5rsSfZpcGdVojpG8mGLA==

both hash-strings don't differ very much... that's the point in minhash algorithm, however, I don't know how to create that similar hashstring.. and all libraries thus far I have found, only compare directly 2 documents and create a similarity coefficient, but they don't create a hashstring that's characteristic for the document... The similarity with all algorithms is, that they create a hashed crc32 (or similar) hash value for their Array of word tokens (or shingles). But I still do not know how they compare those hashes with each other...


Solution

  • Requires Douglas Duhaime's implementation of minhash, but any other implementation computing an array of hash values could be used the same way.

    const str1 = "The quick brown fox jumps over the lazy dog";
    const str2 = "The quick brown fox jumps over the lazy d";
    console.log(str1);
    console.log(str2);
    var s1 = str1.split(' ');
    var s2 = str2.split(' ');
    
    // create a hash for each set of words to compare
    // default numPerm is 128 but that gives very long hash
    // below 8, almost similar string will give exactly the same hash
    var m1 = new Minhash({numPerm: 8});
    var m2 = new Minhash({numPerm: 8});
    
    // update each hash
    s1.map(function(w) { m1.update(w) });
    s2.map(function(w) { m2.update(w) });
    
    
    // estimate the jaccard similarity between two minhashes
    console.log('jaccard similarity:', m1.jaccard(m2));
    
    // Now to convert hashvalues to a string we use a kind of base64
    // encode but since hasvalues is an array of 32bits integer we
    // have to explode it into a array of 8bits integers first
    
    // for a given int32 returns 4 bytes
    function int32ToBytes(num) {
        // the hexadecimal representation of the largest 32bits unsigned integer is 0xFFFFFFFF
        // the hexadecimal representation of the largest unsigned integer (8bits === a byte) is 0xFF
        // so it is possible to think a 32bits uint (unsigned integer) as the concatenation of 4 8bits uint.
        // the bitwise & operator is the bitwise AND
        // its table of truth is 0 & 0 = 0, 0 & 1 = 0, 1 & 0 = 0 and 1 & 1 = 1
        // for instance 8 & 1 <=> 0b111 & 0b001 <=> 0b001 <=> 1
    
        // the same is possible with hex representation:
        // 65535 & 255 <=> 0xFFFF & 0x00FF <=> 0x0FF <=> 255
        // 65535 & 65280 <=> 0xFFFF & 0xFF00 <=> 0xFF00 <=> 65280
        // 255 + 65535 = 65535
    
        // now about the bitwise >> shift operator
        // a >> n shift the number a by n bits to the right
        // in hex FF is 8bits so `0xFF00 >> 8 = 0xFF`
        // this operation is reversible `0xFF << 8 = 0xFF00`
    
        // 0xFFFF needs 16 bits to be represented, as 0xFF00
        // but 0xFF only needs 8 bits
        // so its possible to split a 16 bits integer into two 8 bits integer this way:
        // int16 = (int16 & 0xFF00) >> 8 + (int16 & 0x00FF) >> 0
        // no information was lost because we're able to do the reverse operation
    
        // the same principle is used below to encode a 32 bits integer into 4 bytes (8bits integers)
       // max uint32 = 0xFFFFFFFF =
       // 0xFF << 24 + 0xFF << 16 + 0xFF << 8 + 0xFF << 0
        
    
      
        const arr = [
            (num & 0xff000000) >> 24,
            (num & 0x00ff0000) >> 16,
            (num & 0x0000ff00) >> 8,
            (num & 0x000000ff)
        ];
        return arr;
    }
    
    // tolerant base64 encode of 4 bytes
    function Uint8ToString(u8a){
      var CHUNK_SZ = 0x8000;
      var c = [];
      for (var i=0; i < u8a.length; i+=CHUNK_SZ) {
        c.push(String.fromCharCode.apply(null, u8a.subarray(i, i+CHUNK_SZ)));
      }
      return c.join("");
    }
    
    // tolerant base64 encode of int32 array
    function base64EncodeInt32Array(intArray) {
        let str = '';
        intArray.forEach((i) => {
            var u8 = new Uint8Array(int32ToBytes(i));
            var b64encoded = btoa(Uint8ToString(u8));
            str += b64encoded;
        });
        
        return str;
        
    }
    
    // replace non significant '==' to shorten hash
    console.log(base64EncodeInt32Array(m1.hashvalues).replace(/==/g, ''));
    console.log(base64EncodeInt32Array(m2.hashvalues).replace(/==/g, ''));
    <script src='https://rawgit.com/duhaime/minhash/master/minhash.min.js'></script>