I am looking for a node.js / Javascript module that applies the minhash algorithm to a string or bigger text, and returns me an "identifying" or "characteristic" Bytestring or Hexstring for that text. If I apply the algorithm to another similar text string, the hash string should also be similar. Does a module like that exist already?
The modules I was examining so far had only the possibility to compare texts directly and calculating some kind of jaccard similarity in numbers directly to the compared texts, but I would like to store some kind of hash-string for each document, so I can later compare the strings for similarity if I have similar texts...
Essentially, what I am looking for is this code from here (Java): in Javascript: https://github.com/codelibs/elasticsearch-minhash
for example, for a string like:
"The quick brown fox jumps over the lazy dog"
and "The quick brown fox jumps over the lazy d"
it would create a hash for the first sentence like:
"KV5rsUfZpcZdVojpG8mHLA=="
and for the second string something like:
KV5rsSfZpcGdVojpG8mGLA==
both hash-strings don't differ very much... that's the point in minhash algorithm, however, I don't know how to create that similar hashstring.. and all libraries thus far I have found, only compare directly 2 documents and create a similarity coefficient, but they don't create a hashstring that's characteristic for the document... The similarity with all algorithms is, that they create a hashed crc32 (or similar) hash value for their Array of word tokens (or shingles). But I still do not know how they compare those hashes with each other...
Requires Douglas Duhaime's implementation of minhash, but any other implementation computing an array of hash values could be used the same way.
const str1 = "The quick brown fox jumps over the lazy dog";
const str2 = "The quick brown fox jumps over the lazy d";
console.log(str1);
console.log(str2);
var s1 = str1.split(' ');
var s2 = str2.split(' ');
// create a hash for each set of words to compare
// default numPerm is 128 but that gives very long hash
// below 8, almost similar string will give exactly the same hash
var m1 = new Minhash({numPerm: 8});
var m2 = new Minhash({numPerm: 8});
// update each hash
s1.map(function(w) { m1.update(w) });
s2.map(function(w) { m2.update(w) });
// estimate the jaccard similarity between two minhashes
console.log('jaccard similarity:', m1.jaccard(m2));
// Now to convert hashvalues to a string we use a kind of base64
// encode but since hasvalues is an array of 32bits integer we
// have to explode it into a array of 8bits integers first
// for a given int32 returns 4 bytes
function int32ToBytes(num) {
// the hexadecimal representation of the largest 32bits unsigned integer is 0xFFFFFFFF
// the hexadecimal representation of the largest unsigned integer (8bits === a byte) is 0xFF
// so it is possible to think a 32bits uint (unsigned integer) as the concatenation of 4 8bits uint.
// the bitwise & operator is the bitwise AND
// its table of truth is 0 & 0 = 0, 0 & 1 = 0, 1 & 0 = 0 and 1 & 1 = 1
// for instance 8 & 1 <=> 0b111 & 0b001 <=> 0b001 <=> 1
// the same is possible with hex representation:
// 65535 & 255 <=> 0xFFFF & 0x00FF <=> 0x0FF <=> 255
// 65535 & 65280 <=> 0xFFFF & 0xFF00 <=> 0xFF00 <=> 65280
// 255 + 65535 = 65535
// now about the bitwise >> shift operator
// a >> n shift the number a by n bits to the right
// in hex FF is 8bits so `0xFF00 >> 8 = 0xFF`
// this operation is reversible `0xFF << 8 = 0xFF00`
// 0xFFFF needs 16 bits to be represented, as 0xFF00
// but 0xFF only needs 8 bits
// so its possible to split a 16 bits integer into two 8 bits integer this way:
// int16 = (int16 & 0xFF00) >> 8 + (int16 & 0x00FF) >> 0
// no information was lost because we're able to do the reverse operation
// the same principle is used below to encode a 32 bits integer into 4 bytes (8bits integers)
// max uint32 = 0xFFFFFFFF =
// 0xFF << 24 + 0xFF << 16 + 0xFF << 8 + 0xFF << 0
const arr = [
(num & 0xff000000) >> 24,
(num & 0x00ff0000) >> 16,
(num & 0x0000ff00) >> 8,
(num & 0x000000ff)
];
return arr;
}
// tolerant base64 encode of 4 bytes
function Uint8ToString(u8a){
var CHUNK_SZ = 0x8000;
var c = [];
for (var i=0; i < u8a.length; i+=CHUNK_SZ) {
c.push(String.fromCharCode.apply(null, u8a.subarray(i, i+CHUNK_SZ)));
}
return c.join("");
}
// tolerant base64 encode of int32 array
function base64EncodeInt32Array(intArray) {
let str = '';
intArray.forEach((i) => {
var u8 = new Uint8Array(int32ToBytes(i));
var b64encoded = btoa(Uint8ToString(u8));
str += b64encoded;
});
return str;
}
// replace non significant '==' to shorten hash
console.log(base64EncodeInt32Array(m1.hashvalues).replace(/==/g, ''));
console.log(base64EncodeInt32Array(m2.hashvalues).replace(/==/g, ''));
<script src='https://rawgit.com/duhaime/minhash/master/minhash.min.js'></script>