c++precisionlong-integer

Issue with big numbers in a C++ cosine similarity function


I'm writing this function

// Returns dot(a, b) / (|a| * |b|) for two unsigned long vectors.
// b is indexed over a's length, so it must hold at least a.size() elements.
// Note: every product below is formed in unsigned long arithmetic; only the
// finished product is added to the long double accumulators.
long double CosineDistance(const vector<unsigned long>& a, const vector<unsigned long>& b) {
  long double dot = 0.0;
  long double sumSqA = 0.0;
  long double sumSqB = 0.0;

  int idx = 0;
  while (idx < a.size()) {
    const unsigned long x = a[idx];
    const unsigned long y = b[idx];
    dot += x * y;
    sumSqA += x * x;
    sumSqB += y * y;
    ++idx;
  }

  // Dot product divided by the product of the two Euclidean norms.
  return dot / (sqrt(sumSqA) * sqrt(sumSqB));
}

And it works as I expect with small numbers:

e.g. passing {1,3,8} and {5,4,9} returns 0.936686 (which is correct)

Now the project I'm building uses big numbers (they are hashed strings) and using numbers like

{3337682107,92015386,2479056,2478761,4153082938}

and

{104667454,92015386,150359366,2225484100,2479056}

it returns 1, which I think is a rounded approximation of 0.968597, the value WolframAlpha gives.

I have already checked for overflow and it's not happening.

Is there a way to fix this?

Thanks


Solution

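  • I checked this using Matlab and C++ (x64 VC2013). For your "big numbers" case, I got an answer of 0.0314034 instead of 0.968597. (Note that 0.968597 ≈ 1 − 0.0314034, so the WolframAlpha figure appears to be the cosine distance, i.e. one minus the similarity computed here.) I used the raw numbers as double values instead of converting from int to double.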

    Here is how I checked things.

    #include <cmath>
    #include <vector>
    #include <iostream>
    using namespace std;
    
    // Cosine similarity (dot product over the product of the norms) in double precision.
    double CosineDistance(const vector<double> &a, const vector<double> &b);
    // Same computation carried out entirely in long double.
    long double CosineDistance2(const vector<long double> &a, const vector<long double> &b);
    // Integer input; each element is converted to long double before it is multiplied.
    long double Cos2(const vector<unsigned long> &a, const vector<unsigned long> &b);
    // Integer input; elements are multiplied as unsigned long and the product is then accumulated in long double.
    long double Cos3(const vector<unsigned long> &a, const vector<unsigned long> &b);
    
    // Runs each variant on the small example and on the "big numbers" example
    // (at several scales and element types), then prints one result per line.
    int main(int argc, char * argv[]){

        // Small example from the question.
        const vector<double> smallA = { 1, 3, 8 };
        const vector<double> smallB = { 5, 4, 9 };
        const double v1 = CosineDistance(smallA, smallB);

        // Big-number example with every value divided by 1e6.
        const vector<double> microA = { 3337.682107, 92.015386, 2.479056, 2.478761, 4153.082938 };
        const vector<double> microB = { 104.667454, 92.015386, 150.359366, 2225.484100, 2.479056 };
        const double v2 = CosineDistance(microA, microB);

        // Big-number example with every value divided by 1e7.
        const vector<double> tinyA = { 333.7682107, 9.2015386, .2479056, .2478761, 415.3082938 };
        const vector<double> tinyB = { 10.4667454, 9.2015386, 15.0359366, 222.5484100, .2479056 };
        const double v3 = CosineDistance(tinyA, tinyB);

        // Small example divided by 10.
        const vector<double> tenthA = { .1, .3, .8 };
        const vector<double> tenthB = { .5, .4, .9 };
        const double v4 = CosineDistance(tenthA, tenthB);

        // Unscaled big numbers stored directly as long double.
        const vector<long double> wideA = { 3337682107, 92015386, 2479056, 2478761, 4153082938 };
        const vector<long double> wideB = { 104667454, 92015386, 150359366, 2225484100, 2479056 };
        const long double v5 = CosineDistance2(wideA, wideB);

        // Unscaled big numbers stored as unsigned long, fed to both integer variants.
        const vector<unsigned long> hashA = { 3337682107, 92015386, 2479056, 2478761, 4153082938 };
        const vector<unsigned long> hashB = { 104667454, 92015386, 150359366, 2225484100, 2479056 };
        const long double v6 = Cos2(hashA, hashB);
        const long double v7 = Cos3(hashA, hashB);

        cout << v1 << endl
             << v2 << endl
             << v3 << endl
             << v4 << endl
             << v5 << endl
             << v6 << endl
             << v7 << endl;

        return 0;
    }
    
    // Cosine similarity of two double vectors: dot(a, b) / (|a| * |b|).
    // b is indexed over a's length, so it must hold at least a.size() elements.
    double CosineDistance(const vector<double> &a, const vector<double> &b){

        double dot = 0.0;
        double sumSqA = 0.0;
        double sumSqB = 0.0;

        unsigned int idx = 0;
        while (idx < a.size()){
            const double x = a[idx];
            const double y = b[idx];
            dot += x * y;
            sumSqA += x * x;
            sumSqB += y * y;
            ++idx;
        }

        // Dot product divided by the product of the two Euclidean norms.
        return dot / (sqrt(sumSqA) * sqrt(sumSqB));
    }
    
    // Long double flavour of the cosine similarity: dot(a, b) / (|a| * |b|).
    // Elements of b are read at every index of a, so b must not be shorter than a.
    long double CosineDistance2(const vector<long double> &a, const vector<long double> &b){

        long double dotProduct = 0.0;
        long double normSqA = 0.0;
        long double normSqB = 0.0;

        for (unsigned int k = 0; k < a.size(); ++k){
            const long double &lhs = a[k];
            const long double &rhs = b[k];
            dotProduct += lhs * rhs;
            normSqA += lhs * lhs;
            normSqB += rhs * rhs;
        }

        const long double normProduct = sqrt(normSqA) * sqrt(normSqB);
        return dotProduct / normProduct;
    }
    
    /**
     * Cosine similarity of two unsigned long vectors, evaluated in long double.
     *
     * Each element is explicitly converted to long double BEFORE it is
     * multiplied, so no product is ever formed in unsigned long arithmetic
     * and none can wrap around.
     *
     * @param a first vector
     * @param b second vector; it is read at every index of a, so it must hold
     *          at least a.size() elements
     * @return dot(a, b) / (|a| * |b|); the division is 0/0 when a is empty or
     *         either vector contains only zeros
     */
    long double Cos2(const vector<unsigned long> &a, const vector<unsigned long> &b){

        long double num(0.0), den1(0.0), den2(0.0);

        // Widen on the fly: this gives the same values as copying both inputs
        // into temporary vector<long double> buffers, without the allocations.
        for (std::size_t i = 0; i < a.size(); ++i){
            const long double x = static_cast<long double>(a[i]);
            const long double y = static_cast<long double>(b[i]);
            num += x * y;
            den1 += x * x;
            den2 += y * y;
        }

        long double res = num / (sqrt(den1) * sqrt(den2));

        return res;
    }
    
    // Cosine-similarity variant that multiplies the raw unsigned long elements.
    // Each product is evaluated in unsigned long arithmetic (which wraps modulo
    // 2^N for large operands) and only the finished product is converted to
    // long double when it is added to an accumulator. This is the variant whose
    // printed result differs from the others for the big-number input.
    long double Cos3(const vector<unsigned long> &a, const vector<unsigned long> &b){

        long double dot = 0.0;
        long double sumSqA = 0.0;
        long double sumSqB = 0.0;

        unsigned int pos = 0;
        while (pos < a.size()){
            const unsigned long x = a[pos];
            const unsigned long y = b[pos];
            dot += x * y;       // integer multiply, then widen
            sumSqA += x * x;
            sumSqB += y * y;
            ++pos;
        }

        return dot / (sqrt(sumSqA) * sqrt(sumSqB));
    }
    

    The output is:

    0.936686
    0.0314034
    0.0314034
    0.936686
    0.0314034
    0.0314034
    0.581537
    

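    Notice that when I explicitly convert from unsigned long to long double before multiplying (Cos2), my answer agrees with both Matlab and my other C++ numbers. Cos3, which multiplies the unsigned long values directly, gives a different result (0.581537) because the integer products overflow before they are converted to long double.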