When I try to fit a linear curve to my data points the curve does not match my data. The line is below my data points and the slope is too deep, too. It could be due to starting at x=10 and its corresponding y value, but I am not sure. least square method should be correct, as I tried multiple ways of calculating slope and intercept, but all look methods produce the same output.
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#define PATH_GNUPLOT "your/path/gnuplot"
void ERROR(char* error_string);
int main() {
// first 10 data points are irrelevant
double data[100] = {0,0,0,0,0,0,0,0,0,0, 0.242910, 0.235523, 0.228976, 0.223114, 0.217820, 0.213005, 0.208597, 0.204539, 0.200786, 0.197299, 0.194047, 0.191003, 0.188146, 0.185456, 0.182916, 0.180513, 0.178234, 0.176068, 0.174006, 0.172039, 0.170160, 0.168362, 0.166639, 0.164987, 0.163399, 0.161872, 0.160402, 0.158984, 0.157617, 0.156297, 0.155020, 0.153785, 0.152590, 0.151431, 0.150308, 0.149218, 0.148159, 0.147131, 0.146131, 0.145158, 0.144211, 0.143289, 0.142391, 0.141515, 0.140661, 0.139827, 0.139014, 0.138219, 0.137442, 0.136683, 0.135941, 0.135215, 0.134504, 0.133809, 0.133128, 0.132461, 0.131807, 0.131166, 0.130538, 0.129922, 0.129318, 0.128725, 0.128142, 0.127571, 0.127010, 0.126458, 0.125916, 0.125384, 0.124861, 0.124346, 0.123840, 0.123342, 0.122853, 0.122371, 0.121897, 0.121430, 0.120970, 0.120518, 0.120072, 0.119633, 0.119200, 0.118774, 0.118354, 0.117939, 0.117531, 0.117128, 0.116731, 0.116340, 0.115953, 0.115572};
int n = 100;
/* calc fit */
double sumx, sumy, sumx2, sumxy, a, b = 0.0;
// => log(y) = a * log(x) + b
// data starts at 10 for x value
for (int i = 10; i < n; i++) {
double x = log(i);
double y = log(data[i]);
sumx += x;
sumx2 += x * x;
sumy += y;
sumxy += y * x;
}
a = (n * sumxy - sumx * sumy) / (n * sumx2 - sumx*sumx);
b = (sumy * sumx2 - sumx * sumxy) / (n * sumx2 - sumx*sumx);
/* end calc fit */
/* save data and plot */
FILE *fp = fopen("data.dat", "w");
if (fp == NULL) {
ERROR("failed to open data.dat");
}
for (int i = 10; i < n; i++)
fprintf(fp, "%d %.16lf\n", i, data[i]);
fclose(fp);
FILE *GNUpipe = popen(PATH_GNUPLOT, "w");
if (GNUpipe == NULL) {
ERROR("failed to open gnuplot");
}
fprintf(GNUpipe, "set term png\n");
fprintf(GNUpipe, "set logscale xy\n");
// log(y) = a * log(x) + b
// => y = x**a + 10**b
fprintf(GNUpipe, "Fit(x) = x**%lf * 10**%lf\n", a, b);
fprintf(GNUpipe, "plot [0.1:%d*1.1] 'data.dat' title 'error', ", n);
fprintf(GNUpipe, "Fit(x) title 'fit'\n");
pclose(GNUpipe);
}
void ERROR(char* error_string) {
fprintf(stderr, error_string);
exit(EXIT_FAILURE);
}
EDIT:
Code so far with all changes implemented:
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#define PATH_GNUPLOT "your/path/gnuplot"
void ERROR(char* error_string);
int main() {
// first 10 data points are irrelevant
double data[100] = {0,0,0,0,0,0,0,0,0,0, 0.242910, 0.235523, 0.228976, 0.223114, 0.217820, 0.213005, 0.208597, 0.204539, 0.200786, 0.197299, 0.194047, 0.191003, 0.188146, 0.185456, 0.182916, 0.180513, 0.178234, 0.176068, 0.174006, 0.172039, 0.170160, 0.168362, 0.166639, 0.164987, 0.163399, 0.161872, 0.160402, 0.158984, 0.157617, 0.156297, 0.155020, 0.153785, 0.152590, 0.151431, 0.150308, 0.149218, 0.148159, 0.147131, 0.146131, 0.145158, 0.144211, 0.143289, 0.142391, 0.141515, 0.140661, 0.139827, 0.139014, 0.138219, 0.137442, 0.136683, 0.135941, 0.135215, 0.134504, 0.133809, 0.133128, 0.132461, 0.131807, 0.131166, 0.130538, 0.129922, 0.129318, 0.128725, 0.128142, 0.127571, 0.127010, 0.126458, 0.125916, 0.125384, 0.124861, 0.124346, 0.123840, 0.123342, 0.122853, 0.122371, 0.121897, 0.121430, 0.120970, 0.120518, 0.120072, 0.119633, 0.119200, 0.118774, 0.118354, 0.117939, 0.117531, 0.117128, 0.116731, 0.116340, 0.115953, 0.115572};
int n = 100;
/* calc fit */
double sumx = 0.0;
double sumy = 0.0;
double sumx2 = 0.0;
double sumxy = 0.0;
double a = 0.0;
double b = 0.0;
// => log(y) = a * log(x) + b
// data starts at 10 for x value
for (int i = 10; i < n; i++) {
double x = log(i);
double y = log(data[i]);
sumx += x;
sumx2 += x * x;
sumy += y;
sumxy += y * x;
}
int N = n - 10;
double denom = N*sumx2 - sumx * sumx;
b = (sumy * sumx2 - sumx * sumxy)/ denom;
a = (N*sumxy - sumx * sumy)/ denom;
printf("%g %g\n",a , b);
/* end calc fit */
/* save data and plot */
FILE *fp = fopen("data.dat", "w");
if (fp == NULL) {
ERROR("failed to open data.dat");
}
for (int i = 10; i < n; i++)
fprintf(fp, "%d %.16le\n", i, data[i]);
fclose(fp);
FILE *GNUpipe = popen(PATH_GNUPLOT, "w");
if (GNUpipe == NULL) {
ERROR("failed to open gnuplot");
}
fprintf(GNUpipe, "set term png\n");
fprintf(GNUpipe, "set logscale xy\n");
// log(y) = a * log(x) + b
// => y = x**a * 10**b
fprintf(GNUpipe, "Fit(x) = x**%.16lf * 10**%.16lf\n", a, b);
fprintf(GNUpipe, "plot [0.1:%d*1.1] 'data.dat' title 'error', ", n);
fprintf(GNUpipe, "Fit(x) title 'fit'\n");
pclose(GNUpipe);
}
void ERROR(char* error_string) {
fprintf(stderr, error_string);
exit(EXIT_FAILURE);
}
"data.dat": link
stdout: -0.323977 -0.66909
for printf("%g %g\n",a , b);
At least these problems:
Need to zero initialize
// double sumx, sumy, sumx2, sumxy, a, b = 0.0;
double sumx = 0.0;
double sumy = 0.0;
...
Save time
Enable more warnings: "warning: excess elements in array initializer"
Too many initializers
// double data[100] = { 101 initializers };
Avoid losing precision
// fprintf(GNUpipe, "Fit(x) = x**%lf * 10**%.*le\n", a, b);
fprintf(GNUpipe, "Fit(x) = x**%.16le * 10**%.16le\n", a, b);
// fprintf(fp, "%d %.16lf\n", i, data[i]);
fprintf(fp, "%d %.16le\n", i, data[i]);
Wrong n
Code uses n
. Since, for (int i = 10; i < n; i++)
, should be n-10
in a, b
formulas.
Maybe wrong formula?
Looks wrong to me, need deeper review.
// ??
a = (n * sumxy - sumx * sumy) / (n * sumx2 - sumx*sumx);
b = (sumy * sumx2 - sumx * sumxy) / (n * sumx2 - sumx*sumx);
I expect: ref.
N = n - 10;
double denom = N*sumx2 - sumx * sumx;
a = (sumy * sumx2 - sumx * sumxy)/ denom;
b = (N*sumxy - sumx * sumy)/ denom;
Ah, I see we have a, b
reversed.
[Append]
Wrong base
Code is using log()
(log base-e). Likely wants log10()
if "Fit(x) = x**%lf * 10**%lf\n"
is important.