TPCCLIB
Loading...
Searching...
No Matches
pearson.c File Reference

Pearson's correlation coefficient and regression line. More...

#include "libtpcmodel.h"

Go to the source code of this file.

Functions

int pearson (double *x, double *y, int nr, double *k, double *kSD, double *b, double *bSD, double *r, double *ySD)
 
int pearson2 (double *x, double *y, char *is, int nr, double *k, double *kSD, double *b, double *bSD, double *r, double *ySD)
 
int pearson3 (double *x, double *y, int nr, double *k, double *kSD, double *b, double *bSD, double *r, double *ySD)
 
int pearson4 (double *x, double *y, int nr, double start, double end, double *k, double *kSD, double *b, double *bSD, double *r, double *ySD)
 Calculate slope and intercept of a line and Pearson's correlation coefficient.
 
int best_pearson (double *x, double *y, int nr, int min_nr, int *first, int *last, double *k, double *kSD, double *b, double *bSD, double *r, double *ySD)
 
int mean (double *x, double *y, int nr, double *xmean, double *xsd, double *ymean, double *ysd)
 
int regr_line (double *x, double *y, int n, double *m, double *c)
 
int highest_slope (double *x, double *y, int n, int slope_n, double *m, double *c, double *xi, double *xh)
 
int highest_slope_after (double *x, double *y, int n, int slope_n, double x_start, double *m, double *c, double *xi, double *xh)
 

Detailed Description

Pearson's correlation coefficient and regression line.

Author
Vesa Oikonen

Definition in file pearson.c.

Function Documentation

◆ best_pearson()

int best_pearson ( double * x,
double * y,
int nr,
int min_nr,
int * first,
int * last,
double * k,
double * kSD,
double * b,
double * bSD,
double * r,
double * ySD )

Find the best linear fit to double data (x[], y[]) with nr points.

Data may contain NaN's, which are not used.

See also
highest_slope, regr_line, pearson
Returns
Returns the nr of points actually used, or 0, in case of error.
Parameters
xData x values
yData y values
nrNumber of data sample values
min_nrMinimum nr of data points to use
firstIndex [0..last-2] of the first point to use initially, and after fitting
lastIndex [first+1..nr-1] of the last point to use initially, and after fitting
kslope
kSDS.D. of slope
by axis intercept
bSDS.D. of y axis intercept
rPearson's correlation coefficient, r
ySDResidual variance of y values

Definition at line 252 of file pearson.c.

277 {
278 int i, n, m, x1, x2, b1, b2, bm;
279 double *nx, *ny;
280 double tk, tkSD, tb, tbSD, tr, tySD;
281
282
283 if(0) {
284 fprintf(stdout, "best_pearson(x, y, %d, %d, %d, %d, k, kSD, b, bSD, r, ySD)\n",
285 nr, min_nr, *first, *last);
286 fflush(stdout);
287 }
288 /* Remove NaN's and those outside range first-last */
289 /* Allocate memory for new data pointers */
290 nx=(double*)calloc(nr, sizeof(double)); if(nx==NULL) return 0;
291 ny=(double*)calloc(nr, sizeof(double)); if(ny==NULL) {free((char*)nx); return 0;}
292 /* Copy data to pointers */
293 if(*last>nr-1) *last=nr-1;
294 if(*first<0) *first=0;
295 for(i=*first, n=0; i<=*last; i++)
296 if(!isnan(x[i]) && !isnan(y[i])) {nx[n]=x[i]; ny[n]=y[i]; n++;}
297
298 /* Check that we have enough points */
299 if(n<2 || n<min_nr) {free((char*)nx); free((char*)ny); return 0;}
300 if(n==min_nr) {
301 i=pearson(nx, ny, n, k, kSD, b, bSD, r, ySD);
302 free((char*)nx); free((char*)ny);
303 if(i) return 0; else return n;
304 }
305
306 /* OK, let's start */
307 *k=*kSD=*b=*bSD=*r=*ySD=0.; b1=b2=bm=0;
308 for(x1=0, m=n; n-x1>min_nr; x1++) {
309 /*printf("I x1=%i x2=%i m=%i n=%i\n", x1, x2, m, n);*/
310 for(x2=n-1, m=x2-x1+1; m>=min_nr; x2--, m--) {
311 if(pearson(nx+x1, ny+x1, m, &tk, &tkSD, &tb, &tbSD, &tr, &tySD))
312 continue;
313 if((tr>*r) ||
314 (tr==*r && m>bm) ||
315 (tr==*r && m==bm && x1>b1) ||
316 (tr==*r && m==bm && x1==b1 && tk>*k) ) {
317 bm=m; b1=x1; b2=x2;
318 *k=tk; *kSD=tkSD; *b=tb; *bSD=tbSD; *r=tr; *ySD=tySD;
319 }
320 /*printf("Fit range: %2i-%2i ; best fit: %2i-%2i\n", x1, x2, b1, b2);*/
321 }
322 }
323
324 /* Set first&last */
325 for(i=*first; i<=*last; i++)
326 if(x[i]==nx[b1] && y[i]==ny[b1]) {*first=i; break;}
327 for(i=*last; i>=*first; i--)
328 if(x[i]==nx[b2] && y[i]==ny[b2]) {*last=i; break;}
329 /*printf("FIRST=%i LAST=%i\n", *first, *last);*/
330
331 free((char*)nx); free((char*)ny);
332 return bm;
333}
int pearson(double *x, double *y, int nr, double *k, double *kSD, double *b, double *bSD, double *r, double *ySD)
Definition pearson.c:14

◆ highest_slope()

int highest_slope ( double * x,
double * y,
int n,
int slope_n,
double * m,
double * c,
double * xi,
double * xh )

Finds the regression line with the highest slope for x,y data.

See also
highest_slope_after, regr_line, best_pearson
Returns
Return 0 if ok.
Parameters
xAn array of x axis values
yAn array of y axis values
nThe number of values in x and y arrays
slope_nThe number of samples used to fit the line
mPointer where calculated slope is written; NULL if not needed
cPointer where calculated y axis intercept is written; NULL if not needed
xiPointer where calculated x axis intercept is written; NULL if not needed
xhPointer where the place (x) of the highest slope is written; NULL if not needed

Definition at line 424 of file pearson.c.

442 {
443 int i, ret, i_at_max=0;
444 double slope, ic, max_slope, ic_at_max=0.0;
445
446 /* Check the data */
447 if(x==NULL || y==NULL) return(1);
448 if(n<2) return(2);
449 if(slope_n>n) return(3);
450
451 /* Compute */
452 max_slope=-1.0E200;
453 for(i=0; i<=n-slope_n; i++) {
454 ret=regr_line(x+i, y+i, slope_n, &slope, &ic); if(ret) continue;
455 if(slope>max_slope) {max_slope=slope; ic_at_max=ic; i_at_max=i;}
456 } /* next slope */
457 if(max_slope==-1.0E200) return(10);
458 if(m!=NULL) *m=max_slope;
459 if(c!=NULL) *c=ic_at_max;
460 if(xi!=NULL) {if(max_slope!=0.0) *xi=-ic_at_max/max_slope; else *xi=0.0;}
461 if(xh!=NULL) {
462 *xh=0.0; for(i=i_at_max; i<i_at_max+slope_n; i++) *xh+=x[i];
463 *xh/=(double)slope_n;
464 }
465
466 return(0);
467}
int regr_line(double *x, double *y, int n, double *m, double *c)
Definition pearson.c:387

Referenced by dftFixPeak().

◆ highest_slope_after()

int highest_slope_after ( double * x,
double * y,
int n,
int slope_n,
double x_start,
double * m,
double * c,
double * xi,
double * xh )

Finds the regression line with the highest slope for x,y data after specified x.

See also
highest_slope, regr_line
Returns
Return 0 if ok.
Parameters
xAn array of x axis values.
yAn array of y axis values.
nThe number of values in x and y arrays.
slope_nThe number of samples used to fit the line.
x_startEstimation start x value, samples with smaller x are ignored; can usually be set to zero.
mPointer where calculated slope is written; NULL if not needed.
cPointer where calculated y axis intercept is written; NULL if not needed.
xiPointer where calculated x axis intercept is written; NULL if not needed.
xhPointer where the place (x) of the highest slope is written; NULL if not needed.

Definition at line 475 of file pearson.c.

495 {
496 int i, ret, i_at_max=0;
497 double slope, ic, max_slope, ic_at_max=0.0;
498
499 /* Check the data */
500 if(x==NULL || y==NULL) return(1);
501 if(n<2) return(2);
502 if(slope_n>n) return(3);
503
504 /* Compute */
505 max_slope=-1.0E200;
506 for(i=0; i<=n-slope_n; i++) if(x[i]>=x_start) {
507 ret=regr_line(x+i, y+i, slope_n, &slope, &ic); if(ret) continue;
508 if(slope>max_slope) {max_slope=slope; ic_at_max=ic; i_at_max=i;}
509 } /* next slope */
510 if(max_slope==-1.0E200) return(10);
511 if(m!=NULL) *m=max_slope;
512 if(c!=NULL) *c=ic_at_max;
513 if(xi!=NULL) {if(max_slope!=0.0) *xi=-ic_at_max/max_slope; else *xi=0.0;}
514 if(xh!=NULL) {
515 *xh=0.0; for(i=i_at_max; i<i_at_max+slope_n; i++) *xh+=x[i];
516 *xh/=(double)slope_n;
517 }
518
519 return(0);
520}

◆ mean()

int mean ( double * x,
double * y,
int nr,
double * xmean,
double * xsd,
double * ymean,
double * ysd )

Calculates the mean and SD of data. Data (y data) may contain NaN's.

See also
dmean, dmedian, dmean_nan, regr_line, pearson
Returns
Returns !=0 in case of an error.
Parameters
xData x values
yData y values
nrNumber of data sample values
xmeanCalculated x mean
xsdCalculated SD of x mean
ymeanCalculated y mean
ysdCalculated SD of y mean

Definition at line 341 of file pearson.c.

356 {
357 int i, n;
358 double sumsqr, sqrsum;
359
360 for(i=0, sumsqr=sqrsum=0.0, n=0; i<nr; i++) if(!isnan(x[i]) && !isnan(y[i])) {
361 sumsqr+=x[i]*x[i]; sqrsum+=x[i]; n++;}
362 if(n<=0) return -1;
363 *xmean=sqrsum/(double)n;
364 if(n==1) *xsd=0.0; else {
365 sqrsum*=sqrsum;
366 *xsd=sqrt( (sumsqr - sqrsum/(double)n) / (double)(n-1) );
367 }
368 for(i=0, sumsqr=sqrsum=0.0, n=0; i<nr; i++) if(!isnan(x[i]) && !isnan(y[i])) {
369 sumsqr+=y[i]*y[i]; sqrsum+=y[i]; n++;}
370 if(n<=0) return -1;
371 *ymean=sqrsum/(double)n;
372 if(n==1) *ysd=0.0; else {
373 sqrsum*=sqrsum;
374 *ysd=sqrt( (sumsqr - sqrsum/(double)n) / (double)(n-1) );
375 }
376 return 0;
377}

Referenced by dftMeanTAC(), doubleMatchRel(), imgsegmClusterExpand(), least_trimmed_square(), mertwiRandomExponential(), noiseSD4SimulationFromDFT(), and resMean().

◆ pearson()

int pearson ( double * x,
double * y,
int nr,
double * k,
double * kSD,
double * b,
double * bSD,
double * r,
double * ySD )

Calculate slope and intercept of a line and Pearson's correlation coefficient.

See also
regr_line, best_pearson, highest_slope, pearson2, pearson3, pearson4, mean, imgsegmPearson, nlopt1D
Returns
If successful, function returns value 0.
Parameters
xdata x values
ydata y values
nrnumber of data sample values
kslope
kSDS.D. of slope
by axis intercept
bSDS.D. of y axis intercept
rPearson's correlation coefficient, r
ySDResidual variance of y values

Definition at line 14 of file pearson.c.

33 {
34 int i;
35 double e, f, g, meanx, meany, kk, bb, ke, be, rr, sumsdcy=0.0;
36 double sumx=0.0, sumy=0.0, sumsx=0.0, sumsdx=0.0, sumsdy=0.0, sumdxdy=0.0;
37 double sumxy=0.0, sumsy=0.0;
38
39
40 if(0) {
41 fprintf(stdout, "pearson(x[], y[], %d, *k, *kSD, *b, *bSD, *r, *ySD)\n", nr);
42 fflush(stdout);
43 }
44 /* Check that there is some data */
45 if(x==NULL || y==NULL || nr<2) return(1);
46
47 /* If only 2 datapoints */
48 if(nr==2) {
49 f=x[1]-x[0]; if(fabs(f)<1.0E-50) return(1);
50 *k=(y[1]-y[0])/f; *b=y[0]-(*k)*x[0];
51 *kSD=*bSD=*ySD=0.; *r=1.;
52 return(0);
53 }
54
55 /* Calculate (x,y) sums and means */
56 for(i=0; i<nr; i++) {
57 sumx+=x[i]; sumy+=y[i];
58 sumsx+=x[i]*x[i]; sumsy+=y[i]*y[i];
59 sumxy+=x[i]*y[i];
60 }
61 meanx=sumx/(double)nr;
62 meany=sumy/(double)nr;
63 /* and then based on means */
64 for(i=0; i<nr; i++) {
65 f=x[i]-meanx; sumsdx+=f*f;
66 g=y[i]-meany; sumsdy+=g*g;
67 sumdxdy+=f*g;
68 }
69 if(sumsdx<1.0e-50 || sumsdy<1.0e-50) return(3);
70 /* Regression coefficient */
71 kk=sumdxdy/sumsdx; *k=kk;
72 /* Intercept with y axis */
73 bb=(sumsdx*sumy - sumx*sumdxdy)/((double)nr*sumsdx); *b=bb;
74 /* Errors */
75 for(i=0; i<nr; i++) {
76 f=kk*x[i]+bb-y[i];
77 sumsdcy+=f*f;
78 }
79 /* Deviation of y values */
80 if(sumsdcy<=1.0e-12) e=0.0; else e=sqrt(sumsdcy/(double)(nr-2)); *ySD=e;
81 /* SD of slope and intercept */
82 ke=e/sqrt(sumsdx); be=e/sqrt((double)nr-sumx*sumx/sumsx);
83 *kSD=ke; *bSD=be;
84 be=sqrt(sumsdcy/(double)(nr-2))/sqrt((double)nr-(sumx*sumx)/sumsx);
85 /* Pearson's correlation coefficient */
86 rr=(sumxy-((sumx*sumy)/(double)nr)) /
87 sqrt((sumsx-sumx*sumx/(double)nr)*(sumsy-sumy*sumy/(double)nr));
88 /* Correct for small sample size */
89 if(nr>4) rr*=1.0+(1.0-rr*rr)/(double)(2*(nr-4));
90 *r=rr;
91 if(0) {
92 fprintf(stdout, "k=%14.7e +- %14.7e\n", kk, ke);
93 fprintf(stdout, "b=%14.7e +- %14.7e\n", bb, be);
94 fprintf(stdout, "r=%14.7e ySD=%14.7e\n", rr, e);
95 fflush(stdout);
96 }
97
98 return(0);
99}

Referenced by best_logan_regr(), best_pearson(), dft_end_line(), extrapolate_monoexp(), pearson2(), pearson3(), and pearson4().

◆ pearson2()

int pearson2 ( double * x,
double * y,
char * is,
int nr,
double * k,
double * kSD,
double * b,
double * bSD,
double * r,
double * ySD )

Calculate slope and intercept of a line and Pearson's correlation coefficient.

Array char is[] specifies whether single (x,y) points are used in the fit.

See also
regr_line, best_pearson, pearson, highest_slope
Returns
If successful, function returns value 0.
Parameters
xdata x values
ydata y values
isSwitch values: 0=do not use this point
nrnumber of data sample values
kslope
kSDS.D. of slope
by axis intercept
bSDS.D. of y axis intercept
rPearson's correlation coefficient, r
ySDResidual variance of y values

Definition at line 109 of file pearson.c.

130 {
131 int i, j;
132 double *nx, *ny;
133
134 /* Allocate memory for new data pointers */
135 nx=(double*)calloc(nr, sizeof(double)); if(nx==NULL) return 1;
136 ny=(double*)calloc(nr, sizeof(double)); if(ny==NULL) {free((char*)nx); return 1;}
137
138 /* Copy data to pointers */
139 for(i=0, j=0; i<nr; i++) if(is[i]) {nx[j]=x[i]; ny[j]=y[i]; j++;}
140
141 /* Use pearson() */
142 i=pearson(nx, ny, j, k, kSD, b, bSD, r, ySD); free((char*)nx); free((char*)ny);
143 return (i);
144}

◆ pearson3()

int pearson3 ( double * x,
double * y,
int nr,
double * k,
double * kSD,
double * b,
double * bSD,
double * r,
double * ySD )

Calculate slope and intercept of a line and Pearson's correlation coefficient.

Data points may contain NaN's.

See also
regr_line, pearson, best_pearson, highest_slope
Returns
If successful, function returns value 0.
Parameters
xdata x values
ydata y values
nrnumber of data sample values
kslope
kSDS.D. of slope
by axis intercept
bSDS.D. of y axis intercept
rPearson's correlation coefficient, r
ySDResidual variance of y values

Definition at line 154 of file pearson.c.

173 {
174 int i, j;
175 double *nx, *ny;
176
177 /* Allocate memory for new data pointers */
178 nx=(double*)calloc(nr, sizeof(double)); if(nx==NULL) return 1;
179 ny=(double*)calloc(nr, sizeof(double)); if(ny==NULL) {free((char*)nx); return 1;}
180
181 /* Copy data to pointers */
182 for(i=0, j=0; i<nr; i++)
183 if(!isnan(x[i]) && !isnan(y[i])) {nx[j]=x[i]; ny[j]=y[i]; j++;}
184
185 /* Use pearson() */
186 i=pearson(nx, ny, j, k, kSD, b, bSD, r, ySD); free((char*)nx); free((char*)ny);
187 return (i);
188}

◆ pearson4()

int pearson4 ( double * x,
double * y,
int nr,
double start,
double end,
double * k,
double * kSD,
double * b,
double * bSD,
double * r,
double * ySD )

Calculate slope and intercept of a line and Pearson's correlation coefficient.

Data points may contain NaN's. Fit start and end times are specified.

See also
regr_line, best_pearson, highest_slope
Returns
If successful, function returns value 0.
Parameters
xdata x values
ydata y values
nrnumber of data sample values
startfit start time
endfit end time
kslope
kSDS.D. of slope
by axis intercept
bSDS.D. of y axis intercept
rPearson's correlation coefficient, r
ySDResidual variance of y values

Definition at line 198 of file pearson.c.

221 {
222 int i, j;
223 double *nx, *ny;
224
225 if(0) {
226 fprintf(stdout, "pearson4(x[], y[], %d, %g, %g, *k, *kSD, *b, *bSD, *r, *ySD)\n", nr,start,end);
227 fflush(stdout);
228 }
229 /* Allocate memory for new data pointers */
230 if((nx=(double*)calloc(nr, sizeof(double)))==NULL) return -3;
231 if((ny=(double*)calloc(nr, sizeof(double)))==NULL) {free((char*)nx); return -3;}
232
233 /* Copy data to pointers */
234 for(i=0, j=0; i<nr; i++)
235 if(x[i]>=start && x[i]<=end && !isnan(x[i]) && !isnan(y[i])) {
236 nx[j]=x[i]; ny[j]=y[i]; j++;}
237
238 /* Use pearson() */
239 i=pearson(nx, ny, j, k, kSD, b, bSD, r, ySD);
240 free((char*)nx); free((char*)ny);
241 return i;
242}

◆ regr_line()

int regr_line ( double * x,
double * y,
int n,
double * m,
double * c )

Calculates regression line slope (m) and y axis intercept.

Data (x and y data) may contain NaN's.

See also
highest_slope, pearson, imgsegmPearson
Returns
Returns 0 if ok.
Parameters
xAn array of x axis values.
yAn array of y axis values.
nThe number of values in x and y arrays.
mPointer where calculated slope is written.
cPointer where calculated y axis intercept is written.

Definition at line 387 of file pearson.c.

398 {
399 double xsum=0.0, ysum=0.0, x2sum=0.0, xysum=0.0, delta;
400 int i, nn=0;
401
402 /* Check the data */
403 if(x==NULL || y==NULL) return(1);
404 if(n<2) return(2);
405
406 /* Compute */
407 for(i=0, nn=0; i<n; i++) {
408 if(isnan(x[i]) || isnan(y[i])) continue;
409 xsum+=x[i]; ysum+=y[i]; x2sum+=x[i]*x[i]; xysum+=x[i]*y[i]; nn++;
410 }
411 if(nn<2) return(2);
412 delta=(double)nn*x2sum - xsum*xsum; if(delta==0.0) return(3);
413 *m=((double)nn*xysum - xsum*ysum)/delta;
414 *c=(x2sum*ysum - xsum*xysum)/delta;
415 return(0);
416}

Referenced by highest_slope(), and highest_slope_after().