/** @file qrlsq.c
 *  @brief QR decomposition for solving least squares problems.
 *  
 *  @author Kaisa Liukko, Vesa Oikonen
 */
/*****************************************************************************/
#include "tpcclibConfig.h"
/*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/*****************************************************************************/
#include "tpcextensions.h"
/*****************************************************************************/
#include "tpclinopt.h"
/*****************************************************************************/

/*****************************************************************************/
/// @cond
/* Local functions */
int qr_get_next_col(
  double **mat, const unsigned int rows, const unsigned int cols,
  unsigned int row_pos, unsigned int *p, unsigned int *max_loc
);
void qr_swap_cols(
  unsigned int *p, const unsigned int i, const unsigned int j
);
void qr_householder(
  double **mat, const unsigned int rows, const unsigned int cols, 
  const unsigned int row_pos, const unsigned int col_pos, double *result
);
void qr_apply_householder(
  double **mat, double *rhs, const unsigned int rows, const unsigned int cols, 
  double *house, const unsigned int row_pos, unsigned int *p
);
void qr_back_solve(
  double **mat, double *rhs, const unsigned int rows, const unsigned int cols, 
  double *sol, unsigned int *p
);



int qr_QTvec(
  int M, int N, double **QR, double *tau, double *v, double *help
);
int qr_Qvec(
  int M, int N, double **QR, double *tau, double *v, double *help
);
int qr_invRvec(
  int N, double **A, double *X
);
/// @endcond
/*****************************************************************************/

/*****************************************************************************/
/** @brief QR least-squares solving routine.
 
  Solves a matrix form least square problem 
  min||A x - b|| => A x =~ b     (A is m*n matrix, m>=n)
  using the QR decomposition for overdetermined systems
  
  @author: Michael Mazack
  License: Public Domain. Redistribution and modification without 
  restriction is granted. If you find this code helpful, please let the
  author know (http://mazack.org).
  Small editions by Vesa Oikonen when added to tpcclib.
  @sa nnls, qr
  @todo Check that modification in one subroutine is ok, and add possibility
  to provide working space for the main and sub-functions.
  @return Returns 0 if ok.
 */
int qrLSQ(
  /** Pointer to the row-major matrix (2D array), A[rows][cols]; not modified. */
  double **mat, 
  /** Vector b[rows]; modified to contain the computed (fitted) b[], that is,
      matrix-vector product A*x. */
  double *rhs, 
  /** Solution vector x[cols]; solution x corresponds to the solution of both 
      the modified and original systems A and B. */ 
  double *sol, 
  /** Number of rows (samples) in matrix A and length of vector B. */
  const unsigned int rows, 
  /** Number of cols (parameters) in matrix A and length of vector X. */
  const unsigned int cols,
  /** Pointer to value where R^2, the difference between original and computed
      right hand side (b); enter NULL if not needed. */
  double *r2
) {
  if(rows<1 || cols<1) return(1);
  if(mat==NULL || rhs==NULL || sol==NULL) return(2);

  unsigned int i, j, max_loc;
  unsigned int *p=NULL;
  double *buf=NULL, *ptr, *v=NULL, *orig_b=NULL, **A=NULL;

  /* Allocate memory for index vector and Householder transform vector */
  /* and original data matrix and vector. */
  p=malloc(sizeof(unsigned int)*cols);
  A=malloc(sizeof(double*)*rows);
  buf=malloc(sizeof(double)*(rows*cols+rows+rows));
  if(p==NULL || A==NULL || buf==NULL) {
    free(p); free(A); free(buf); 
    return(3);
  }
  ptr=buf; 
  for(i=0; i<rows; i++) {A[i]=ptr; ptr+=cols;}
  v=ptr; ptr+=rows; orig_b=ptr;
  /* copy the original data */
  for(i=0; i<rows; i++) {
    orig_b[i]=rhs[i];
    for(j=0; j<cols; j++) A[i][j]=mat[i][j];
  }

  /* Initial permutation vector. */
  for(i=0; i<cols; i++) p[i]=i;
  
  /* Apply rotators to make R and Q'*b */
  for(i=0; i<cols; i++) {
    if(qr_get_next_col(A, rows, cols, i, p, &max_loc)==0)
      qr_swap_cols(p, i, max_loc);
    qr_householder(A, rows, cols, i, p[i], v);
    qr_apply_householder(A, rhs, rows, cols, v, i, p);
  }

  /* Back solve Rx = Q'*b */
  qr_back_solve(A, rhs, rows, cols, sol, p);

  /* Compute fitted b[] (matrix-vector product A*x) using non-modified A */
  for(i=0; i<rows; i++) {
    rhs[i]=0.0;
    for(j=0; j<cols; j++) rhs[i]+=mat[i][j]*sol[j];
  }

  /* Compute R^2, if requested */
  if(r2!=NULL) {
    double ss=0, d;
    for(i=0; i<rows; i++) {
      d=orig_b[i]-rhs[i];
      ss+=d*d;
    }
    *r2=ss;
  }

  /* Collect garbage. */
  free(p); free(A); free(buf);
  return(0);
}

/// @cond
/** Subroutine for qrLSQ().
 *  @return Returns 0 if max_loc was found, otherwise <>0. 
 */
#if(1) // Version without memory allocation by VO
int qr_get_next_col(
  /** Pointer to the row-major matrix (2D array), A[rows][cols]. */
  double **mat, 
  /** Number of rows (samples) in matrix A and length of vector B. */
  const unsigned int rows, 
  /** Number of cols (parameters) in matrix A and length of vector X. */
  const unsigned int cols,
  /** Matrix row. */
  const unsigned int row_pos, 
  /** Permutation vector of size cols. */
  unsigned int *p,
  /** Max column. */
  unsigned int *max_loc
) {
  if(mat==NULL || rows<1 || cols<1 || p==NULL) return(-1);

  unsigned int i, j, maxloc;
  double col_norm, max_norm;

  // Compute the norms of the sub columns and find the maximum
  max_norm=DBL_MIN; maxloc=0;
  for(j=0; j<cols; j++) {
    col_norm=0;
    for(i=row_pos; i<rows; i++) col_norm+=mat[i][p[j]]*mat[i][p[j]];
    if(col_norm>max_norm) {max_norm=col_norm; maxloc=j;}
  }
  if(max_norm!=DBL_MIN) {
    if(max_loc!=NULL) *max_loc=maxloc;  
    return(0); 
  }
  return(1);
}

#else

int qr_get_next_col(
  /** Pointer to the row-major matrix (2D array), A[rows][cols]. */
  double **mat, 
  /** Number of rows (samples) in matrix A and length of vector B. */
  const unsigned int rows, 
  /** Number of cols (parameters) in matrix A and length of vector X. */
  const unsigned int cols,
  /** Matrix row. */
  const unsigned int row_pos, 
  /** Permutation vector of size cols. */
  unsigned int *p,
  /** Max column. */
  unsigned int *max_loc
) {
  if(mat==NULL || rows<1 || cols<1 || p==NULL) return(-1);

  unsigned int i, j, maxloc;
  double *col_norms;
  double max;

  col_norms=malloc(sizeof(double)*cols);

  // Compute the norms of the sub columns.
  for(j=0; j<cols; j++) {
    col_norms[j]=0;
    for(i=row_pos; i<rows; i++) col_norms[j]+=mat[i][p[j]]*mat[i][p[j]];
  }

  // Find the maximum location.
  max=DBL_MIN; maxloc=0;
  for(i=0; i<cols; i++)
    if(col_norms[i]>max) {max=col_norms[i]; maxloc=i;}

  // Collect garbage and return.
  free(col_norms);
  if(max!=DBL_MIN) {
    if(max_loc!=NULL) *max_loc=maxloc;  
    return(0); 
  }
  return(1);
}
#endif

/** Subroutine for qrLSQ() */
void qr_swap_cols(unsigned int *p, const unsigned int i, const unsigned int j)
{
  unsigned int temp;
  temp=p[i]; p[i]=p[j]; p[j]=temp;
}

/** Subroutine for qrLSQ() */
void qr_householder(
  /** Pointer to the row-major matrix (2D array), A[rows][cols]; modified. */
  double **mat, 
  /** Number of rows (samples) in matrix A and length of vector B. */
  const unsigned int rows, 
  /** Number of cols (parameters) in matrix A and length of vector X. */
  const unsigned int cols,
  /** Row position. */
  const unsigned int row_pos,
  /** Column position. */ 
  const unsigned int col_pos, 
  /** Pointer to result vector of size rows. */
  double *result
) {
  if(rows<1 || cols<1 || mat==NULL || result==NULL) return;

  unsigned int i;
  double norm=0.0;

  for(i=row_pos; i<rows; i++) norm+=mat[i][col_pos]*mat[i][col_pos];
  if(norm==0.0) return;

  norm=sqrt(norm);
  result[0]=(mat[row_pos][col_pos]-norm);
  for(i=1; i<(rows-row_pos); i++) result[i]=mat[i+row_pos][col_pos];

  norm=0.0;
  for(i=0; i<(rows-row_pos); i++) norm+=result[i]*result[i];
  if(norm==0) return;

  norm=sqrt(norm);
  for(i=0; i<(rows-row_pos); i++) result[i]*=(1.0/norm);
}

/** Subroutine for qrLSQ() */
void qr_apply_householder(
  /** Pointer to the row-major matrix (2D array), A[rows][cols]; modified. */
  double **mat,
  /** Vector b[rows]; modified. */
  double *rhs,
  /** Number of rows (samples) in matrix A and length of vector B. */
  const unsigned int rows,
  /** Number of cols (parameters) in matrix A and length of vector X. */
  const unsigned int cols, 
  /** Pointer to vector of size rows. */
  double *house,
  /** Row position. */
  const unsigned int row_pos,
  /** Pointer to integer vector of size cols. */
  unsigned int *p
) {
  if(mat==NULL || rhs==NULL || rows<1 || cols<1 || house==NULL || p==NULL) return;

  unsigned int i, j, k, n;
  double sum;
  double **hhmat;
  double **mat_cpy;
  double *rhs_cpy;

  // Get the dimensions for the Q matrix.
  n=rows-row_pos;

  // Allocate memory.
  hhmat=malloc(sizeof(double*)*n);
  for(i=0; i<n; i++) hhmat[i]=malloc(sizeof(double)*n);
  mat_cpy=malloc(sizeof(double*)*rows);
  for(i=0; i<rows; i++) mat_cpy[i]=malloc(sizeof(double)*cols);
  rhs_cpy=malloc(sizeof(double)*rows);
  
  // Copy the matrix.
  for(i=0; i<rows; i++)
    for(j=0; j<cols; j++)
      mat_cpy[i][j]=mat[i][j];

  // Copy the right hand side.
  for(i=0; i<rows; i++) rhs_cpy[i]=rhs[i];

  // Build the Q matrix from the Householder transform.
  for(j=0; j<n; j++)
    for(i=0; i<n; i++)
      if(i!=j) hhmat[i][j]=-2.0*house[j]*house[i];
      else  hhmat[i][j]=1.0-2.0*house[j]*house[i];

  // Multiply by the Q matrix.
  for(k=0; k<cols; k++)
    for(j=0; j<n; j++) {
      sum=0.0;
      for(i=0; i<n; i++) sum+=hhmat[j][i]*mat_cpy[i+row_pos][p[k]];
      mat[j+row_pos][p[k]]=sum;
    }

  // Multiply the rhs by the Q matrix.
  for(j=0; j<n; j++) {
    sum=0.0;
    for(i=0; i<n; i++) sum+=hhmat[i][j]*rhs_cpy[i+row_pos];
    rhs[j+row_pos]=sum;
  }

  // Collect garbage.
  for(i=0; i<(rows-row_pos); i++) free(hhmat[i]);
  for(i=0; i<rows; i++) free(mat_cpy[i]);
  free(hhmat);
  free(mat_cpy);
  free(rhs_cpy);
}

/** Subroutine for qrLSQ() */
void qr_back_solve(
  /** Pointer to the row-major matrix (2D array), A[rows][cols]; modified. */
  double **mat, 
  /** Vector b[rows]; modified */
  double *rhs, 
  /** Number of rows (samples) in matrix A and length of vector B. */
  const unsigned int rows,
  /** Number of cols (parameters) in matrix A and length of vector X. */
  const unsigned int cols, 
  /** Solution vector x[cols]. */ 
  double *sol, 
  /** Pointer to integer vector of size cols */
  unsigned int *p
) {
  if(mat==NULL || rhs==NULL || rows<1 || cols<1 || sol==NULL || p==NULL)
    return;

  unsigned int i, j, bottom;
  double sum;

  /* Fill the solution with zeros initially. */
  for(i=0; i<cols; i++) sol[i]=0.0;

  /* Find the first non-zero row from the bottom and start solving from here. */
  i=rows-1; bottom=i;
  while(1) {
    if(fabs(mat[i][p[cols-1]]) > 2.0*DBL_MIN) {bottom=i; break;}
    if(i==0) break; else i--;
  } 

  /* Standard back solving routine starting at the first non-zero diagonal. */
  i=bottom;
  while(1) {
    sum=0.0;
    j=cols-1;
    while(1) {
      if(j>i) sum+=sol[p[j]]*mat[i][p[j]];
      if(j==0) break; else j--;
    }
    if(i<cols) { // Added by VO
      if(mat[i][p[i]] > 2.0*DBL_MIN) sol[p[i]]=(rhs[i]-sum)/mat[i][p[i]];
      else sol[p[i]]=0.0;
    }
    if(i==0) break; else i--;
  }
}
/// @endcond
/*****************************************************************************/

/*****************************************************************************/
/** Algorithm QR.
 * 
 * Solves a matrix form least square problem 
 * min||A x - b|| => A x =~ b     (A is m*n matrix, m>=n)
 * using the QR decomposition for overdetermined systems.
 * Based on GNU Scientific Library, edited by Kaisa Liukko.
 *
 * Instead of pointers for working space, NULL can be given to let this
 * function to allocate and free the required memory.
 * @sa qrLSQ, nnls, qr_decomp
 *
 * @return Function returns 0 if successful and 1 in case of invalid problem
 *         dimensions or memory allocation error.
 */
int qr(
  /** On entry, a[m][n] contains the m by n matrix A.
      On exit, a[][] contains the QR factorization. */
  double **A,
  /** Dimensions of matrix A are a[m][n]. */
  int m,
  /** Dimensions of matrix A are a[m][n]. */
  int n,
  /** B[] is an m-size vector containing the right-hand side vector b. */
  double *B,
  /** On exit, x[] will contain the solution vector x (size of n). */
  double *X,
  /** On exit, rnorm (pointer to double) contains the squared Euclidean norm of
      the residual vector (R^2); enter NULL if not needed. */
  double *rnorm,
  /** On exit, tau[] will contain the householder coefficients (size of n);
      enter NULL, if not needed. */
  double *tau,
  /** An m-size array of working space, res[]. On output contains
      residual b - Ax. Enter NULL to let qr() to handle it. */
  double *res,
  /** m*n array of working space. Enter NULL to let qr() to handle it. */ 
  double **wws,
  /** 2m-array of working space. Enter NULL to let qr() to handle it. */
  double *ws
) {
  int i;
  double *qrRes, *qrTau, **qrWws, *qrWs, *chain;

  /* Check the parameters and data */
  if(m<=0 || n<=0 || A==NULL || B==NULL || X==NULL) return(1);
  if(m<n) return(1);

  /* Allocate memory for working space, if required */
  if(tau!=NULL) qrTau=tau; else qrTau=(double*)calloc(n, sizeof(double));
  if(res!=NULL) qrRes=res; else qrRes=(double*)calloc(m, sizeof(double));
  if(wws!=NULL) {
    qrWws=wws; chain=(double*)NULL;
  } else {
    qrWws=(double**)malloc(m * sizeof(double*));
    chain=(double*)malloc(m*n * sizeof(double));
    for(i=0; i<m; i++) qrWws[i]=chain + i*n;
  }
  if(ws!=NULL) qrWs=ws; else qrWs=(double*)calloc(2*m, sizeof(double));
  if(qrTau==NULL || qrRes==NULL || qrWws==NULL || qrWs==NULL) return(1);

  /* Form the householder decomposition and solve the least square problem */
  if(qr_decomp(A, m, n, qrTau, qrWws, qrWs)) return(2);
  if(qr_solve(A, m, n, qrTau, B, X, qrRes, rnorm, qrWws, qrWs)) return(3);

  /* Free working space, if it was allocated here */
  //if(tau==NULL || res==NULL || wws==NULL || ws==NULL) printf("free in qr()\n");
  if(tau==NULL) free(qrTau);
  if(res==NULL) free(qrRes);
  if(wws==NULL) {free(qrWws); free(chain);}
  if(ws==NULL) free(qrWs);
  for(i=0; i<n; i++) if(isnan(X[i])) return(4);
  return(0);
} /* qr */
/*****************************************************************************/

/*****************************************************************************/
/** Factorise a general M x N matrix A into A = Q R ,
    where Q is orthogonal (M x M) and R is upper triangular (M x N).

    Q is stored as a packed set of Householder vectors in the strict lower triangular 
    part of the input matrix A and a set of coefficients in vector tau.

    R is stored in the diagonal and upper triangle of the input matrix.

    The full matrix for Q can be obtained as the product Q = Q_1 Q_2 .. Q_k 
    and it's transform as the product Q^T = Q_k .. Q_2 Q_1 , where k = min(M,N) and
    Q_i = (I - tau_i * h_i * h_i^T)
    and where h_i is a Householder vector h_i = [1, A(i+1,i), A(i+2,i), ... , A(M,i)].
    This storage scheme is the same as in LAPACK.  
  
    NOTICE! The calling program must take care that pointer tau is 
    of size N or M, whichever is smaller. 

   @sa qr
   @return Function returns 0 if ok.
 */
int qr_decomp(
  /** contains coefficient matrix A (m*n) as input and factorisation QR as output. */
  double **a,
  /** nr of rows in matrix A. */
  int M,
  /** nr of columns in matrix A. */
  int N,
  /** Vector for householder coefficients, of length N or M, whichever is smaller. */
  double *tau,
  /** m*n array of working space. */
  double **cchain,
  /** m size array of working space. */
  double *chain
) {
  //printf("qr_decomp()\n");
  int i, m, n, MNmin;
  double *subvector, **submatrix;

  /* Local variables */
  if(M<N) MNmin=M; else MNmin=N;
  if(MNmin<1 || a==NULL || tau==NULL || cchain==NULL || chain==NULL) return(1);
  subvector=chain;
  submatrix=cchain;

  for(i=0; i<MNmin; i++) {
    //printf("i=%d (MNmin=%d)\n", i, MNmin);
    /* Compute the Householder transformation to reduce the j-th column of the matrix A to 
       a multiple of the j-th unit vector. Householder vector h_i is saved in the lower triangular 
       part of the column and Householder coefficient tau_i in the vector tau. */
    for(m=i; m<M; m++) {
      //printf("subvector[%d]=a[%d][%d]\n", m-1, m, i);
      subvector[m-i]=a[m][i];
    }
    tau[i] = householder_transform(subvector, M-i);
    for(m=i; m<M; m++) a[m][i]=subvector[m-i];

    /* Apply the transformation to the remaining columns to get upper triangular part of matrix R  */
    if(i+1 < N) {
      printf("     apply transformation\n");
      for(m=i; m<M; m++) for(n=i+1; n<N; n++) {
        if((m-i)<0 || (m-i)>=M || (n-i-1)<0 || (n-i-1)>=N) printf("OVERFLOW!\n"); 
        submatrix[m-i][n-i-1]=a[m][n];
      }
      if(householder_hm(tau[i], subvector, submatrix, M-i, N-i)) return(2);
      for(m=i; m<M; m++) for(n=i+1; n<N; n++)
        a[m][n]=submatrix[m-i][n-i-1];
    }
  }

  return(0);
}
/*****************************************************************************/

/*****************************************************************************/
/** Find the least squares solution to the overdetermined system
 
     A x = b 
    
   for m >= n using the QR factorisation A = Q R. 
   qr_decomp() must be used prior to this function in order to form
   the QR factorisation of A.
   Solution is formed in the following order: 
   QR x = b  =>  R x = Q^T b  =>  x = R^-1 (Q^T b)
   
   NOTICE! The calling program must take care that pointers b, x
   and residual are of the right size.

   @sa qr, qr_decomp
   @return Function returns 0 if ok.
 */
int qr_solve(
  /** m*n matrix containing householder vectors of A. */
  double **QR,
  /** nr of rows in matrix A. */
  int M,
  /** nr of columns in matrix A. */
  int N,
  /** vector containing householder coefficients tau; length is M or N, whichever is smaller. */
  double *tau,
  /** Contains m-size vector b of A x = b. */
  double *b,
  /** solution vector x of length n. */
  double *x,
  /** residual vector of length m. */
  double *residual,
  /** norm^2 of the residual vector; enter NULL if not needed. */
  double *resNorm,
  /** m*n array of the working space. */
  double **cchain,
  /** 2m length array of the working space. */ 
  double *chain
) {
  //printf("qr_solve()\n");
  if(QR==NULL || tau==NULL || b==NULL || x==NULL || residual==NULL) return(1);
  if(cchain==NULL || chain==NULL) return(1);
  if(M<1 || N<1) return(2);

  int m, n;
  double **Rmatrix;

  /* Local variable */
  Rmatrix=cchain;

  /* Get matrix R from the upper triangular part of QR
     First the rows N - M-1 are eliminated from R matrix*/
  for(m=0; m<N; m++) for(n=0; n<N; n++) Rmatrix[m][n]=QR[m][n];
  for(m=0; m<M; m++) residual[m]=b[m];

  /* Compute b = Q^T b */
  if(qr_QTvec(M, N, QR, tau, residual, chain)) return(3);

  /* Solve R x = b by computing x = R^-1 b */
  for(n=0; n<N; n++) x[n]=residual[n];
  if(qr_invRvec(N, Rmatrix, x)) return(4);

  /* Compute residual = b - A x = Q (Q^T b - R x) */
  for(n=0; n<N; n++) residual[n]=0.0;
  /* Compute residual= Q*residual */
  if(qr_Qvec(M, N, QR, tau, residual, chain)) return(4);

  /* Compute norm^2 of the residual vector, if needed */
  if(resNorm!=NULL)
    for(m=0, *resNorm=0.0; m<M; m++) *resNorm +=residual[m]*residual[m];

  return(0);
}
/*****************************************************************************/

/*****************************************************************************/
/// @cond
/** Form the product Q^T v  from householder vectors saved in the lower triangle of 
    QR matrix and householder coefficients saved in vector tau.
    @return Returns 0 if ok. 
 */
int qr_QTvec(
  /** Dimensions of matrix are QR[m][n]. */
  int M,
  /** Dimensions of matrix are QR[m][n]. */
  int N,
  /** Matrix QR[m][n]. */
  double **QR, 
  /* Vector tau of length M or N, which is smaller. */
  double *tau, 
  /** Vector v of size M. */
  double *v, 
  /** Working memory vector of length 2*M. */
  double *help
) {
  //printf("qr_QTvec()\n");
  int i, m, MNmin;
  double *h, *w;

  /* Local variables */
  if(M<N) MNmin=M; else MNmin=N;
  if(MNmin<1 || QR==NULL || tau==NULL || v==NULL || help==NULL) return(1);
  h=help; w=help+M;

  /* compute Q^T v */
  for(i=0; i<MNmin; i++) {
    for(m=i; m<M; m++) {
      if((m-i)<0 || (m-i)>=M || (i)<0 || (i)>=N) printf("OVERFLOW!\n"); 
      h[m-i]=QR[m][i]; 
      w[m-i]=v[m];
    }
    if(householder_hv(tau[i], M-i, h, w)) return(2);
    for(m=i; m<M; m++) v[m]=w[m-i];
  }
  return(0);
}
/*****************************************************************************/

/*****************************************************************************/
/** Form the product Q v from householder vectors saved in the lower triangle
    of QR matrix and householder coefficients saved in vector tau.
    @return Returns 0 if ok. 
 */
int qr_Qvec(
  /** Dimensions of matrix are QR[m][n]. */
  int M,
  /** Dimensions of matrix are QR[m][n]. */
  int N,
  /** Matrix QR[m][n]. */
  double **QR, 
  /* Vector tau of length M or N, which is smaller. */
  double *tau, 
  /* Vector v of length M. */
  double *v, 
  /** Working memory vector of length 2*M. */
  double *help
) {
  //printf("qr_Qvec()\n");
  int i, m, MNmin;
  double *h, *w;

  /* Local variables */
  if(M<N) MNmin=M; else MNmin=N;
  if(MNmin<1 || QR==NULL || tau==NULL || v==NULL || help==NULL) return(1);
  h=help; w=help+M;

  /* compute Q v */
  for(i=MNmin-1; i>=0; i--) {
    for(m=i; m<M; m++) {
      if((m-i)<0 || (m-i)>=M || (i)<0 || (i)>=N) printf("OVERFLOW!\n"); 
      h[m-i]=QR[m][i]; 
      w[m-i]=v[m];
    }
    if(householder_hv(tau[i], M-i, h, w)) return(2);
    for(m=i; m<M; m++) v[m]=w[m-i];
  }
  return(0);
}
/*****************************************************************************/

/*****************************************************************************/
/** Form the product R^-1 v. R is saved in the upper triangle of QR matrix.
    @return Returns 0 if ok. 
 */
int qr_invRvec(
  /** Matrix and vector size N. */
  int N,
  /** Matrix of size NxN (at least). */
  double **A, 
  /** Vector of length N. */
  double *X
) {
  //printf("qr_invRvec()\n");
  int i, j;
  double tmp;

  if(N<1 || A==NULL || X==NULL) return(1);
  /* form  x := inv( A )*x */
  /* back-substitution */
  X[N-1] = X[N-1]/A[N-1][N-1];
  for(i=N-2; i>=0; i--) {
    for(j=i+1, tmp=X[i]; j<N; j++) {
      //if((i)<0 || (i)>=N || (j)<0 || (j)>=N) printf("OVERFLOW!\n"); 
      tmp-=A[i][j]*X[j];
    }
    X[i] = tmp/A[i][i];
  }
  return(0);
}
/*****************************************************************************/
/// @endcond

/*****************************************************************************/
/** Algorithm for weighting the problem that is given to QR algorithm.
    Square roots of weights are used because in QR the difference w*A-w*b is squared.
    @sa qrWeightRm, qrLSQ, qr
    @todo Add tests.
    @return Algorithm returns zero if successful, otherwise <>0.
*/
int qrWeight(
  /** Dimensions of matrix A are a[m][n]. */
  int N,
  /** Dimensions of matrix A are a[m][n]; size of vector B is m. */
  int M,
  /** Matrix a[m][n] for QR, contents will be weighted here. */
  double **A,
  /** B[] is an m-size vector for QR, contents will be weighted here. */
  double *b,
  /** Pointer to array of size m, which contains sample weights, used here
      to weight matrix A and vector b. */
  double *weight,
  /** m-sized vector for working space; enter NULL to allocate locally. */
  double *ws
) {
  int n, m;
  double *w;

  /* Check the arguments */
  if(N<1 || M<1 || A==NULL || b==NULL || weight==NULL) return(1);

  /* Allocate memory, if necessary */
  if(ws==NULL) {
    w=(double*)malloc(M*sizeof(double)); if(w==NULL) return(2);
  } else {
    w=ws;
  }

  /* Check that weights are not zero and get the square roots of them to w[]. */
  for(m=0; m<M; m++) {
    if(weight[m]<=1.0e-100) w[m]=1.0e-50;
    else w[m]=sqrt(weight[m]);
  }

  /* Multiply rows of matrix A and elements of vector b with weights. */
  for(m=0; m<M; m++) {
    for(n=0; n<N; n++) A[m][n]*=w[m];
    b[m]*=w[m];
  }

  if(ws==NULL) free(w);
  return(0);
}
/*****************************************************************************/

/*****************************************************************************/
/** Remove the weights from data provided to QR LSQ algorithms.
    @sa qrWeight, qrLSQ, qr
    @todo Add tests.
    @return Algorithm returns zero if successful, otherwise <>0.
*/
int qrWeightRm(
  /** Dimensions of matrix A are a[m][n]. */
  int N,
  /** Dimensions of matrix A are a[m][n]; size of vector B is m. */
  int M,
  /** Matrix a[m][n] for QR, weights will be removed here; enter NULL if not needed. */
  double **A,
  /** B[] is an m-size vector for QR, weights will be removed here;
      enter NULL if not needed here. */
  double *b,
  /** Pointer to array of size m, which contains sample weights, used here
      to weight matrix A and vector b. */
  double *weight,
  /** m-sized vector for working space; enter NULL to allocate locally */
  double *ws
) {
  int n, m;
  double *w;

  /* Check the arguments */
  if(N<1 || M<1 || weight==NULL) return(1);
  if(A==NULL && b==NULL) return(0); // nothing to do

  /* Allocate memory, if necessary */
  if(ws==NULL) {
    w=(double*)malloc(M*sizeof(double)); if(w==NULL) return(2);
  } else {
    w=ws;
  }

  /* Check that weights are not zero and get the square roots of them to w[] */
  for(m=0; m<M; m++) {
    if(weight[m]<=1.0e-100) w[m]=1.0e-50;
    else w[m]=sqrt(weight[m]);
  }

  /* Multiply rows of matrix A and elements of vector b with weights */
  for(m=0; m<M; m++) {
    if(A!=NULL) for(n=0; n<N; n++) A[m][n]/=w[m];
    if(b!=NULL) b[m]/=w[m];
  }

  if(ws==NULL) free(w);
  return(0);
}
/*****************************************************************************/

/*****************************************************************************/
/** Simplistic QR least-squares solving routine.

    Reference:
    Máté A. Introduction to Numerical Analysis with C programs.
    Chapter 38. Overdetermined systems of linear equations.
    Brooklyn College of the City University of New York, 2014.

   @sa qrLSQ
   @return Function returns 0 when successful.
 */
int qrSimpleLSQ(
  /** Matrix A[M][N]. */
  double **A,
  /** Vector B[M]. */
  double *B,
  /** nr of rows in matrix A, must be >=N. */
  int M,
  /** nr of columns in matrix A, must be <=M. */
  int N,
  /** Vector X of length N for the results. */
  double *X,
  /** Pointer to value where R^2, the difference between original and computed
      right hand side (b); enter NULL if not needed. */
  double *r2
) {
  //printf("qrSimpleLSQ()\n");
  if(A==NULL || B==NULL || X==NULL || N<1 || M<N) return(1);

  const double closeZero=1.0E-20;

  double cc[M], orig_B[M];
  double p[M][N], c[M][N];

  /* Householder transformation of matrix A into matrix P */
  for(int n=0; n<N; n++) {
    for(int nn=0; nn<n; nn++) p[nn][n]=0.0;
    p[n][n]=0.0;
    for(int m=n; m<M; m++) p[n][n]+=A[m][n]*A[m][n];
    p[n][n]=copysign(sqrt(p[n][n]), A[n][n]);
    p[n][n]+=A[n][n];
    if(fabs(p[n][n])<closeZero) return(2);
    for(int m=n+1; m<M; m++) p[m][n]=A[m][n];
    double norm=0.0;
    for(int m=n; m<M; m++) norm+=p[m][n]*p[m][n];
    norm=sqrt(norm);
    for(int m=n; m<M; m++) p[m][n]/=norm;
    for(int m=n; m<M; m++) {
      for(int nn=n; nn<N; nn++) {
        c[m][nn]=A[m][nn];
        for(int mm=n; mm<M; mm++) c[m][nn]-=2.0*p[m][n]*p[mm][n]*A[mm][nn];
      }
    }
    for(int m=n; m<M; m++)
      for(int nn=n; nn<N; nn++) A[m][nn]=c[m][nn];
  }

  /* Keep the original B[] for R^2 calculation */
  for(int m=0; m<M; m++) orig_B[m]=B[m];

  /* Compute P'b */
  for(int n=0; n<N; n++) {
    for(int m=n; m<M; m++) {
      cc[m]=B[m];
      for(int mm=n; mm<M; mm++) cc[m]-=2.0*p[m][n]*p[mm][n]*B[mm];
    }
    for(int m=n; m<M; m++) B[m]=cc[m];
  }

  /* Solve the linear system with backward substitution */
  X[N-1]=B[N-1]/A[N-1][N-1];
  for(int n=N-2; n>=0; n--) {
    X[n]=B[n];
    for(int nn=n+1; nn<N; nn++) X[n]-=A[n][nn]*X[nn];
    X[n]/=A[n][n];
  }

  /* Compute R^2, if requested */
  if(r2!=NULL) {
    double ss=0, d;
    for(int m=0; m<M; m++) {
      d=orig_B[m]-B[m];
      ss+=d*d;
    }
    *r2=ss;
  }

  return(0);
}
/*****************************************************************************/

/*****************************************************************************/
/** @brief Solve over-determined least-squares problem A x ~ b using
    successive Householder rotations.

    This routine is based on the text and Fortran code in
    C.L. Lawson and R.J. Hanson, Solving Least Squares Problems,
    Prentice-Hall, Englewood Cliffs, New Jersey, 1974,
    and Fortran code by R.L. Parker and P.B. Stark.

   @return Returns 0 when successful, 1 if system is singular, and 2 in case of
    other errors, including that system is under-determined.
*/
int qrLH(
  /** Number of samples in matrix A and the length of vector b. */
  const unsigned int m, 
  /** Number of parameters in matrix A and the length of vector x. 
      The n must be smaller or equal to m. */
  const unsigned int n,
  /** Pointer to matrix A; matrix must be given as an n*m array,
      containing n consecutive m-length vectors. 
      Contents of A are modified in this routine. */
  double *a,
  /** Pointer to vector b of length n.
      Contents of b are modified in this routine. */
  double *b,
  /** Pointer to the result vector x of length n. */
  double *x,
  /** Pointer to a double value, in where the sum of squared residuals is written. */
  double *r2
) {
  /* Check the input */
  if(a==NULL || b==NULL || x==NULL || r2==NULL) return(2);
  if(n<1 || m<n) {*r2=nan(""); return(2);}

  /* Initiate output to zeroes, in case of exit because of singularity */
  for(unsigned int ni=0; ni<n; ni++) x[ni]=0.0;
  *r2=0.0;

  /* Rotates matrix A into upper triangular form */
  for(unsigned int ni=0; ni<n; ni++) {
    /* Find constants for rotation and diagonal entry */
    double sq=0.0;
    for(unsigned int mi=ni; mi<m; mi++) sq+=a[mi + ni*m]*a[mi + ni*m];
    if(sq==0.0) return(1);
    double qv1=-copysign(sqrt(sq), a[ni + ni*m]);
    double u1=a[ni + ni*m] - qv1;
    a[ni + ni*m]=qv1;
    unsigned int ni1=ni+1;
    /*  Rotate the remaining columns of sub-matrix. */
    for(unsigned int nj=ni1; nj<n; nj++) {
      double dot=u1*a[ni + nj*m];
      for(unsigned int mi=ni1; mi<m; mi++)
        dot+=a[mi + nj*m] * a[mi + ni*m];
      double c=dot/fabs(qv1*u1);
      for(unsigned int mi=ni1; mi<m; mi++)
        a[mi + nj*m]-=c*a[mi + ni*m];
      a[ni + nj*m]-=c*u1;
    }
    /* Rotate vector B */
    double dot=u1*b[ni];
    for(unsigned int mi=ni1; mi<m; mi++)
      dot+=b[mi]*a[mi + ni*m];
    double c=dot/fabs(qv1*u1);
    b[ni]-=c*u1;
    for(unsigned int mi=ni1; mi<m; mi++)
      b[mi]-=c*a[mi + ni*m];
  } // end of rotation loop

  /* Solve triangular system by back-substitution. */
  for(unsigned int ni=0; ni<n; ni++) {
    int k=n-ni-1;
    double s=b[k];
    for(unsigned int nj=k+1; nj<n; nj++) 
      s-=a[k + nj*m] * x[nj];
    if(a[k + k*m]==0.0) return(1);
    x[k]=s/a[k + k*m];
  }

  /* Calculate the sum of squared residuals. */
  *r2=0.0;
  for(unsigned int mi=n; mi<m; mi++)
    *r2 += b[mi]*b[mi];

  return(0);
}
/*****************************************************************************/

/*****************************************************************************/
