# LDRseq.R : A version of LDR for matrices too large to fit in memory
# all at once. Based on the paper below.
#
# Please cite:
#
# Qaqish, B. F., O'Brien, J. J., Hibbard, J. C., Clowers, K. J. (2017).
# Accelerating high-dimensional clustering with lossless data reduction.
# BIOINFORMATICS, Volume: 33, Issue: 18, Pages: 2867-2872.
# DOI: 10.1093/bioinformatics/btx328

# Requires Rtools and Rcpp (compiles the updater below at source time).
library(Rcpp)

# xp: fold rows xx .. (z - 1) of the data block MMG into the existing
# a x a upper-triangular factor IC using Givens rotations, one row at a
# time. Returns the updated triangle.
#
# Inputs:
#   MMG - data block (z x a data.frame, z >> a)
#   IC  - current R factor from a QR of earlier rows (a x a)
#   a   - number of columns
#   z   - number of rows in MMG
#   xx  - first row of MMG to process (rows before xx are assumed
#         already folded into IC)
#
# NOTE(review): `NumericMatrix GIJ = IC;` is a shallow Rcpp copy, so IC
# is updated in place as well; callers here always reassign the result,
# so this aliasing is harmless.
cppFunction('
NumericMatrix xp(const DataFrame &MMG, NumericMatrix &IC,
                 int &a, int &z, int &xx) {
  NumericMatrix GIJ = IC;
  double u, v, x, ghi, ghj;
  NumericVector JC(a), JCT(z);
  for (int i = xx; i < z; ++i) {
    // Gather row i of the block into the working vector JC.
    for (int k = 0; k < a; ++k) {
      JCT = MMG[k];           // shallow column view, no deep copy
      JC[k] = JCT[i];
    }
    // Zero out JC against the triangle with one Givens rotation per column.
    for (int j = 0; j < a; ++j) {
      u = GIJ(j, j);
      v = JC[j];
      x = sqrt(u * u + v * v);
      u /= x;
      v /= x;
      for (int k = j; k < a; ++k) {
        ghi = JC[k];
        ghj = GIJ(j, k);
        JC[k]     = -v * ghj + u * ghi;
        GIJ(j, k) =  u * ghj + v * ghi;
      }
    }
  }
  return (GIJ);
}
')

#' Sequential lossless data reduction: R factor of a tall matrix.
#'
#' Computes the a x a upper-triangular R factor of qr(dat) without
#' holding more than one block of rows in memory at once. The first
#' block is factored with base qr(); each later block is folded into
#' the running triangle via the compiled Givens updater xp().
#'
#' @param dat Tall numeric data (data.frame or matrix), nrow >> ncol.
#' @param blockSize Rows per block; must be at least ncol(dat) so the
#'   initial QR yields a full a x a triangle.
#' @return The a x a upper-triangular R factor of dat.
LDRseq <- function(dat, blockSize = 100000) {
  n_c <- ncol(dat)
  n_r <- nrow(dat)
  stopifnot(n_r >= 1, n_c >= 1, blockSize >= n_c)

  n_blocks <- ceiling(n_r / blockSize)
  n_last <- n_r %% blockSize
  if (n_last == 0) {
    n_last <- blockSize
  }

  # R factor of the first block (rows 1 .. min(blockSize, n_r)).
  first_rows <- seq_len(min(blockSize, n_r))
  R_ <- qr.R(qr(as.matrix(dat[first_rows, , drop = FALSE])))

  # FIX: the original built ep/sp via (1:(n_blocks - 1)) and looped
  # for (i in 2:n_blocks) even when n_blocks == 1, which produced the
  # bogus sequences c(1, 0) and c(2, 1). Guard the update phase.
  if (n_blocks > 1) {
    # Start/end row indices for each block.
    ep <- seq_len(n_blocks - 1) * blockSize
    ep <- c(ep, ep[n_blocks - 1] + n_last)
    sp <- (seq_len(n_blocks) - 1) * blockSize + 1

    for (i in 2:n_blocks) {
      # FIX: the original `dat[sp[i]:ep[i]]` was missing the row comma,
      # so on a data.frame it selected COLUMNS, not rows. as.data.frame
      # keeps the call valid whether dat is a matrix or a data.frame
      # (xp takes an Rcpp DataFrame).
      block <- as.data.frame(dat[sp[i]:ep[i], , drop = FALSE])
      nRB <- nrow(block)
      R_ <- xp(block, R_, n_c, nRB, 0)
    }
  }
  R_
}