* blex.sas * (adds: range restrictions on RHO due to marginal means) * xref: - Liang, K-Y. and Zeger, S. L. (1986) "Longitudinal data analysis using generalized linear models", Biometrika, vol 73, pp 13-22. - Zeger, S.L. and Liang, K-Y. (1986) "Longitudinal data analysis for discrete and continuous outcomes", Biometrics, vol 42, 121-130. - B034.SAS S060.SAS S063.SAS * does: - Analysis of correlated (including longitudinal) binary data (0/1 only) using generalized estimating equations (GEE). - Exchangeable correlation only. - Naive and robust var. est. - Optimized for the case where there is enough memory to hold the whole data set. - Memory requirements guidelines: Let N = # observations, p = # covariates, n = maximum cluster size. Memory use is dominated by the following 8 * (N*(p+6) + 2*n*p) bytes. e.g. N = 10,000, p=10, n=200 -> about 1.25 Megabytes. - Compatible with Karim's macro calls for: link, vari, corr. If specified, must have link=3, vari=3, corr=4. - output data sets if requested. - verifies all restrictions on rho (due to means, n); * By: - Bahjat Qaqish and Habib Moalem Department of Biostatistics CB 7400 Chapel Hill, NC 27599-7400 (919) 966-7271 qaqish@biostat.sph.unc.edu * Date: - Original version, February 1994 - Several later updates ***************************************************; %macro gee ( data=, /* required: dataset name */ yvar=, /* required: response 0/1 */ xvar=, /* required: covariates */ id=, /* required: cluster id */ offset=, /* offset */ link=3, /* logit link only */ vari=3, /* binary variance only */ corr=4, /* exchangeable correlation only */ epsilon=1.0e-4, /* criterion for beta convergence */ epsilon2=1.0e-2, /* criterion for rho convergence */ monitor=no, /* monitor convergence */ beta=, /* initial values */ rho=0, /* initial value */ fixrho=no, /* fixrho: no/yes */ fixbeta=no, /* fixbeta: no/yes */ outest=, /* estimtates and cov matrices dataset */ outs=, /* fitted values and residuals dataset */ conf=0.95, /* confidence intervals */ maxiter = 25 ); /* max. # iterations */ ***************************************************; * check required parameters before starting IML; %let err=0; %let fixrho=%upcase(&fixrho); %let fixbeta=%upcase(&fixbeta); %let monitor=%upcase(&monitor); %if %length(&data)=0 %then %do; %put E R R O R : data=dataset must be specified.; %let err=1; %end; %if %length(&yvar)=0 %then %do; %put E R R O R : yvar=response must be specified.; %let err=1; %end; %if %length(&xvar)=0 %then %do; %put E R R O R : xvar=covariates must be specified.; %let err=1; %end; %if %length(&id)=0 %then %do; %put E R R O R : id=identification number must be specified.; %let err=1; %end; %if (&link ^= 3) %then %do; %put E R R O R : link=3 is the only value allowed.; %let err=1; %end; %if (&vari ^= 3) %then %do; %put E R R O R : vari=3 is the only value allowed.; %let err=1; %end; %if (&corr ^= 4) %then %do; %put E R R O R : corr=4 is the only value allowed.; %let err=1; %end; %if &err=1 %then %goto stopit; ***************************************************; proc sort data=&data; by &id; run; * sort data by id; ***************************************************; proc summary data=&data nway; var &yvar; by &id; output out=_N_ (keep=n) n=n; run; ***************************************************; proc iml; ***************************************************; start signon; print "Generalized estimating equations for exchangeably- correlated binary responses (version 2.3)", "(c) Bahjat Qaqish and Habib Moalem (1994, 1995)", "Department of Biostatistics ", "CB 7400 ", "Chapel Hill, NC 27599-7400 ",,; finish signon; ***************************************************; start info (k, sum_n, min_n, max_n, n, p); sum_n = sum(n); max_n = max(n); min_n = min(n); k = nrow(n); print "Data set: &data", "Response: &yvar", "Covariates: &xvar", %if %length(&offset)^=0 %then %do; "Offset: &offset", %end; "Cluster ID: &id", "Number of observations:" sum_n, "Number of covariates:" p, "Number of clusters:" k, "Cluster sizes: from " min_n " to " max_n, ; finish info; ***************************************************; start psdinv (xinv, uinv, p, x); * xinv <- g-inverse of a symmetric,+ve, semi-definite matrix (x); * uinv <- g-inverse of root x; * p <- rank(x); u = half(x); * u [redundant cols] <- 0; d = vecdiag(u); redcol = loc(d = 0); if (ncol(redcol)>0) then do; u[,redcol]=0; end; p = ncol(u)-ncol(redcol); uinv = ginv(u); xinv = uinv * t(uinv); finish psdinv; ***************************************************; start get1c (axi, zi, rhomin, rhomax, x, beta, y, offs, cluster, j1, j2); * compute zi, axi, rhomin, rhomax for one cluster; xi = x [j1[cluster] : j2[cluster], ]; yi = y [j1[cluster] : j2[cluster], ]; %if %length(&offset)=0 %then %do; ofs = 0; * no offset; %end; %else %do; ofs = offs [j1[cluster] : j2[cluster], ]; * offset; %end; run gee2 (zi, axi, rhomin, rhomax, xi, beta, yi, ofs); * compute z, ax; finish; ***************************************************; start acc1c (u, h, g, x1s, x2s, g1, g2, adx, z, rho, irth); * accumulate contribution from one cluster; * irth : irth * irth` = inv(info matrix from last iter); * adds contribution to u, h, g, x1s, x2s, g1, g2; * adx, z, rho, irth: not modified; * note: the factor c=1/(1-rho) is ignored, the final nvar = inv(h) must be rescaled; * this is optimal code for corr=4, n by n matrices avoided; n = nrow(adx); t = sum(z); d = -rho/(1+rho*(n-1)); a = 1 + n * d; adxt = t(adx); e = adx[+,]; h = h + (adxt * adx) + t(e) * (d*e); ui = adxt * (z + d*t); free adxt; u = u + ui; g = g + ui * t(ui); l = adx * irth; g1i = ssq(l); g2i = ssq(l[+,]); g1 = g1 + g1i; g2 = g2 + g2i; x1si = ssq(z); x2si = t*t - x1si; x1s = x1s + x1si; x2s = x2s + x2si; finish; ***************************************************; start chk1 (rho, rholim, niter); c_l = 0.99; c_u = 0.99; if (rho <= rholim[1]) then do; newrho = c_l * rholim[1]; print "Iteration " niter; print "rho = " rho " is below the valid range and will be set to " newrho; rho = newrho; end; else if (rho >= rholim[2]) then do; newrho = c_u * rholim[2]; print "Iteration " niter; print "rho = " rho " is above the valid range and will be set to " newrho; rho = newrho; end; finish; ***************************************************; start adjust(nvar, rho); * scaling of nvar, because the factor 1/(1-rho) was ignored in the info matrix; nvar = (1-rho) * nvar; finish adjust; ***************************************************; %macro monitor2; %if (&monitor=YES) %then %do; print niter rho rholim g1 g2 x1s x2s crit crit2, beta u nvar; %end; %mend; ***************************************************; %macro monitor1; %if (&monitor=YES) %then %do; print p k min_n max_n sum_n n2; %end; %mend; ***************************************************; start update (rho, g1, g2, x2s, n2); rho = (x2s + (g2-g1)) / n2; finish; ***************************************************************; %macro update; %if (&fixrho=NO) %then %do; run update (rho, g1, g2, x2s, n2); %end; %mend; ***************************************************; start rhorange(rhomin, rhomax, fv); * input: fv = mean vector (column or row) output: rhomin, rhomax = bounds on exchangeable correlation imposed by the mean vector and cluster size n; ; n = ncol(fv)*nrow(fv); if (n=0) then return; if (n=1) then do; rhomin = -1; rhomax = 1; return; end; fv1 = min(fv); fvn = max(fv); if (n=2) then do; fv2 = fvn; fvn1 = fv1; end; else do; * find the second smallest; i=loc(fv=fv1); * locate the min; fv[i[1]] = fvn; * replace it by the max; fv2 = min(fv); * the second smallest; fv[i[1]] = fv1; * put the min back in place; * find the second largest; i=loc(fv=fvn); * locate the max; fv[i[1]] = fv1; * replace it by the min; fvn1= max(fv); * the second largest; fv[i[1]] = fvn; * put the max back in place; end; psi1 = sqrt(fv1/(1-fv1)); psi2 = sqrt(fv2/(1-fv2)); psin = sqrt(fvn/(1-fvn)); psin1 = sqrt(fvn1/(1-fvn1)); rhomin= -min(psi1*psi2, 1/(psin1*psin), 1/(n-1)); rhomax= psi1/psin; finish; ***************************************************; start gee4 (beta, oldrho, converge, crit, crit2, nvar, u, rho); %if (&fixbeta=NO) %then %do; dbeta = nvar * u; %end; %else %do; dbeta = j(nrow(u), 1, 0); %end; crit = sum(abs(dbeta)); crit2 = abs(oldrho-rho); converge = ((crit < &epsilon) & (crit2 < &epsilon2)); beta = beta + dbeta; oldrho = rho; finish; ***************************************************; start gee3 (rholim, niter, u, h, g, x1s, x2s, g1, g2, p); niter = niter + 1; u = j(p, 1, 0); h = j(p, p, 0); g = h; x2s = 0; x1s = 0; g1 = 0; * sum over i=1 to K of tr(Bi Qi Bi); g2 = 0; * sum over i=1 to K of tr(Ji Bi Qi Bi); rholim[1] = -1; * min; rholim[2] = 1; * max; finish; ***************************************************; start gee2 (z, ax, rhomin, rhomax, x, beta, y, offs); * compute z, ax; %if %length(&offset)=0 %then %do; fv = 1/(1+exp(-(x*beta))); * no offset; %end; %else %do; fv = 1/(1+exp(-(offs+x*beta))); * offset; %end; a = sqrt(fv#(1-fv)); z = (y - fv) / a; run rhorange(rhomin, rhomax, fv); free fv; ax = a # x; finish; ***************************************************; start gee1 (k, j1, j2, n2, n, sum_n); j2 = cusum(n); * cluster ends; j1 = j2 - n + 1; * cluster begins; k = nrow(n); sum_n = j2[k]; n2 = ssq(n) - sum_n; free n; * not needed any more; finish; ***************************************************; start gee (beta, rho, nvar, rvar, nep, converge, niter, y, offs, x, n, min_n, max_n ); * n = cluster sizes y = response x = design matrix offs = offset beta, rho: assumed to contain initial values n, y, x, offs: assumed sorted by cluster id n, y, x, offs: not modified ; p = ncol(x); run gee1 (k, j1, j2, n2, n, sum_n); * compute k, j1, j2, n2, free n; rholim = j(2,1,0); * bounds on rho; rholim[1]=-1/(max_n -1); rholim[2]=1; run chk1 (rho, rholim, 0); %monitor1; niter = 0; irth = 0; * : irth * irth` = inv(info matrix from last iter); oldrho = rho + abs(&epsilon); do until (converge | (niter=&maxiter)); * main loop; run gee3 (rholim, niter, u, h, g, x1s, x2s, g1, g2, p); do cluster = 1 to k; run get1c (axi, zi, rhomin, rhomax, x, beta, y, offs, cluster, j1, j2); rholim[1] = max(rholim[1], rhomin); rholim[2] = min(rholim[2], rhomax); run acc1c (u, h, g, x1s, x2s, g1, g2, axi, zi, rho, irth); end; g1 = g1 * (1-rho); * the missing factor; g2 = g2 * (1-rho); * the missing factor; %update; * update rho; run chk1 (rho, rholim, niter); run psdinv (nvar, irth, nep, h); run gee4 (beta, oldrho, converge, crit, crit2, nvar, u, rho); %monitor2; end; * main loop; rvar = nvar * g * nvar; run adjust(nvar, rho); if (^converge) then do; print "gee(): No convergence, niter = " niter; end; * show names memory; finish gee; ***************************************************; start initbeta (beta, p); * initial values of beta; * make sure beta has the correct dimension; * p == ncol(x); %if (%length(&beta)=0) %then %do; beta = j(p, 1, 0); %end; %else %do; beta = {&beta}; a = ncol(beta) * nrow(beta); beta = shape(beta, a, 1); if (a > p) then do; print "The supplied beta vector is too long and will be truncated"; beta = beta[1:p]; end; else if (a < p) then do; print "The supplied beta vector is too short and will be padded"; beta = beta // j(p-a, 1, 0); end; %end; finish initbeta; ***************************************************; start getdata (x, y, offs, id1, n, beta, parms, rho, p); * init. all these objects; use _N_; read all var {n} into n; close _N_; * read data; use &data; read all var {&xvar} into x; * covariates; read all var {&yvar} into y; * response; read all var {&id} into id1; * id; %if %length(&offset)=0 %then %do; offs = 0; * no offset; %end; %else %do; read all var {&offset} into offs; * offset; %end; close &data; parms = t({&xvar}); * variable names; rho = ρ p = ncol(x); run initbeta(beta, p); finish getdata; ***************************************************; start results1 (s, ttl, parms, beta, v, za); b = beta; se = sqrt(vecdiag(v)); * standard errors; redcol = loc(se = 0); if (ncol(redcol)>0) then do; b [redcol]=.; se[redcol]=.; end; z = b/se; p = 2 * (1-probnorm(abs(z))); r = exp(b); * odds ratios; lower = exp(b - za*se); upper = exp(b + za*se); variable = b || se || z || p || r || lower || upper; print s, variable [r=parms c=ttl] ,,; finish; ***************************************************; %if %length(&outest)^=0 %then %do; start dataout1 (rho, beta, nvar, rvar); * output estimates of parameters and covariance matrices; n = 1 + nrow(nvar) + nrow(rvar); a1 = {PARMS} // repeat({NCOV}, nrow(nvar),1) // repeat({RCOV}, nrow(nvar),1); a2 = {" "} // repeat(t({&xvar}) , 2, 1); a3 = repeat({&yvar}, n, 1); a = a1 || a2 || a3; _TYPE_="12345678"; _NAME_="12345678"; _DEPVAR_="12345678"; * char variables -> _ds1_; create _ds1_ var{ _TYPE_ _NAME_ _DEPVAR_}; append from a; close _ds1_; x = (t(beta) // nvar // rvar) || repeat(rho, n, 1); g = ({&xvar}) || ({"_RHO_"}); * numeric variables -> _ds2_; create _ds2_ from x [colname=g]; append from x; close _ds2_; finish dataout1; %end; ***************************************************; %if %length(&outs)^=0 %then %do; start dataout (id1, y, offs, x, beta); * output fitted values and residuals; %if %length(&offset)=0 %then %do; predict = 1/(1+exp(-(x*beta))); * no offset; %end; %else %do; predict = 1/(1+exp(-(offs+x*beta))); * offset; %end; residual = y - predict; stdev = sqrt(predict#(1-predict)); stdresid = residual / stdev; free stdev; * numeric variables; cols = {&id &yvar "Predict" "Residual" "StdResid" &xvar }; dm = id1 || y || predict || residual || stdresid || x ; create &outs from dm [colname = cols]; append from dm; close &outs ; finish dataout; %end; ***************************************************; start results (parms, beta, rvar, nvar, converge, niter, nep, rho, x, y, offs, id1); if (converge) then do; print "Converged, niter = " niter; print "Number of estimable parameters = " nep; %if (&fixrho=NO) %then %do; print "Estimated intra-cluster correlation = " rho; %end; %else %do; print "Intra-cluster correlation is fixed at " rho; %end; print "Summary of estimates and &conf confidence intervals:"; z = probit(0.5+&conf*0.5); s = "Results using the robust variance estimator:"; ttl = {'Estimate' 'Robust SE' 'z' 'p-val' 'Odds Ratio' 'CI-lower' 'CI-upper'}; run results1 (s, ttl, parms, beta, rvar, z); s = "Results using the naive variance estimator:"; ttl = {'Estimate' 'Naive SE' 'z' 'p-val' 'Odds Ratio' 'CI-lower' 'CI-upper'}; run results1 (s, ttl, parms, beta, nvar, z); %if %length(&outs)^=0 %then %do; run dataout (id1, y, offs, x, beta); %end; %if %length(&outest)^=0 %then %do; run dataout1 (rho, beta, nvar, rvar); %end; end; else if (^ converge) then do; print 'No convergence, niter ' niter; %if (&monitor ^= YES) %then %do; print "Consider running with monitor=yes" ; %end; end; finish; ***************************************************************; %macro options; %if (&monitor=YES) %then %do; reset name; %end; %else %do; reset noname; %end; %mend options; ***************************************************************; /* iml execution starts here -> */ %options; run signon; run getdata (x, y, offs, id1, n, beta, parms, rho, p); run info (k, sum_n, min_n, max_n, n, p); run gee (beta, rho, nvar, rvar, nep, converge, niter, y, offs, x, n, min_n, max_n ); run results (parms, beta, rvar, nvar, converge, niter, nep, rho, x, y, offs, id1); ***************************************************************; /* %if %length(&outs)^=0 %then %do; data &outs; set _ds4_; run; %end; */ ***************************************************************; %if %length(&outest)^=0 %then %do; data &outest; merge _ds1_ _ds2_; run; %end; ***************************************************************; %stopit: %mend gee;