/*  file: gts_alr4alpha.sas                                      */
/*  old name: ~/gts/JQW1/cluster/alrtab5c.sas                    */
/*  date: 7/8/2001                                               */
/*  name: John Preisser                                          */
/*  project: GTS (Green Tobacco Sickness)                        */
/*     "Detecting Patterns of Occupational Illness Clustering */
/*      with Alternating Logistic Regressions Applied to      */
/*      Longitudinal Data" by Preisser, Arcury and Quandt     */
/*      Amercian Journal of Epidemiology, 2003; 57:550-2.     */
/*  request: Fit 4 parameter POR model using alr with         */
/*  SAS PROC GENMOD using option ZDATA on the repeated statement */
/*  input files: ~/gts/datasets/daily2.ssd01                     */
/*  output files:  none                                          */
/*  Note:   Use ZDATA instead of writing out replicated Z- matrix */

    
libname dat "/home/projects/gts/datasets";
libname formats "/home/projects/gts/sas_formats";
filename zout "/home/projects/gts/JQW1/cluster/zrepdays.out";

options ls=80 ps=60; 
options fmtsearch=(work.format, formats.formats, fmt.format);

proc format;  
 value workty 
   1 = 'prime'
   2 = 'prime/barn'
   3 = 'top' 
   4 = 'barn'
   5 = 'other(none)'
   ;
run;
*********************************************************;
data gmd01;
    set dat.daily2;
    if (1 le cum_day le 34) then season=3;
    else if (35 le cum_day le 55) then season=2;
    else if cum_day ge 56 then season=1;
    temp92=temp-92;
    if workty in (4,5) then workty=4;       /* combine 'barn & other' */
run;
*********************************************************;

proc freq;
    title 'Number of observations per day';
    table cum_day;
run;

/* create wid */
proc sort data=gmd01;
   by siteid newpid cum_day;
run;

proc freq data=gmd01;
    title 'Number of workers per site';
    table newpid/ out=numdays;
run;
/*
proc print data=numdays;
run;
*/
data numdays2(keep=newpid pidcnt);
    set numdays;
    pidcnt=count;
run;
proc sort data=gmd01; by newpid; run;
proc sort data=numdays2; by newpid; run;


data gmd1;
    merge gmd01 numdays2;
    by newpid;
run;
* sort by pidcnt to try to minimize size of zrep;


proc sort data=gmd1;
   by siteid newpid;
********    by siteid descending pidcnt newpid;
run;
/*
proc print data=gmd1(obs=500);
    title 'data is gmd1';
    var siteid newpid pidcnt cum_day gts;
run;
*/

* instead of newpid sort by worker according to number of observations descending;

data gmd2(keep=siteid newpid period cum_day wid gts pidcnt);
   set gmd1;
********    by siteid descending pidcnt newpid;
   by siteid newpid cum_day;
   if first.siteid then WID=0;
   if first.newpid then WID+1;
run;


proc print data=gmd2(obs=500);
    title 'data is gmd2; order by worker count within sites';
    var siteid newpid pidcnt cum_day gts;
run;


data gmd3;
   set gmd2;
*********    by siteid descending pidcnt newpid;
   by siteid newpid;
   Y=(wid-1)*83+cum_day;       /* create y=1 to 747  */
run;
   

proc print data=gmd3(obs=500);
   title 'data is gmd3';
   var siteid newpid cum_day wid y;
run;
proc freq data=gmd3;
   tables period;

proc freq data=gmd3;
   tables siteid*newpid*wid*period/list;
run;


*********************************************************;
/* create zin */
* Z-matrix for a complete cluster ;
* alternatively, one could use SAS PROC IML;
*********************************************************;
/* create zrep */
* 83 days times 9 workers equals 747 max cls size;
data zrep;
 keep y1 y2 z1-z4 one /* cday1 cday2 worker1 worker2 */;
 one=1;
 do m=1 to 746;
   do n=m+1 to 747; 
    y1=m;
    y2=n;
    worker1=int(y1/83)+1;
    worker2=int(y2/83)+1;
    cday1 = mod(y1,83);
    cday2 = mod(y2,83);
    
    if cday1=0 then do; cday1=83; worker1=worker1-1; end;
    if cday2=0 then do; cday2=83; worker2=worker2-1; end;
/*
    if worker1=worker2 then do;       z1=1; z2=0; z3=0; end;
    else if cday2-cday1 le 6 then do; z1=0; z2=1; z3=0; end;
    else do;                          z1=0; z2=0; z3=1; end;  
*/

    if ((worker1=worker2) and (cday2-cday1 le 6)) then do; z1=1; z2=0; z3=0; z4=0; end;
    if ((worker1=worker2) and (cday2-cday1 > 6))  then do; z1=0; z2=1; z3=0; z4=0; end;
 if ((worker1 ne worker2) and (cday2-cday1 le 6)) then do; z1=0; z2=0; z3=1; z4=0; end;
 if ((worker1 ne worker2) and (cday2-cday1 > 6))  then do; z1=0; z2=0; z3=0; z4=1; end;

    output;
        
    end;
  end;
run;

* Merge with wid (actual data) and delete the rows not represented in actual data;

proc print data=zrep(obs=500);
   title 'data is zrep - full';
run;

proc sort data=gmd3;
    by y;
run;
data uniqy(keep=y1 y2);
    set gmd3;
    by y;
    if first.y;
    y1=y;
    y2=y;
run;


/*
proc print data=uniqy;
run;
*/
proc sort data=zrep out=zrep;
    by y2;
run;

data zrep2;
    merge uniqy(in=ina) zrep(in=inb);
    by y2;
    if ina and inb;
run;
proc sort data=zrep2;
    by y1;
run;
* this merge deletes those rows from zrep that are not represented in the data;
data zrep3;
    merge uniqy(in=ina) zrep2(in=inb);
    by y1;
    if ina and inb;
run;   

proc print data=zrep3(obs=500);
run;

* this is the 4 parameter POR model given by expression (5);

proc genmod data=gmd3 descending;            
  class siteid y;
  model gts=
          /dist=bin noscale scale=1;
  repeated subject=siteid / withinsubject=y LogOr=zrep zdata=zrep3 zrow=(z1-z4) ypair=(y1 y2) ecovb;
run;