//==================//
// EXERCISE 131
// REVISED FEB 2023
//==================//

// Load the Melanoma data, keep those with localized stage
use melanoma, clear
keep if stage == 1
gen female = sex == 2
stset surv_mm, failure(status==1) exit(time 120.5) scale(12)

// (a) Kaplan-Meier curve
sts graph

// (b) Weibull model (using stpm3)
stpm3, scale(lncumhazard) df(1)
predict s1, surv
predict h1, hazard

sts graph, addplot(line s1 _t, sort) name(km1, replace) ylabel(0.6(0.1)1)

// (c) Plot hazard function
sts graph, hazard kernel(epan2) addplot(line h1 _t, sort) name(hazard1, replace)

// (d) using the timevar() option.

// The predictions are at the values of _t by default.
// It is useful to learn how to use the timevar() option.
// Rather than predict at the observed values of time for each subject.
// We can define our own time variable to predict at 
// This is useful in large datasets and/or predicting
// for specific covariate patterns
// We usually store predicted results in a separate frame (dataset)
// to separate the analysis data from the prediction data.

// The following specifies times between 0 and 10 in steps of 0.1 
// Results will be saved to a frame called df1
predict s1, surv timevar(0 10, step(0.1)) frame(df1, replace)

// shows predictions frame df1
frame df1: list in 1/10

// We could save the hazard to a separate frame
// As we want to predict at the same values of time
// it is better to use the same frame
// Using the frame(df1, merge) option will do this
// it will use the timevar already created in frame df1.

predict h1, hazard frame(df1, merge)

// use the user written addplot command to add data to a plot
// that is stored in a different frame

sts graph, name(km1, replace) ylabel(0.6(0.1)1) legend(on)
frame df1: addplot: line s1 tt, sort 

sts graph, hazard kernel(epan2) name(haz1, replace)  legend(on)
frame df1: addplot: line h1 tt, sort 

// (e) try stpm2 with 4 df
stpm3, scale(lncumhazard) df(4)
predict s4, surv   timevar(0 10, step(0.1)) frame(df4, replace)
predict h4, hazard frame(df4, merge)

sts graph, ylabel(0.6(0.1)1) legend(on) name(km4, replace)
frame df4: addplot: line s4 tt, sort 

sts graph, hazard kernel(epan2)  name(hazard4, replace) legend(off)
frame df4: addplot: line h4 tt, sort


// (f) Fit a Cox Model with year8594 as the only covariate
stcox i.year8594   

/* (g) Flexible parametric model */
stpm3 i.year8594, scale(lncumhazard) df(4) eform

/* (h) predicted values */
predict S7584 S8594, surv timevar(0 10, step(0.1)) ci ///
                     frame(f2, replace)               ///
                     at1(year8594 0) at2(year8594 1)

predict h7584 h8594, hazard per(1000) ci frame(f2, merge) ///
					           at1(year8594 0) at2(year8594 1)

frame f2 {
  twoway (line S7584 tt, sort)                      ///
  		   (line S8594 tt, sort),                     ///
  		   legend(order(1 "1975-1984" 2 "1985-1994")  ///
  		          ring(0) pos(1) col(1))              ///
  		   xtitle("Time since diagnosis (years)")     ///
  		   ytitle("Survival")                         ///
  		   name(surv, replace)
  		
  twoway (line h7584 tt, sort)                                   ///
  		   (line h8594 tt, sort),                                  ///
  		   legend(order(1 "1975-1984" 2 "1985-1994")               ///
  		          ring(0) pos(1) col(1))                           ///
  		   xtitle("Time since diagnosis (years)")                  ///
  		   ytitle("Cause specific mortality rate (per 1000 py's)") ///
  	     name(haz, replace)
}

/* (i) hazard on log scale */
frame f2 {
  twoway (line h7584 tt, sort)                       ///
         (line h8594 tt, sort)                       ///
         , legend(order(1 "1975-1984" 2 "1985-1994") ///
           ring(0) pos(1) col(1))                    ///
         xtitle("Time since diagnosis (years)")      ///
         ytitle("Cause specific mortality rate (per 1000 py's)") ///
         yscale(log)                                 ///
         name(haz_log, replace)
}
		
/* (j) sensitivity to knots */
forvalues i = 1/6 {
  stpm3 i.year8594, scale(lncumhazard) df(`i') eform
  estimates store df`i'
  predict h_df`i', hazard per(1000) timevar(0 10,step(0.1)) zeros frame(f1,mergecreate)
  predict s_df`i', survival at1(year8594 0) frame(f1, merge)
}	

// Compare the log hazard ratios, standard errors AIC and BIC from the different models.
estimates table df*,   keep(1.year8594) se  stat(AIC BIC)


// an alternative way to do this if you do not like mergecreate
// is to create a frame first
// the model variables must exist in the data, so create these in the new frame
// I use the merge option for each predict command, 
// as we run predict from the results frame.
frame create f4
frame f4 {
  set obs 101
  range tt 0 10 
  gen year8594 = .
}
forvalues i = 1/6 {
  stpm3 i.year8594, scale(lncumhazard) df(`i') eform
  estimates store df`i'
  frame f4 {
    predict h_df`i', hazard per(1000) timevar(tt) merge zeros
    predict s_df`i', survival timevar(tt) merge zeros
  }
}	


/* (k) compare hazard and survival */
frame f4 {
  line s_df* tt, sort                         ///
       legend(ring(0) cols(1) pos(1))         ///
       xtitle("Time since diagnosis (years)") ///
       ytitle("Survival")                     ///
       name(S_df, replace)

  line h_df* tt, sort                                          ///
	     legend(ring(0) cols(1) pos(1))                          ///
	     xtitle("Time since diagnosis (years)")                  ///
	     ytitle("Cause specific mortality rate (per 1000 py's)") ///
       name(h_df, replace)
}	
	
	
/* (l) knot locations */
// run the following code to fit 10 models with 5df (6 knots) where
// the 4 internal knots are selected at random centiles of the 
// distribution of event times.
//
// As there are many ties in this data we add a small 
// random number to the survival times
// (otherwise we risk having knots in the same location)
replace _t = _t + runiform()*0.001
gen lnt = ln(_t)
set seed 4231
global legorder
forvalues i = 1/10 {
	local clist
	forvalues j = 1/4 {
		local z`j': display %3.1f runiform()*100
		local clist  `clist' `z`j''
	}
	numlist "`clist'", sort
	local clist `r(numlist)'
	di "`clist'"
	_pctile lnt if _d, p(`clist')
	local klist `r(r1)' `r(r2)' `r(r3)' `r(r4)'
	summ lnt if _d == 1
	local klist `r(min)' `klist' `r(max)'
	stpm3 i.year8594, scale(lncumhazard) knots(`klist') initmod(exp) 
	predict sp`i', surv at1(year8594 0) timevar(0 10,step(0.1)) frame(f5,mergecreate)
	predict hp`i', hazard per(1000) at1(year8594 0) frame(f5,merge)
	estimates store mp`i'
	global legorder ${legorder} `i' `""`clist'""'
}


// compare log hazard ratios and standard errors
estimates table mp*, keep(1.year8594) se(%5.4f) b(%5.4f)

// compare baseline hazard and survival curves
frame f5 {
  twoway (line hp* tt, sort),                              ///
         legend(order(${legorder}) ring(0) pos(1) cols(1)) ///
		     name(hp,replace)	

  twoway (line sp* tt, sort),                              ///
         legend(order(${legorder}) ring(0) pos(1) cols(1)) ///
		     name(sp,replace)	
}

	
/* (m) Include effect of age group and sex */
// use stset again as we have altered _t
stset surv_mm, failure(status==1) exit(time 120.5) scale(12)
stcox i.female i.year8594 i.agegrp
estimate store cox
stpm3 i.female i.year8594 i.agegrp, df(4) scale(lncumhazard) eform
estimates store stpm3_ph

// compare to Cox model
estimates table cox stpm3_ph,  equation(1) keep(#1:)  se

/* (n) obtaining predictions */
/* predict using at option*/
estimates restore stpm3_ph
predict S0, survival zeros timevar(0 10,step(0.1)) frame(f6, replace)
frame f6: line S0 tt, sort

predict S_F_8594_age75, survival frame(f6, merge) ci ///
        at1(female 1 year8594 1 agegrp 3) 

frame f6 {		
  twoway (rarea S_F_8594_age75_lci S_F_8594_age75_uci tt, pstyle(ci)) ///
         (line S_F_8594_age75 tt)                                     ///
         , legend(off)                                                ///
         xtitle("Time since diagnosis (years)")                       ///
         ytitle("S(t)")                                               ///
         title("Female age 75+ diagnosed 1985-1994")
}

    
/*    
Some comments

(b) - The Weibull model clearly does not fit well as there is disagreement
      with the KM curve.
    - It is hard to work out the shape of the hazard function just by 
      looking at the survival function.

(c) - The hazard clearly has a turning point. 
    - The hazard function for a Weibull model is montonic, 
      i.e. increasing, decreasing or constant.
    
(d) - We advise using timevar and saving to a new frame unless 
      you have a good reason not to.

(e) - Hard to tell the difference between KM and model fitted values.

(g) - Hazard ratios are very similar, as are standard errors and
      confidence intervals.

(i) - The difference is constant as a multiplicative effect is additive on the
      log scale.
      
(j) - The hazard ratios and their standard errors are very similar. 
    - The lowest AIC is for 5df and the lowest BIC for 2 df.

(k) - The estrimated curves are very similar excet for 1 df.
    - There is less variation on the survival scale.    
    
(l) - The fitted curves show very little difference,
      even when knots are close togther.
      
(n) - Estimates are similar as we are trying to estimate the same thing.
    - The Cox model does not estimate the baseline hazard, 
      so it can take any shape.       
    - The splines modles are flexible and an approximate many comples shapes.
    
(o) - (i)  A male, in the youngest age group in the earlist time period.

*/