//==================//
// EXERCISE 287
// New MAY 2019
//==================//

// (a) Load melanoma data, restrict to the 1985-1994 calendar period 
//     and merge in the expected mortality rates.

use melanoma, clear
keep if year8594 == 1
stset surv_mm, fail(status==1 2) id(id) scale(12) exit(time 60.5)

gen _age = min(int(age + _t),99)
gen _year = int(yydx + _t)

sort _year sex _age
merge m:1 _year sex _age using popmort,  keep(match master)

/* (b) Generate a dummy for female, generate age splines with 3 df and form interactions
between female and the age spline variables */

gen female = sex==2
// generate splines
rcsgen age, gen(agercs) df(3)

// interactions
forvalues i = 1/3 {
	gen f_agercs`i' = female*agercs`i'
}

// Fit a model with including the effects of sex and age and their interaction.
// Allow the effects of age and sex to be time-dependent */

stpm2 female agercs* f_agercs*, scale(h) df(4) ///
		tvc(agercs* female) dftvc(2) bhazard(rate)

// These coefficient do not have a useful interpretation, but we can predict many things.


/* (c) Predict age standardised relative survival for males and females, using the
combined age distribution of males as females as the age standard */

// As we have interactions we have to put extra work when we manipulate exposures
// in our predictions. When we predict for females we need to make sure the 
// interaction terms are used for all predictions.

range tt 0 5 101
standsurv, at1(female 0 f_agercs1 0 f_agercs2 0 f_agercs3 0) ///
           at2(female 1 f_agercs1 = agercs1 f_agercs2 = agercs2 f_agercs3 = agercs3) ///
           timevar(tt) ci ///
		   atvar(rs_m rs_f) 

// Plot the resulting functions and their 95% confidence intervals		   
twoway (line rs_m* tt, lcolor(blue..) lpattern(solid dash dash)) ///
	   (line rs_f* tt, lcolor(red..) lpattern(solid dash dash)) ///
		, xtitle("Years from diagnosis") ///
		ytitle("Relative Survival") ///
		ylabel(0.5(0.1)1,angle(h) format(%3.1f)) ///
		legend(order(1 "Males" 4 "Females") ring(0) pos(1) cols(1)) ///
		name(rs, replace)

// Males have lower marginal relative survival. This is a fair comparison
// as each group is standardised to the same age distribution.		
		
// (d) Calculate the survival at 5 years for all subjects where everyone is 
//     forced to be male and then female. Take the mean of these estimates.
//     Show these are the same as the estimates at 5 years when using standsurv.

gen t5 = 5
predict rs_ms_m, surv at(female 0 f_agercs1 0 f_agercs2 0 f_agercs3 0) timevar(t5)
predict rs_ms_f, surv at(female 1 f_agercs1 = agercs1 f_agercs2 = agercs2 f_agercs3 = agercs3) timevar(t5)
tabstat rs_ms_m rs_ms_f		
list rs_m rs_f if tt==5, noobs

// The estmates are the same. This should illustrate what standsurv is actually
// doing. 

	
/* (e) We will now predict all cause survival. We need to incorporate the 
       population mortality file and various options to estimate 
       the expected survival. */ 		

standsurv, at1(female 0 f_agercs1 0 f_agercs2 0 f_agercs3 0, atif(female==0)) ///
           at2(female 1 f_agercs1=agercs1 f_agercs2=agercs2 f_agercs3=agercs3, atif(female==1)) ///
           timevar(tt) ci ///
           atvar(acs_m acs_f) ///
           expsurv(using(popmort.dta) ///  popmort file.
             agediag(age)             ///  age at diagnosis
             datediag(dx)             ///  date at diagnosis
             pmage(_age)              ///  age variable in popmort file
             pmyear(_year)            ///  year variable in popmort file		
             pmother(sex)             ///  other variables in popmort file
             pmrate(rate)             ///  rate varible in popmort file
             pmmaxyear(2000)          ///  maximum year in popmort file
             at1(sex 1)               ///  variables to match with main at options  
             at2(sex 2)               ///  variables to match with main at options 
             )

// As we standardized within each sex, we can compare our modelled based estimate
// with the Kaplan-Meier estimate
				   
sts gen s_km = s, by(female)				   
twoway (line acs_m* tt, lcolor(blue..) lpattern(solid dash dash)) ///
       (line acs_f* tt, lcolor(red..) lpattern(solid dash dash)) ///
       (line s_km _t if female == 0, sort lcolor(black) lpattern(shortdash) connect(stairstep)) ///
       (line s_km _t if female == 1, sort lcolor(black) lpattern(shortdash) connect(stairstep)) ///
       , xtitle("Years from diagnosis") ///
       ytitle("All cause Survival") ///
       ylabel(0.5(0.1)1,angle(h) format(%3.1f)) ///
       legend(order(1 "Males" 4 "Females") ring(0) pos(1) cols(1)) ///
       name(acs, replace)
				   
// Does our model capture the observed all-cause survival?

// Yes, very good agreement.

// Give two reasons that when comparing the curves, it is not a fair compraison 
// in terms of potential differential cancer mortality.

// (1) We are using each sex's own age distribution, which could explain differences
// (2) There are differences in other cause mortality rates (females have lower rates).
//     Thus the differences are a combination of cancer and other cause mortality.


// (f) We will now try to quantify the differences between males and females. 
// We first ask the question, "how would the relative survival change if males had the same
// age-specific excess mortality predictions as females?"

// We can restrict the standardisation to males and estimate with and without 
// forcing the excess mortality rates to be those of females. 
// We will also predict for females.

standsurv, at1(female 0 f_agercs1 0 f_agercs2 0 f_agercs3 0, atif(female==0)) ///
           at2(female 1 f_agercs1=agercs1 f_agercs2=agercs2 f_agercs3=agercs3, atif(female==0)) ///
           at3(female 1 f_agercs1=agercs1 f_agercs2=agercs2 f_agercs3=agercs3, atif(female==1)) ///
           timevar(tt) ci ///
		   atvar(rs2_m_pred rs2_m_withf rs2_f) 

twoway (line rs2_m_pred* tt, lcolor(blue..) lpattern(solid dash dash)) ///
       (line rs2_m_withf* tt, lcolor(red..) lpattern(solid dash dash)) ///
       (line rs2_f tt, lcolor(black..) lpattern(dash)) ///
       , xtitle("Years from diagnosis") ///
       ytitle("Relative Survival") ///
       ylabel(0.5(0.1)1,angle(h) format(%3.1f)) ///
       legend(order(1 "Males" 4 "Males (with female RS)" 7 "Females") ring(0) pos(1) cols(1)) ///
       name(rs2, replace)

list rs2_m_pred rs2_m_withf rs2_f if tt==5, noobs		
		
// Explain why the relative survival of males with females relative survival is not the same as 
// the relative survival of females
		
// This is because the age distribution is different and thus we are averaging over
// a different set of observations.
		
		
// (g) Now we can see what do these difference mean in terms of all-cause survival, 
// i.e. in the real world. We are essentially asking how would the all cause survival
// change for males, if they had the excess mortality rates of females.

standsurv, at1(female 0 f_agercs1 0 f_agercs2 0 f_agercs3 0, atif(female==0)) ///
           at2(female 1 f_agercs1=agercs1 f_agercs2=agercs2 f_agercs3=agercs3, atif(female==0)) ///
           timevar(tt) ci ///
           atvar(acs2_m_pred acs2_m_withf) ///
           expsurv(using(popmort.dta) ///    popmort file.
             agediag(age)             ///    age at diagnosis
             datediag(dx)             ///    date at diagnosis
             pmage(_age)              ///    age variable in popmort file
             pmyear(_year)            ///    year variable in popmort file		
             pmother(sex)             ///    other variables in popmort file
             pmrate(rate)             ///    rate varible in popmort file
             pmmaxyear(2000)          ///    maximum year in popmort file
             at1(sex 1)               ///    variables to match with main at options  
             at2(sex 1)               ///    variables to match with main at options 
             )                        ///
           contrast(difference)       ///	   
           contrastvar(acs_diff2)	   

twoway (line acs2_m_pred* tt, lcolor(blue..) lpattern(solid dash dash)) ///
       (line acs2_m_withf* tt, lcolor(red..) lpattern(solid dash dash)) ///
       , xtitle("Years from diagnosis") ///
       ytitle("All cause Survival") ///
       ylabel(0.5(0.1)1,angle(h) format(%3.1f)) ///
       legend(order(1 "Males" 4 "Males (with female RS)") ring(0) pos(1) cols(1)) ///
       name(acs2, replace)

list acs2_m_pred acs2_m_withf  if tt==5, noobs		
			
// This gives the change in all-cause survival if males had the excess mortality 
// rates of females.			
			
			
			
// (h) Plot the absolute difference in the survival predicted for males with their own
// excess mortality rates compared to if the males had the excess mortality rates
// of the females.

twoway (rarea acs_diff2_lci acs_diff2_uci tt, color(red%30)) ///
       (line acs_diff2 tt, lcolor(red..)) ///
       , xtitle("Years from diagnosis") ///
       ytitle("All cause Survival") ///
       ylabel(0.0(0.01)0.1,angle(h) format(%3.2f)) ///
       legend(off) ///
       name(acs2_diff, replace)
	   
// This is created due to the use of the contrast option.	
 
		
// (i) Rather than just look at the difference we can ask how many fewer deaths 
// would there be if the males had the excess mortality rates of the females.
// To do this we need to define a population size. One sensible option is to use 
// average number of males diagnosed each year.

// Calculate the average number of males diagnosed per year between 1990 and 1994.

count if yydx>=1990 & female==0
di `r(N)'/5

// We will round up to 245 males diagnosed per year
// We can repeat the previous standsurv command and add the per(245) option.
// This will just multiply our predictions by 245 and give us the avoidable deaths
// in a cohort of men diagnosed in a typical calendar year.
standsurv, at1(female 0 f_agercs1 0 f_agercs2 0 f_agercs3 0, atif(female==0)) ///
           at2(female 1 f_agercs1=agercs1 f_agercs2=agercs2 f_agercs3=agercs3, atif(female==0)) ///
           timevar(tt) ci ///
           atvar(acs3_m_pred acs3_m_withf) ///
           expsurv(using(popmort.dta) ///   popmort file.
             agediag(age)             ///   age at diagnosis
             datediag(dx)             ///   date at diagnosis
             pmage(_age)              ///   age variable in popmort file
             pmyear(_year)            ///   year variable in popmort file		
             pmother(sex)             ///   other variables in popmort file
             pmrate(rate)             ///   rate varible in popmort file
             pmmaxyear(2000)          ///   maximum year in popmort file
             at1(sex 1)               ///   variables to match with main at options  
             at2(sex 1)               ///   variables to match with main at options 
             )                        ///
           contrast(difference)       ///	   
           contrastvar(acs_diff3)     ///
           per(245)
 		
twoway (rarea acs_diff3_lci acs_diff3_uci tt, color(red%30)) ///
       (line acs_diff3 tt, lcolor(red..)) ///
       , xtitle("Years from diagnosis") ///
       ytitle("Avoidable Deaths") ///
       ylabel(,angle(h) format(%3.0f)) ///
       legend(off) ///
       name(acs3_diff, replace)
	   
// We estimate that there would be around 15 fewer deaths 5 years after diagnosis 
// for males for a cohort diagnosed in a typical calendar year