#
# BMI data on twin pairs
#
# 
# We may click "ctrl-r" on the lines to follow.
#

library(lava)
library(mets)  # This loads the mets package contributed to R
		   # Installation into R, type; install.packages("mets") 

data(twinbmi)  # A twin dataset on BMI is contained in the package

str(twinbmi)   # Content information
head(twinbmi)  # A few rows of each variable in the dataframe "twinbmi"


# restrict to data, where response is not missing.
# this will not affect a measured response in a cotwin of a missing twin response. 
twinbmi <- subset(twinbmi, !is.na(bmi))
# subset() is an R function to choose subset of a data-frame 
#    is.na(bm)  yields a logical vector same length as bmi. 
#              Entries are TRUE if the entry in bmi is missing, else yields FALSE
#      ! is the 'negation' in R: e.g.
	!FALSE
#   	   so !is.na(bmi) asks to choose those rows where bmi is not-missing


#chooses the column zyg  and from this only the 3rd to 7th element
twinbmi$zyg[3:7] 



# zygosity by gender cross-tabulation
table(twinbmi$zyg,twinbmi$gender)

# histogram by gender
par(mfrow=c(1,2))
hist(twinbmi$bmi[twinbmi$gender=="male"], main="BMI Males",xlab="kg/m^2", col="gray60")
hist(twinbmi$bmi[twinbmi$gender=="female"], main="BMI Females",xlab="kg/m^2", col="gray60")


# BMI is usually studied on log-scale.
# boxcox(bmi ~ age*gender, data = twinbmi)
  #we define the new variable logbmi as the natural(sic!) logarithm of bmi 
twinbmi$logbmi <- log(twinbmi$bmi)



## Pairs
##
## We pivote the dataset to wide-form only for drawing figures. 
## The above original long-form is used in all analyses.

# a number, 1 or 2, is assigned to each twin in a pair.
twinbmi$nr <- with(twinbmi, ave(bmi,tvparnr, FUN = seq_along))  

#' We print the first six obervation, but only the variables nr and tvparnr
#' Notice: the entry before the ',' in [] selects rows, the entry after the ',' the columns 
twinbmi[1:6, c('tvparnr', 'nr')]



# data is in "long-form" i.e. the bmi is given in one variable/column sequentially for two twins of the same pair
# In the "wide-form" the information for these twins is given in one row side by side.
# The "wide-form" is convenient for twin-twin plots etc.
bmiwide <- reshape(twinbmi, idvar = "tvparnr", timevar = "nr",
			v.names=c("bmi", "age", "logbmi"),
			direction = "wide")

			head(bmiwide)
#write.table(bmiwide,file="bmiwide.dat") # this will create a text-file 
						     # with data in wide-form 
						     # (not necessary for further analysis).

par(mfrow=c(2,2))			   # set up a 2 by 2 figure of plots.
plot.window(xlim=c(15,40),ylim=c(15,40))
plot(logbmi.1~logbmi.2, data=bmiwide[bmiwide$zyg=="MZ" & bmiwide$gender=="male",],
     main="MZ males")
plot(logbmi.1~logbmi.2, data=bmiwide[bmiwide$zyg=="DZ" & bmiwide$gender=="male",],
     main="DZ males")
plot(logbmi.1~logbmi.2, data=bmiwide[bmiwide$zyg=="MZ" & bmiwide$gender=="female",],
     main="MZ females")
plot(logbmi.1~logbmi.2, data=bmiwide[bmiwide$zyg=="DZ" & bmiwide$gender=="female",],
     main="DZ females")

##
## Saturated model.
##

#We begin by fitting the saturated model, that is,
# less structure for mean and covariance.
lnbmi.sat <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi,
                    type="sat")
#we check the convergence  (the scores should be ideally all equal to 0)
score(lnbmi.sat)
# We take the mean of the squared scores:
mean(score(lnbmi.sat)^2)
lnbmi.sat$estimate$opt$message

#not quite zero, but almost. We refit using results from above run as
#starting values and the Newton-Raphson algorithm this time:
aa <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi,
             type="sat",control=list(method="NR",start=coef(lnbmi.sat)))
#and check if zero
mean(score(aa)^2)
aa$estimate$opt$message
#yes!
#The first attempt succeeded since same log-likelihood
#and same estimates of parameters
mean((coef(lnbmi.sat)-coef(aa))^2)
logLik(lnbmi.sat)-logLik(aa)

#above refit can be done by one call:
lnbmi.sat <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi,
            type="sat",control=list(refit=TRUE))
mean(score(lnbmi.sat)^2)

#let's see the results
lnbmi.sat

##
## Equal regressions, intercepts and residual variances for twin 1 and twin 2.
##
lnbmi.flex <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi,
			type="flex")
score(lnbmi.flex)
lnbmi.flex$estimate$opt$message

#refit
lnbmi.flex <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi,
			type="flex", control=list(start=coef(lnbmi.sat)))
mean(score(lnbmi.flex)^2)
#fine, and results
lnbmi.flex

#comparison with saturated model
compare(lnbmi.sat,lnbmi.flex)
#no significant worsening by imposing constraints.


##
## Further, equal for MZ and DZ twins as well.  
##
lnbmi.u <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg", data=twinbmi,
			type="u")
score(lnbmi.u)
# refit

lnbmi.u <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg", data=twinbmi,
                  type="u",  control=list(refit=TRUE))
score(lnbmi.u)
lnbmi.u

compare(lnbmi.u,lnbmi.flex)
#not quite. But natural assumption and we go on

##
## The ACE model 
##
lnbmi.ace <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi,
			type="ace", control=list(start=coef(lnbmi.sat)))
score(lnbmi.ace)
# looks good, estimation has converged
lnbmi.ace

lnbmi.ace <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi,
			type="ace", control=list(refit=TRUE))
score(lnbmi.ace)
# values are too large - results will not be trustworthy.
# use those above which has converged.
# This may be caused by estimation from exact starting values creating singularities.


AIC(lnbmi.u,lnbmi.ace)  # compare AIC if non-nested models.
				# lowest AIC is most parsimonious model. 

##
## ADE
##
lnbmi.ade <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi,
			type="ade")
score(lnbmi.ade)
lnbmi.ade <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi,
			type="ade", control=list(refit=TRUE))
score(lnbmi.ade)
lnbmi.ade  

#comparison of non-nested models
AIC(lnbmi.ace,lnbmi.ade)
#lower (=preferable) Akaike information index for ADE

##
## AE
##
lnbmi.ae <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi,
			type="ae")
score(lnbmi.ae)
lnbmi.ae <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi,
			type="ae",control=list(refit=TRUE))
score(lnbmi.ae)
lnbmi.ae  

compare(lnbmi.ade,lnbmi.ae)

##
## CE (no genetic effects at all)
##
lnbmi.ce <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi,
			type="ce",control=list(method="NR"))
lnbmi.ce$estimate$opt$message
score(lnbmi.ce)
#fine!
lnbmi.ce  

AIC(lnbmi.ae,lnbmi.ce)
compare(lnbmi.ace,lnbmi.ce)
#not good at all.
#We report the AE model.

# Tables for Latex:
library(xtable)
names(summary(lnbmi.ae))
xtable(summary(lnbmi.ae)$estimate)
xtable(summary(lnbmi.ae)$heritability)

#' Tables for HTML -> Word:
#' If you do not use Latex you can produce HTML tables that can be  copied to Word
w1<-xtable(summary(lnbmi.ae)$estimate)
w2<-xtable(summary(lnbmi.ae)$heritability)
print(w1,file='wdummy.html',type='html')
print(w2,file='wdummy.html',type='html',append=TRUE)
#' You can open the file 'wdummy.html' and copy the tables to a Word file
 
##
##We are done!
##


#Sweave("twinbmi.Rnw")