# # BMI data on twin pairs # # # We may click "ctrl-r" on the lines to follow. # library(lava) library(mets) # This loads the mets package contributed to R # Installation into R, type; install.packages("mets") data(twinbmi) # A twin dataset on BMI is contained in the package str(twinbmi) # Content information head(twinbmi) # A few rows of each variable in the dataframe "twinbmi" # restrict to data, where response is not missing. # this will not affect a measured response in a cotwin of a missing twin response. twinbmi <- subset(twinbmi, !is.na(bmi)) # subset() is an R function to choose subset of a data-frame # is.na(bm) yields a logical vector same length as bmi. # Entries are TRUE if the entry in bmi is missing, else yields FALSE # ! is the 'negation' in R: e.g. !FALSE # so !is.na(bmi) asks to choose those rows where bmi is not-missing #chooses the column zyg and from this only the 3rd to 7th element twinbmi$zyg[3:7] # zygosity by gender cross-tabulation table(twinbmi$zyg,twinbmi$gender) # histogram by gender par(mfrow=c(1,2)) hist(twinbmi$bmi[twinbmi$gender=="male"], main="BMI Males",xlab="kg/m^2", col="gray60") hist(twinbmi$bmi[twinbmi$gender=="female"], main="BMI Females",xlab="kg/m^2", col="gray60") # BMI is usually studied on log-scale. # boxcox(bmi ~ age*gender, data = twinbmi) #we define the new variable logbmi as the natural(sic!) logarithm of bmi twinbmi$logbmi <- log(twinbmi$bmi) ## Pairs ## ## We pivote the dataset to wide-form only for drawing figures. ## The above original long-form is used in all analyses. # a number, 1 or 2, is assigned to each twin in a pair. twinbmi$nr <- with(twinbmi, ave(bmi,tvparnr, FUN = seq_along)) #' We print the first six obervation, but only the variables nr and tvparnr #' Notice: the entry before the ',' in [] selects rows, the entry after the ',' the columns twinbmi[1:6, c('tvparnr', 'nr')] # data is in "long-form" i.e. the bmi is given in one variable/column sequentially for two twins of the same pair # In the "wide-form" the information for these twins is given in one row side by side. # The "wide-form" is convenient for twin-twin plots etc. bmiwide <- reshape(twinbmi, idvar = "tvparnr", timevar = "nr", v.names=c("bmi", "age", "logbmi"), direction = "wide") head(bmiwide) #write.table(bmiwide,file="bmiwide.dat") # this will create a text-file # with data in wide-form # (not necessary for further analysis). par(mfrow=c(2,2)) # set up a 2 by 2 figure of plots. plot.window(xlim=c(15,40),ylim=c(15,40)) plot(logbmi.1~logbmi.2, data=bmiwide[bmiwide$zyg=="MZ" & bmiwide$gender=="male",], main="MZ males") plot(logbmi.1~logbmi.2, data=bmiwide[bmiwide$zyg=="DZ" & bmiwide$gender=="male",], main="DZ males") plot(logbmi.1~logbmi.2, data=bmiwide[bmiwide$zyg=="MZ" & bmiwide$gender=="female",], main="MZ females") plot(logbmi.1~logbmi.2, data=bmiwide[bmiwide$zyg=="DZ" & bmiwide$gender=="female",], main="DZ females") ## ## Saturated model. ## #We begin by fitting the saturated model, that is, # less structure for mean and covariance. lnbmi.sat <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi, type="sat") #we check the convergence (the scores should be ideally all equal to 0) score(lnbmi.sat) # We take the mean of the squared scores: mean(score(lnbmi.sat)^2) lnbmi.sat$estimate$opt$message #not quite zero, but almost. We refit using results from above run as #starting values and the Newton-Raphson algorithm this time: aa <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi, type="sat",control=list(method="NR",start=coef(lnbmi.sat))) #and check if zero mean(score(aa)^2) aa$estimate$opt$message #yes! #The first attempt succeeded since same log-likelihood #and same estimates of parameters mean((coef(lnbmi.sat)-coef(aa))^2) logLik(lnbmi.sat)-logLik(aa) #above refit can be done by one call: lnbmi.sat <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi, type="sat",control=list(refit=TRUE)) mean(score(lnbmi.sat)^2) #let's see the results lnbmi.sat ## ## Equal regressions, intercepts and residual variances for twin 1 and twin 2. ## lnbmi.flex <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi, type="flex") score(lnbmi.flex) lnbmi.flex$estimate$opt$message #refit lnbmi.flex <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi, type="flex", control=list(start=coef(lnbmi.sat))) mean(score(lnbmi.flex)^2) #fine, and results lnbmi.flex #comparison with saturated model compare(lnbmi.sat,lnbmi.flex) #no significant worsening by imposing constraints. ## ## Further, equal for MZ and DZ twins as well. ## lnbmi.u <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg", data=twinbmi, type="u") score(lnbmi.u) # refit lnbmi.u <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg", data=twinbmi, type="u", control=list(refit=TRUE)) score(lnbmi.u) lnbmi.u compare(lnbmi.u,lnbmi.flex) #not quite. But natural assumption and we go on ## ## The ACE model ## lnbmi.ace <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi, type="ace", control=list(start=coef(lnbmi.sat))) score(lnbmi.ace) # looks good, estimation has converged lnbmi.ace lnbmi.ace <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi, type="ace", control=list(refit=TRUE)) score(lnbmi.ace) # values are too large - results will not be trustworthy. # use those above which has converged. # This may be caused by estimation from exact starting values creating singularities. AIC(lnbmi.u,lnbmi.ace) # compare AIC if non-nested models. # lowest AIC is most parsimonious model. ## ## ADE ## lnbmi.ade <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi, type="ade") score(lnbmi.ade) lnbmi.ade <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi, type="ade", control=list(refit=TRUE)) score(lnbmi.ade) lnbmi.ade #comparison of non-nested models AIC(lnbmi.ace,lnbmi.ade) #lower (=preferable) Akaike information index for ADE ## ## AE ## lnbmi.ae <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi, type="ae") score(lnbmi.ae) lnbmi.ae <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi, type="ae",control=list(refit=TRUE)) score(lnbmi.ae) lnbmi.ae compare(lnbmi.ade,lnbmi.ae) ## ## CE (no genetic effects at all) ## lnbmi.ce <- twinlm(logbmi~age*gender, id="tvparnr", DZ="DZ", zyg="zyg",data=twinbmi, type="ce",control=list(method="NR")) lnbmi.ce$estimate$opt$message score(lnbmi.ce) #fine! lnbmi.ce AIC(lnbmi.ae,lnbmi.ce) compare(lnbmi.ace,lnbmi.ce) #not good at all. #We report the AE model. # Tables for Latex: library(xtable) names(summary(lnbmi.ae)) xtable(summary(lnbmi.ae)$estimate) xtable(summary(lnbmi.ae)$heritability) #' Tables for HTML -> Word: #' If you do not use Latex you can produce HTML tables that can be copied to Word w1<-xtable(summary(lnbmi.ae)$estimate) w2<-xtable(summary(lnbmi.ae)$heritability) print(w1,file='wdummy.html',type='html') print(w2,file='wdummy.html',type='html',append=TRUE) #' You can open the file 'wdummy.html' and copy the tables to a Word file ## ##We are done! ## #Sweave("twinbmi.Rnw")