# fruitfly code data <- read.table("http://www.amstat.org/publications/jse/datasets/fruitfly.dat") names(data) <- c("id","partners","type","longevity","thorax","sleep") attach(data) # make scatterplot matrix pairs(data) # define vectors for the colors (col) and plotting characters (pch) col.partn <- 1*(partners==0)+2*(partners==1)+3*(partners==8) # 1=black, 2=red, 3=green pch.type <- 1*(type==0) + 2*(type==1)+3*(type==9) # 1='circle', 2='triangle', 3='plus' # type=0: pregnant, type=1: virgin, type=9: NA col.partn pch.type # make scatterplot with different colors and plotting symbols par(mfrow=c(1,1)) plot(thorax, longevity, pch=pch.type, col=col.partn, ylim=c(16,97),xlim=c(.64,.94)) legend(.635,97,c("1 pregnant","8 pregnant","1 virgin","8 virgin","0 partners"), pch=c(1,2,1,2,3),col=c(2,2,3,3,1)) # make three different scatterplots, for 0, 1, and 8 females par(mfrow=c(2,2)) plot(thorax[partners==0],longevity[partners==0],pch=3,col=1, main="0 partners",ylim=c(16,97),xlim=c(.64,.94), xlab="thorax length", ylab="longevity") plot(thorax[partners==1],longevity[partners==1],pch=pch.type[partners==1], col=2,main="1 partners",ylim=c(16,97),xlim=c(.64,.94), xlab="thorax length", ylab="longevity") legend("topleft",c("pregnant","virgin"),pch=c(1,2),col=2) plot(thorax[partners==8],longevity[partners==8],pch=pch.type[partners==8], col=3,main="8 partners",ylim=c(16,97),xlim=c(.64,.94), xlab="thorax length", ylab="longevity") legend("topleft",c("pregnant","virgin"),pch=c(1,2),col=3) # create dummy variables for the different categories: dummy.1.p <- (partners==1)*(type==0)*1 dummy.1.v <- (partners==1)*(type==1)*1 dummy.8.p <- (partners==8)*(type==0)*1 dummy.8.v <- (partners==8)*(type==1)*1 dummy.0 <- (partners==0) # make boxplot of longevity and thorax for different groups par(mfrow=c(1,2)) boxplot(longevity[dummy.1.p==1],longevity[dummy.1.v==1],longevity[dummy.8.p==1], longevity[dummy.8.v==1],longevity[dummy.0==1], main="Longevity",names=c("1p","8p","1v","8v","0"),col="grey") means <- c(mean(longevity[dummy.1.p==1]),mean(longevity[dummy.1.v==1]), mean(longevity[dummy.8.p==1]),mean(longevity[dummy.8.v==1]), mean(longevity[dummy.0==1])) points(c(1:5),means) boxplot(thorax[dummy.1.p==1],thorax[dummy.1.v==1],thorax[dummy.8.p==1], thorax[dummy.8.v==1],thorax[dummy.0==1], main="Thorax length",names=c("1p","8p","1v","8v","0"),col="grey") means <- c(mean(thorax[dummy.1.p==1]),mean(thorax[dummy.1.v==1]), mean(thorax[dummy.8.p==1]),mean(thorax[dummy.8.v==1]), mean(thorax[dummy.0==1])) points(c(1:5),means) # Is thorax a confounding variable? # Thorax seems to have an effect on longevity: m1 <- lm(longevity ~ thorax) summary(m1) # But thorax is quite similar for the various groups. # This is visible in the boxplot. # We can also test it formally: m2 <- lm(thorax ~ dummy.1.p + dummy.1.v + dummy.8.p + dummy.8.v) m3 <- lm(thorax ~ 1) anova(m3,m2) # So thorax is not much correlated with group. # That was to be expected, since this was a randomized experiment. # So can we leave thorax out of the model? # Consider the following tests: # Test effect of type of female between the groups with 1 partner # First, we do not include thorax m4 <- lm(longevity[partners==1] ~ factor(type[partners==1])) summary(m4) # as a side note: note that we simply fit the means for both groups: mean(longevity[dummy.1.p==1]) mean(longevity[dummy.1.v==1]) mean(longevity[dummy.1.v==1])-mean(longevity[dummy.1.p==1]) # And now we do the same but with including thorax: m5 <- lm(longevity[partners==1] ~ thorax[partners==1] + factor(type[partners==1])) summary(m5) # Why is the result much more significant in the second model? # Let's test for interaction between number of females and type of females # We first try: partners.f <- as.factor(partners) type.f <- as.factor(type) m6 <- lm(longevity ~ thorax + partners.f + type.f + partners.f*type.f) summary(m6) # What went wrong? # We would still like to know if there is a significant interaction # between 'type of female' and 'number of females'? That is, we want to # whether the effect of 'type of female' depends on the value of 'number # of females'. How could you test this? # we start by fitting a model with a common slope (effect of thorax length) and # different intercepts for the 5 groups, taking the group with # zero females as the baseline: # y = alpha + beta*thorax + gamma.1.p*dummy.1.p + # gamma.1.v*dummy.1.v + gamma.8.p*dummy.8.p + # gamma.8.v*dummy.8.v + epsilon m6 <- lm(longevity ~ thorax + dummy.1.p + dummy.1.v + dummy.8.p + dummy.8.v) summary(m6) # vector with estimated coefficients: m6$coef # Interpretation of coefficients... # Is there a significant difference between the three control groups? control <- (dummy.1.p==1) | (dummy.8.p==1) | (dummy.0==1) m7 <- lm(longevity[control] ~ thorax[control] + dummy.1.p[control] + dummy.8.p[control]) m8 <- lm(longevity[control] ~ thorax[control]) anova(m8,m7) # show fitted regression lines: # 0 females: par(mfrow=c(2,2)) plot(thorax[partners==0],longevity[partners==0],pch=3,col=1, main="0 partners",ylim=c(16,97),xlim=c(.64,.94), xlab="thorax length", ylab="longevity") abline(m6$coef[1], m6$coef[2]) # 1 female: plot(thorax[partners==1],longevity[partners==1],pch=pch.type[partners==1], col=2,main="1 partners",ylim=c(16,97),xlim=c(.64,.94), xlab="thorax length", ylab="longevity") legend(0.635,97,c("pregnant","virgin"),pch=c(1,2),col=2) abline(m6$coef[1]+m6$coef[3], m6$coef[2], col="red") # 1 pregnant female abline(m6$coef[1]+m6$coef[4], m6$coef[2], col="red") # 1 virgin female # 8 females: plot(thorax[partners==8],longevity[partners==8],pch=pch.type[partners==8], col=3,main="8 partners",ylim=c(16,97),xlim=c(.64,.94), xlab="thorax length", ylab="longevity") legend(0.635,97,c("pregnant","virgin"),pch=c(1,2),col=3) abline(m6$coef[1]+m6$coef[5], m6$coef[2], col="green") # 8 pregnant females abline(m6$coef[1]+m6$coef[6], m6$coef[2], col="green") # 8 virgin females # Note that all lines are parallel: # Does this model allow for interaction between thorax and type of female? # Does this model allow for interaction between thorax and number of females? # Does this model allow for interaction between type of female and # number of females? # If there is no interaction between type of female and number of females, # then the effect of type of female does not depend on the number of females: # This implies: gamma.1.p - gamma.1.v = gamma.8.p - gamma.8.v # or equivalently: gamma.1.p = gamma.1.v + gamma.8.p - gamma.8.v # Fitting this reduced linear model (see derivation on the board): m9 <- lm(longevity ~ thorax + I(dummy.1.v + dummy.1.p) + I(dummy.8.p + dummy.1.p) + I(dummy.8.v - dummy.1.p)) anova(m9,m6)