HW # 8 Solution

9 pts total

Problem 1

A - 1pt

howell <- read.table("https://stats.are-awesome.com/datasets/Howell_craniometry.txt", header=TRUE, sep=",")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

B - 1pt

howell <- howell %>% 
  dplyr::select(ID, Sex, Population, BNL, MDH, EKB, ZOR, BAA, NBA) %>%
  dplyr::filter(Population %in% c("BUSHMAN", "PERU", "NORSE", "ZULU"))
head(howell)
##   ID Sex Population BNL MDH EKB ZOR BAA NBA
## 1  1   M      NORSE 100  31 100  81  39  76
## 2  2   M      NORSE 102  19  96  84  35  79
## 3  3   M      NORSE 102  28  97  82  38  72
## 4  4   M      NORSE 100  25  99  79  46  75
## 5  5   M      NORSE  97  26  97  79  42  80
## 6  6   M      NORSE 106  29  98  88  39  77

C - 1pt

cov(howell[,4:9])
##           BNL       MDH        EKB       ZOR        BAA       NBA
## BNL 31.990992  8.122307 15.9348763 21.629802 -1.5687912 -2.261421
## MDH  8.122307 16.495175  5.4709631  3.281787  4.9648033  1.509080
## EKB 15.934876  5.470963 20.0245801 13.425144 -0.6598362 -2.085858
## ZOR 21.629802  3.281787 13.4251439 24.084470 -4.6930390 -6.212397
## BAA -1.568791  4.964803 -0.6598362 -4.693039 11.3239096  3.836069
## NBA -2.261421  1.509080 -2.0858584 -6.212397  3.8360691 10.873610
#gives the same results
var(howell[,4:9])
##           BNL       MDH        EKB       ZOR        BAA       NBA
## BNL 31.990992  8.122307 15.9348763 21.629802 -1.5687912 -2.261421
## MDH  8.122307 16.495175  5.4709631  3.281787  4.9648033  1.509080
## EKB 15.934876  5.470963 20.0245801 13.425144 -0.6598362 -2.085858
## ZOR 21.629802  3.281787 13.4251439 24.084470 -4.6930390 -6.212397
## BAA -1.568791  4.964803 -0.6598362 -4.693039 11.3239096  3.836069
## NBA -2.261421  1.509080 -2.0858584 -6.212397  3.8360691 10.873610

Problem 2

A - 1pt

howellPCA <- prcomp(howell[,4:9], scale=TRUE)
## equivalently, use a formula to specify the variables explicitly
howellPCA <- prcomp(formula = ~ BNL + MDH + EKB + ZOR + BAA + NBA, data=howell, scale=TRUE)
howellPCA
## Standard deviations (1, .., p=6):
## [1] 1.6111576 1.2718569 0.8470599 0.7070900 0.6375459 0.4032368
## 
## Rotation (n x k) = (6 x 6):
##            PC1        PC2        PC3         PC4         PC5         PC6
## BNL -0.5491712  0.1353923 -0.2174536  0.03038470  0.45434939 -0.65225549
## MDH -0.2252196  0.5697790  0.4075710  0.65241682 -0.15456185  0.09474583
## EKB -0.5008242  0.1398848 -0.1659344 -0.36470110 -0.75351280 -0.03584305
## ZOR -0.5657892 -0.1237440 -0.0650274 -0.06528556  0.36459373  0.72329154
## BAA  0.1429996  0.6298377  0.3130779 -0.64280335  0.26129312  0.05803093
## NBA  0.2372676  0.4748104 -0.8104439  0.15153822  0.02632561  0.19437863

B - 1pt

howellPCA$rotation
##            PC1        PC2        PC3         PC4         PC5         PC6
## BNL -0.5491712  0.1353923 -0.2174536  0.03038470  0.45434939 -0.65225549
## MDH -0.2252196  0.5697790  0.4075710  0.65241682 -0.15456185  0.09474583
## EKB -0.5008242  0.1398848 -0.1659344 -0.36470110 -0.75351280 -0.03584305
## ZOR -0.5657892 -0.1237440 -0.0650274 -0.06528556  0.36459373  0.72329154
## BAA  0.1429996  0.6298377  0.3130779 -0.64280335  0.26129312  0.05803093
## NBA  0.2372676  0.4748104 -0.8104439  0.15153822  0.02632561  0.19437863

ZOR loads strongest on PC1 with a loading of -0.5657892. Note: it is the absolute value that matters here!

C - 1pt

summary(howellPCA)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5    PC6
## Standard deviation     1.6112 1.2719 0.8471 0.70709 0.63755 0.4032
## Proportion of Variance 0.4326 0.2696 0.1196 0.08333 0.06774 0.0271
## Cumulative Proportion  0.4326 0.7022 0.8218 0.90516 0.97290 1.0000

The cumulative proportion of variance up to and including PC2 is 0.7022

D - 3 pts

forPlot <- data.frame(howellPCA$x, howell$Population)
library(ggplot2)
xAxisLabel <- sprintf("PC1 (%.2f%%)", summary(howellPCA)$importance[2,"PC1"]*100)
yAxisLabel <- sprintf("PC2 (%.2f%%)", summary(howellPCA)$importance[2,"PC2"]*100)
pcPlot <- ggplot(forPlot, aes(x=PC1, y=PC2)) + 
  geom_point(size=3,aes(color=howell.Population)) + 
  labs(title="PCA of Howell Cranial Measurments", 
       color="Population", 
       x=xAxisLabel, 
       y=yAxisLabel) + 
  theme_bw(20) + 
  theme(legend.position="top")
pcPlot

you get a quadruple gold star if you managed to plot the variable loadings on the scatterplot

forLoadings <- data.frame(howellPCA$rotation, x=rep(0, 6), y=rep(0,6), var=rownames(howellPCA$rotation))

scalar <- 4

pcPlot + 
  geom_segment(data=forLoadings, 
               aes(x=x, y=y, xend=PC1*scalar, yend=PC2*scalar), 
               arrow = arrow()) + 
  geom_text(data=forLoadings, 
            aes(x=PC1*scalar*1.15, y=PC2*scalar*1.15, label=var))