Illustrative example of LDA

The dataset contains the weight, height and gender of 10,000 individuals and was taken from https://www.kaggle.com/mustafaali96/weight-height

library(MASS)    # provides lda() and qda()
library(dplyr)   # data manipulation; masks MASS::select, stats::filter and stats::lag
library(ggplot2) # plotting
life_data <- read.csv("weight-height.csv")

# height in inches, weight in pounds, transform to cm and kg
life_data <- life_data %>%
  mutate(Height = Height * 2.54,
         Weight = Weight * 0.454,
         Gender = as.factor(Gender))

glimpse(life_data)
## Rows: 10,000
## Columns: 3
## $ Gender <fct> Male, Male, Male, Male, Male, Male, Male, Male, Male, Male, Mal…
## $ Height <dbl> 187.5714, 174.7060, 188.2397, 182.1967, 177.4998, 170.8227, 174…
## $ Weight <dbl> 109.81968, 73.68895, 96.58435, 99.89928, 93.68281, 69.10432, 83…

Analyse height only

First, consider only the height

# visualize the data
life_data %>% ggplot(aes(x=Gender, y=Height, fill=Gender)) + 
  geom_boxplot()

life_data %>% ggplot(aes(x=Height, fill=Gender)) +
  geom_density(alpha=0.6)

Perform the classification task with Logistic regression

# run Logistic regression - start with a simple Height model
lr <- glm(Gender ~ Height, life_data, family=binomial)
# Look at the model - does the coefficient for Height have any (meaningful) interpretation?
# Maybe revise what quantity Y = B0+B1*x1+... represents in logistic regression
lr
## 
## Call:  glm(formula = Gender ~ Height, family = binomial, data = life_data)
## 
## Coefficients:
## (Intercept)       Height  
##    -45.3001       0.2689  
## 
## Degrees of Freedom: 9999 Total (i.e. Null);  9998 Residual
## Null Deviance:       13860 
## Residual Deviance: 7542  AIC: 7546
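# One reading (sketch): the Height coefficient is the change in the log-odds of
# being male per additional cm, so each extra cm multiplies the odds of being
# male by exp(coefficient):
exp(coef(lr)["Height"])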
# Obtain the predicted probability of being male for the training data
lr_prob <- predict(lr, life_data, type="response")
gender_pred <- lr_prob > 0.5

# Compare the 2 calls below. Can you guess how they differ and how to get from one to the other?
# HINT: Recall, how to get from Y = B0+B1x1+... to the probability
predict(lr, life_data, type="response") %>% head
##         1         2         3         4         5         6 
## 0.9941160 0.8416663 0.9950788 0.9755074 0.9184738 0.6517203
predict(lr, life_data)                  %>% head
##         1         2         3         4         5         6 
## 5.1296161 1.6706788 5.3092777 3.6845852 2.4217888 0.6266095
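# Sanity check (sketch): the link-scale predictions just above map onto the
# type = "response" probabilities through the logistic (sigmoid) function
all.equal(plogis(predict(lr, life_data)),
          predict(lr, life_data, type = "response"))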
my_accuracy <- function(preds, reference, print_confmat=FALSE){
  confusion_mat <- table(preds, reference)
  acc <- (confusion_mat %>% diag %>% sum) / sum(confusion_mat)
  
  if(print_confmat){
    cat("\nConfusion matrix:\n")
    print(confusion_mat)
  }
  cat("\n", sprintf("Accuracy: %s", acc), "\n")
  
}

# Evaluate accuracy of Logistic regression
my_accuracy(gender_pred, life_data$Gender, print_confmat = TRUE)
## 
## Confusion matrix:
##        reference
## preds   Female Male
##   FALSE   4171  845
##   TRUE     829 4155
## 
##  Accuracy: 0.8326
# Prepare data to plot the sigmoid
sigmoid_data <- life_data %>%
  mutate(lr_prob = lr_prob,
         gender_num = as.numeric(Gender) - 1)

# Using the sigmoid data, estimate the decision boundary of the logistic regression
# (a better, analytic way to find the decision boundary is sketched below)
lr_decisionBoundary <- sigmoid_data %>% filter(lr_prob > 0.5) %>% pull(Height) %>% min()
sprintf("LR decision boundary: %s", lr_decisionBoundary)
## [1] "LR decision boundary: 168.497650366066"
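# For comparison, a sketch of the analytic approach: the log-odds B0 + B1*Height
# are zero exactly on the boundary, so it lies at Height = -B0/B1
-coef(lr)[1] / coef(lr)[2]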
# Plot the sigmoid
sigmoid_data %>%   
  ggplot(aes(x=Height, y=lr_prob)) +
  geom_point(col="red") +
  geom_point(aes(x=Height, y=gender_num)) +
  # Logistic regression decision boundary
  geom_vline(xintercept = lr_decisionBoundary, col="blue") +
  geom_hline(yintercept = 0.5, linetype="dotted") +
  ylab("P(Gender=Male|Height)")

Fit LR with both Height and Weight

# First, observe how the joint distribution of Height and Weight looks for each gender
life_data %>%
  ggplot(aes(x=Height, y=Weight, col=Gender)) +
  geom_point(alpha=0.1)

lr_hw <- glm(Gender ~ ., life_data, family=binomial)
lr_hw
## 
## Call:  glm(formula = Gender ~ ., family = binomial, data = life_data)
## 
## Coefficients:
## (Intercept)       Height       Weight  
##      0.6925      -0.1939       0.4369  
## 
## Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
## Null Deviance:       13860 
## Residual Deviance: 4183  AIC: 4189
# Predict the gender and assess the accuracy
gender_pred <- predict(lr_hw, life_data, type = "response") > 0.5
my_accuracy(gender_pred, life_data$Gender)
## 
##  Accuracy: 0.9194
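# NOTE: decisionplot() is not defined anywhere in this document. A minimal
# sketch of such a grid-prediction helper follows (an assumption about what the
# original helper does, not its actual code):
decisionplot <- function(model, data, class, main = "", resolution = 100) {
  cl <- data[[class]]
  preds <- setdiff(names(data), class)
  x <- data[, preds[1:2]]
  # grid spanning the two predictors
  xs <- seq(min(x[, 1]), max(x[, 1]), length.out = resolution)
  ys <- seq(min(x[, 2]), max(x[, 2]), length.out = resolution)
  grid <- expand.grid(xs, ys)
  names(grid) <- names(x)
  # predicted class on the grid; lda/qda return a list with $class,
  # the "lr" predict method defined below returns a logical vector
  p <- predict(model, grid)
  if (is.list(p)) p <- p$class
  z <- matrix(as.integer(as.factor(p)), nrow = resolution)
  # observations coloured by true class, decision boundary drawn as a contour
  plot(x, col = as.integer(cl) + 1L, pch = as.integer(cl), main = main)
  contour(xs, ys, z, levels = 1.5, add = TRUE, drawlabels = FALSE, lwd = 2)
}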
# Plotting decision boundary
class(lr_hw) <- c("lr", class(lr_hw)) # a bit of adjusting for plotting the decision boundary
# the decision boundary is linear!
predict.lr <- function(object, newdata, ...)
  predict.glm(object, newdata, type = "response") > .5
decisionplot(lr_hw, life_data, class = "Gender", main = "Logistic regression", resolution=200) 

# Question: Why is the logistic regression decision boundary linear? Can you derive the equation?
# HINT: Refer again to the equation of LR. What probability do the points lying on the decision boundary have?
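
One way to see it: points on the decision boundary satisfy \(Pr(Male|Height,Weight) = 0.5\), i.e. the log-odds are zero, \(\beta_0 + \beta_1 Height + \beta_2 Weight = 0\). Solving for Weight gives \(Weight = -(\beta_0 + \beta_1 Height)/\beta_2\), a straight line in the (Height, Weight) plane.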

LDA

# run LDA
lda_res <- lda(Gender ~ Height, life_data)
lda_res
## Call:
## lda(Gender ~ Height, data = life_data)
## 
## Prior probabilities of groups:
## Female   Male 
##    0.5    0.5 
## 
## Group means:
##          Height
## Female 161.8203
## Male   175.3269
## 
## Coefficients of linear discriminants:
##              LD1
## Height 0.1415641
# Take a look at the predicted posterior probabilities for each gender
gender_pred <- predict(lda_res, life_data)
# View(gender_pred$posterior)

# Check the confusion matrix and the accuracy of predictions
my_accuracy(gender_pred$class, 
            life_data$Gender,
            print_confmat = TRUE)
## 
## Confusion matrix:
##         reference
## preds    Female Male
##   Female   4189  861
##   Male      811 4139
## 
##  Accuracy: 0.8328
# Can you guess the threshold Height value that (sensibly) discriminates between the genders?
# You can use this DF
gender_pred_df <- data.frame(
  Height = life_data$Height,
  Gender_pred = gender_pred$class
)

# Un-comment to view the predictions
# View(gender_pred_df)

lda_decisionBoundary <- gender_pred_df %>% 
  filter(Gender_pred == "Female") %>% 
  dplyr::select(Height) %>% 
  max

# compared to lr_decisionBoundary, there is only a tiny difference
lda_decisionBoundary
## [1] 168.5736
lr_decisionBoundary
## [1] 168.4977

Understand LDA by arriving at the same results yourself

The core of LDA is the Bayes theorem: \(Pr(Male|Height) = \frac{Pr(Height|Male)}{Pr(Height|Male) + Pr(Height|Female)}\). Note that this is a simplified form of the theorem, valid because the prior probabilities of the genders are equal, \(P(Male)=P(Female)\). LDA further assumes the distribution of heights within each gender to be normal, \(Pr(Height|Male) \sim \mathcal{N}(\mu,\sigma^2)\). All you need to do is estimate the parameters of the normal distributions and plug them into the Bayes formula.
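
The chunk that produced the output below is hidden in the knitted document; a minimal sketch of what it presumably computes follows (how exactly the shared standard deviation is estimated is an assumption, so the numbers may differ in the last digits).

mu_f   <- mean(life_data$Height[life_data$Gender == "Female"])
mu_m   <- mean(life_data$Height[life_data$Gender == "Male"])
# shared (pooled) standard deviation - LDA assumes one sigma for both classes
sd_all <- sqrt(mean(c(var(life_data$Height[life_data$Gender == "Female"]),
                      var(life_data$Height[life_data$Gender == "Male"]))))

# Posterior P(Male | Height) from the Bayes formula with equal priors
postM <- dnorm(life_data$Height, mu_m, sd_all) /
  (dnorm(life_data$Height, mu_m, sd_all) + dnorm(life_data$Height, mu_f, sd_all))
head(postM)

# Both posteriors side by side
head(data.frame(Female = 1 - postM, Male = postM))

# Accuracy of this hand-made classifier
my_accuracy(postM > 0.5, life_data$Gender)

# Decision boundary: with equal priors and a shared sigma it lies halfway
# between the class means, matching the boundary found via predict() earlier
(mu_f + mu_m) / 2
lda_decisionBoundary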

## [1] 0.9942165 0.8404279 0.9951696 0.9756555 0.9182121 0.6477859
##        Female      Male
## 1 0.005810296 0.9941897
## 2 0.159773209 0.8402268
## 3 0.004853568 0.9951464
## 4 0.024423757 0.9755762
## 5 0.081951918 0.9180481
## 6 0.352339590 0.6476604
## 
##  Accuracy: 0.8328
## [1] 168.5736
## [1] 168.5736

Plot the LDA decision boundary

# Create normal approximations of our data (i.e. the "ideal data") that LDA internally works with
height_seq <- seq(min(life_data$Height), max(life_data$Height), length.out=nrow(life_data)/2)

ideal_life_data <- data.frame(
  height=height_seq, 
  # Densities of heights under the normality assumption
  Female=dnorm(height_seq, mean=mu_f, sd=sd_all), 
  Male = dnorm(height_seq, mean=mu_m, sd=sd_all)
) 

# Reshape for plotting purposes
ideal_life_data <- ideal_life_data %>%
  reshape2::melt(id.vars="height", variable.name = "Gender")

# Plot the distributions, their normal approximations and the decision boundary (or point in this 1D case)
life_data %>% ggplot(aes(x=Height, fill=Gender)) +
  geom_density(alpha=0.6) +
  geom_line(data = ideal_life_data, aes(x=height, y=value), linetype = 2)  +
  # Add the decision boundary
  geom_vline(xintercept = (mu_f + mu_m) / 2, linewidth=1) +
  ggtitle("Decision boundary\n (dotted lines indicate normal approximations)") +
  theme_minimal()

# Because the standard deviations were pooled (averaged), the decision boundary does not pass exactly through the intersection of the empirical class densities

Work with both height and weight

As with Logistic Regression, let’s include all features in our model.

# run LDA function
lda_hw <- lda(Gender ~ ., life_data)
lda_hw
## Call:
## lda(Gender ~ ., data = life_data)
## 
## Prior probabilities of groups:
## Female   Male 
##    0.5    0.5 
## 
## Group means:
##          Height   Weight
## Female 161.8203 61.68048
## Male   175.3269 84.90736
## 
## Coefficients of linear discriminants:
##                LD1
## Height -0.06766107
## Weight  0.15646959
gender_pred <- predict(lda_hw, life_data)$class

# Accuracy: roughly 9 percentage points higher than with Height only
my_accuracy(gender_pred, life_data$Gender)
## 
##  Accuracy: 0.9193
decisionplot(lda_hw, life_data, class = "Gender", main = "LDA", resolution=200)

# Question: Again, why is the decision boundary linear?
# Short answer: Using the formula for the normal density and setting P(man|X) = P(woman|X), we can
# derive an expression for the boundary, which is linear in X.
# For details refer to "An Introduction to Statistical Learning with Applications in R", page 142
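
Spelling this out: with a shared covariance matrix \(\Sigma\) and equal priors, \(\log\frac{Pr(Male|x)}{Pr(Female|x)} = x^T\Sigma^{-1}(\mu_M-\mu_F) - \frac{1}{2}(\mu_M+\mu_F)^T\Sigma^{-1}(\mu_M-\mu_F)\); the quadratic term \(x^T\Sigma^{-1}x\) cancels between the two classes, so setting the log-ratio to zero yields a boundary that is linear in \(x\).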

With two features, the Bayes formula extends to: \(Pr(Male|Height,Weight) = \frac{Pr(Height,Weight|Male)} {Pr(Height,Weight|Male) + Pr(Height,Weight|Female)}\)

# estimate parameters of distributions, substitute them into the Bayes formula
library(mvtnorm)
mu_f <- colMeans(life_data[life_data$Gender=="Female",c(2,3)])
mu_m <- colMeans(life_data[life_data$Gender=="Male",c(2,3)])
cov_f <- cov(life_data[life_data$Gender=="Female",c(2,3)])
cov_m <- cov(life_data[life_data$Gender=="Male",c(2,3)])
cov_avg <- (cov_f + cov_m)/2

# Again, compute the posterior for the samples (simplified by the fact that P(Male)=P(Female))
postM <- dmvnorm(life_data[c(2,3)], mu_m, cov_avg) / 
  (dmvnorm(life_data[c(2,3)], mu_m, cov_avg) + dmvnorm(life_data[c(2,3)], mu_f, cov_avg))
head(postM)
##          1          2          3          4          5          6 
## 0.99999416 0.27675255 0.99815610 0.99985189 0.99912096 0.09998952
gender_pred <- postM > 0.5

my_accuracy(gender_pred, life_data$Gender)
## 
##  Accuracy: 0.9193
# The train accuracy of Logistic regression was 0.9194

QDA

# run QDA
qda_res <- qda(Gender ~ .,life_data)
qda_res
## Call:
## qda(Gender ~ ., data = life_data)
## 
## Prior probabilities of groups:
## Female   Male 
##    0.5    0.5 
## 
## Group means:
##          Height   Weight
## Female 161.8203 61.68048
## Male   175.3269 84.90736
gender_pred <- predict(qda_res, life_data)$class

# Evaluate accuracy
confusion_mat <- table(gender_pred, life_data$Gender)
(confusion_mat %>% diag %>% sum) / sum(confusion_mat) # almost the same train accuracy as LDA; the extra flexibility of QDA does not help here
## [1] 0.9192
decisionplot(qda_res, life_data, class = "Gender", main = "QDA", resolution=200) # decision boundary close to linear

qda_res$scaling # both the genders scale nearly equally (scaling transforms the observations so that within-group covariance matrices are spherical)
## , , Female
## 
##                1          2
## Height 0.1460161 -0.2352167
## Weight 0.0000000  0.2195462
## 
## , , Male
## 
##                1          2
## Height -0.137496 -0.2348510
## Weight  0.000000  0.2203915
var(as.matrix(subset(life_data, Gender=="Male",2:3)) %*% qda_res$scaling[,,"Male"]) # after scaling, the group variances equal 1 and covariances 0
##               1             2
## 1  1.000000e+00 -1.795308e-15
## 2 -1.795308e-15  1.000000e+00
# The decision boundary of logistic regression seems indistinguishable from that of LDA
# The decision boundary is linear!
decisionplot(lr_hw, life_data, class = "Gender", main = "Logistic regression", resolution=200) 

# Look, however, at the coefficients of Logistic regression and LDA. Why are they different and what meaning do they have?
print("Logistic regression coefficients:")
## [1] "Logistic regression coefficients:"
coef(lr_hw)
## (Intercept)      Height      Weight 
##   0.6925431  -0.1939449   0.4368732
print("LDA coefficients:")
## [1] "LDA coefficients:"
coef(lda_hw)
##                LD1
## Height -0.06766107
## Weight  0.15646959
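# Sketch of the connection (reusing mu_m, mu_f and cov_avg from above): on the
# log-odds scale LDA implies the slope solve(Sigma) %*% (mu_M - mu_F); lda()'s
# scaling points in the same direction, only rescaled so that the within-group
# variance of LD1 equals 1, whereas logistic regression estimates the log-odds
# slope directly.
solve(cov_avg) %*% (mu_m - mu_f)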

Independent work

  1. Which method is expected to work best on test data in this task (LDA, QDA or LR)? Answer without testing first, using your knowledge of the assumptions of the individual methods.
  2. Experimentally verify your answer. Note that you may need to work with different (and sometimes very small) train sets to see any difference … (one possible setup is sketched below)
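
A sketch of one possible experiment for item 2 (the training-set size, the number of repetitions and the use of plain random splits are assumptions, not part of the original exercise):

# Repeatedly draw a small training set, fit all three models and measure
# accuracy on the held-out data
set.seed(1)
n_train <- 50   # try other sizes; very small sets may make qda() fail
accs <- replicate(100, {
  idx   <- sample(nrow(life_data), n_train)
  train <- life_data[idx, ]
  test  <- life_data[-idx, ]
  c(lda = mean(predict(lda(Gender ~ ., train), test)$class == test$Gender),
    qda = mean(predict(qda(Gender ~ ., train), test)$class == test$Gender),
    lr  = mean((predict(glm(Gender ~ ., train, family = binomial),
                        test, type = "response") > 0.5) ==
                 (test$Gender == "Male")))
})
rowMeans(accs)  # average test accuracy of each method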