The dataset contains the weight, height, and gender of 10,000 individuals and was taken from https://www.kaggle.com/mustafaali96/weight-height
library(MASS)
library(dplyr)
library(ggplot2)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
life_data <- read.csv("weight-height.csv")
# height in inches, weight in pounds, transform to cm and kg
life_data <- life_data %>%
  mutate(Height = Height * 2.54,
         Weight = Weight * 0.454,
         Gender = as.factor(Gender))
glimpse(life_data)
## Rows: 10,000
## Columns: 3
## $ Gender <fct> Male, Male, Male, Male, Male, Male, Male, Male, Male, Male, Mal…
## $ Height <dbl> 187.5714, 174.7060, 188.2397, 182.1967, 177.4998, 170.8227, 174…
## $ Weight <dbl> 109.81968, 73.68895, 96.58435, 99.89928, 93.68281, 69.10432, 83…
First, consider only the Height.
# visualize the data
life_data %>% ggplot(aes(x=Gender, y=Height, fill=Gender)) +
  geom_boxplot()
life_data %>% ggplot(aes(x=Height, fill=Gender)) +
  geom_density(alpha=0.6)
# run Logistic regression - start with a simple Height model
lr <- glm(Gender ~ Height, life_data, family=binomial)
# Look at the model - does the coefficient for Height have any (meaningful) interpretation?
# You may want to recall what quantity Y = B0+B1x1+... represents in logistic regression
lr
##
## Call: glm(formula = Gender ~ Height, family = binomial, data = life_data)
##
## Coefficients:
## (Intercept) Height
## -45.3001 0.2689
##
## Degrees of Freedom: 9999 Total (i.e. Null); 9998 Residual
## Null Deviance: 13860
## Residual Deviance: 7542 AIC: 7546
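One possible reading of the Height coefficient (a sketch, not part of the original exercise): the linear predictor models the log-odds of being Male, so exponentiating the coefficient gives the multiplicative change in those odds per additional centimetre of height.
# (sketch) odds ratio of being Male per extra cm of height
exp(coef(lr)["Height"])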
# Obtain the probability of being Male for the training data
lr_prob <- predict(lr, life_data, type="response")
gender_pred <- lr_prob > 0.5
# Compare the two calls below. Can you guess how they differ and how to get from one to the other?
# HINT: Recall how to get from Y = B0+B1x1+... to the probability
predict(lr, life_data, type="response") %>% head
## 1 2 3 4 5 6
## 0.9941160 0.8416663 0.9950788 0.9755074 0.9184738 0.6517203
predict(lr, life_data) %>% head
## 1 2 3 4 5 6
## 5.1296161 1.6706788 5.3092777 3.6845852 2.4217888 0.6266095
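A quick way to check the hint (a sketch): the second call returns the linear predictor, i.e. the log-odds, and applying the inverse logit (the sigmoid) recovers the probabilities from the first call.
# (sketch) plogis() is the inverse logit 1/(1 + exp(-x))
predict(lr, life_data) %>% plogis %>% head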
my_accuracy <- function(preds, reference, print_confmat=FALSE){
  confusion_mat <- table(preds, reference)
  acc <- (confusion_mat %>% diag %>% sum) / sum(confusion_mat)
  if(print_confmat){
    cat("\nConfusion matrix:\n")
    print(confusion_mat)
  }
  cat("\n", sprintf("Accuracy: %s", acc), "\n")
}
# Evaluate accuracy of Logistic regression
my_accuracy(gender_pred, life_data$Gender, print_confmat = TRUE)
##
## Confusion matrix:
## reference
## preds Female Male
## FALSE 4171 845
## TRUE 829 4155
##
## Accuracy: 0.8326
# Prepare data to plot the sigmoid
sigmoid_data <- life_data %>%
  mutate(lr_prob = lr_prob,
         gender_num = as.numeric(Gender) - 1)
# Using the sigmoid data, estimate the decision boundary of the logistic regression
# Later you will see a better way to find the decision boundary analytically
lr_decisionBoundary <- sigmoid_data %>% filter(lr_prob > 0.5) %>% pull(Height) %>% min()
sprintf("LR decision boundary: %s", lr_decisionBoundary)
## [1] "LR decision boundary: 168.497650366066"
# Plot the sigmoid
sigmoid_data %>%
  ggplot(aes(x=Height, y=lr_prob)) +
  geom_point(col="red") +
  geom_point(aes(x=Height, y=gender_num)) +
  # Logistic regression decision boundary
  geom_vline(xintercept = lr_decisionBoundary, col="blue") +
  geom_hline(yintercept = 0.5, linetype="dotted") +
  ylab("P(Gender=Male|Height)")
# First, observe how the joint distribution of Height and Weight looks for each gender
life_data %>%
  ggplot(aes(x=Height, y=Weight, col=Gender)) +
  geom_point(alpha=0.1)
lr_hw <- glm(Gender ~ ., life_data, family=binomial)
lr_hw
##
## Call: glm(formula = Gender ~ ., family = binomial, data = life_data)
##
## Coefficients:
## (Intercept) Height Weight
## 0.6925 -0.1939 0.4369
##
## Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
## Null Deviance: 13860
## Residual Deviance: 4183 AIC: 4189
# Predict the gender and assess the accuracy
gender_pred <- predict(lr_hw, life_data, type = "response") > 0.5
my_accuracy(gender_pred, life_data$Gender)
##
## Accuracy: 0.9194
# Plotting decision boundary
class(lr_hw) <- c("lr", class(lr_hw)) # a bit of adjusting for plotting the decision boundary
# the decision boundary is linear!
predict.lr <- function(object, newdata, ...)
  predict.glm(object, newdata, type = "response") > .5
decisionplot(lr_hw, life_data, class = "Gender", main = "Logistic regression", resolution=200)
# Question: Why is the logistic regression decision boundary linear? Can you derive the equation?
# HINT: Refer again to the equation of LR. What probability do the points lying on the decision boundary have?
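A sketch of the answer: points on the boundary satisfy P(Male|Height,Weight) = 0.5, i.e. zero log-odds, so B0 + B1*Height + B2*Weight = 0, which is a straight line in the Height-Weight plane. Expressed in R (the helper names below are illustrative):
# (sketch) boundary line Weight = a + b * Height, obtained by setting the log-odds to 0
b_coefs <- coef(lr_hw)
boundary_intercept <- unname(-b_coefs["(Intercept)"] / b_coefs["Weight"])
boundary_slope     <- unname(-b_coefs["Height"] / b_coefs["Weight"])
c(intercept = boundary_intercept, slope = boundary_slope)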
# run LDA
lda_res <- lda(Gender ~ Height, life_data)
lda_res
## Call:
## lda(Gender ~ Height, data = life_data)
##
## Prior probabilities of groups:
## Female Male
## 0.5 0.5
##
## Group means:
## Height
## Female 161.8203
## Male 175.3269
##
## Coefficients of linear discriminants:
## LD1
## Height 0.1415641
# Take a look at the predicted posterior probabilities for each gender
gender_pred <- predict(lda_res, life_data)
# View(gender_pred$posterior)
# Check the confusion matrix and the accuracy of predictions
my_accuracy(gender_pred$class,
            life_data$Gender,
            print_confmat = TRUE)
##
## Confusion matrix:
## reference
## preds Female Male
## Female 4189 861
## Male 811 4139
##
## Accuracy: 0.8328
# Can you guess the threshold value (i.e. the decision boundary) by which we discriminate between the genders?
# You can use this DF
gender_pred_df <- data.frame(
  Height = life_data$Height,
  Gender_pred = gender_pred$class
)
# Un-comment to view the predictions
# View(gender_pred_df)
lda_decisionBoundary <- gender_pred_df %>%
  filter(Gender_pred == "Female") %>%
  dplyr::select(Height) %>%
  max
# compared to lr_decisionBoundary, there is only a tiny difference
lda_decisionBoundary
## [1] 168.5736
lr_decisionBoundary
## [1] 168.4977
The core of LDA is Bayes' theorem: \(Pr(Male|Height) = \frac{Pr(Height|Male)}{Pr(Height|Male) + Pr(Height|Female)}\). Note that this is a simplified form of the theorem, since the prior probabilities of the genders are equal, \(P(Male)=P(Female)\). LDA further assumes the distribution of heights within each gender to be normal, \(Height|Male \sim \mathcal{N}(\mu,\sigma^2)\). All you need to do is estimate the parameters of the normal distributions and use them in the Bayes formula.
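The code chunk that produced the output below is not shown in this excerpt. A minimal sketch of how it could look, assuming the names mu_f, mu_m and sd_all (reused in the plotting chunk further below) and taking sd_all as the average of the two group standard deviations:
# (sketch) estimate the per-gender normal parameters for Height
mu_f   <- mean(life_data$Height[life_data$Gender == "Female"])
mu_m   <- mean(life_data$Height[life_data$Gender == "Male"])
sd_all <- mean(c(sd(life_data$Height[life_data$Gender == "Female"]),
                 sd(life_data$Height[life_data$Gender == "Male"])))
# posterior probability of Male via the Bayes formula (the equal priors cancel)
postM_height <- dnorm(life_data$Height, mu_m, sd_all) /
  (dnorm(life_data$Height, mu_m, sd_all) + dnorm(life_data$Height, mu_f, sd_all))
head(postM_height)
# compare with the posterior returned by lda()
head(gender_pred$posterior)
my_accuracy(postM_height > 0.5, life_data$Gender)
# with equal priors and a shared sd, the boundary lies midway between the group means
(mu_f + mu_m) / 2
lda_decisionBoundary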
## [1] 0.9942165 0.8404279 0.9951696 0.9756555 0.9182121 0.6477859
## Female Male
## 1 0.005810296 0.9941897
## 2 0.159773209 0.8402268
## 3 0.004853568 0.9951464
## 4 0.024423757 0.9755762
## 5 0.081951918 0.9180481
## 6 0.352339590 0.6476604
##
## Accuracy: 0.8328
## [1] 168.5736
## [1] 168.5736
# Create normal approximations of our data (i.e. the "ideal" data that LDA internally works with)
height_seq <- seq(min(life_data$Height), max(life_data$Height), length.out=nrow(life_data)/2)
ideal_life_data <- data.frame(
  height=height_seq,
  # Densities of heights under the normality assumption
  Female=dnorm(height_seq, mean=mu_f, sd=sd_all),
  Male = dnorm(height_seq, mean=mu_m, sd=sd_all)
)
# Reshape for plotting purposes
ideal_life_data <- ideal_life_data %>%
  reshape2::melt(id.vars="height", variable.name = "Gender")
# Plot the distributions, their normal approximations and the decision boundary (a point in this 1D case)
life_data %>% ggplot(aes(x=Height, fill=Gender)) +
  geom_density(alpha=0.6) +
  geom_line(data = ideal_life_data, aes(x=height, y=value), linetype = 2) +
  # Add the decision boundary
  geom_vline(xintercept = (mu_f + mu_m) / 2, linewidth=1) +
  ggtitle("Decision boundary\n (dashed lines indicate normal approximations)") +
  theme_minimal()
# Because the standard deviations are averaged, the decision boundary does not pass exactly through the intersection of the empirical class densities
As with Logistic Regression, let’s include all features in our model.
# run LDA function
lda_hw <- lda(Gender ~ ., life_data)
lda_hw
## Call:
## lda(Gender ~ ., data = life_data)
##
## Prior probabilities of groups:
## Female Male
## 0.5 0.5
##
## Group means:
## Height Weight
## Female 161.8203 61.68048
## Male 175.3269 84.90736
##
## Coefficients of linear discriminants:
## LD1
## Height -0.06766107
## Weight 0.15646959
gender_pred <- predict(lda_hw, life_data)$class
# Accuracy: about 9 percentage points better than with Height only
my_accuracy(gender_pred, life_data$Gender)
##
## Accuracy: 0.9193
decisionplot(lda_hw, life_data, class = "Gender", main = "LDA", resolution=200)
# Question: Again, why is the decision boundary linear?
# Short answer: using the normal density formula and setting P(Male|X) = P(Female|X), we can
# derive an expression for the boundary, which is linear in X.
# For details refer to "An Introduction to Statistical Learning", page 142.
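A sketch of that derivation: with equal priors and a shared covariance matrix \(\Sigma\), setting \(P(Male|x)=P(Female|x)\) and cancelling the common quadratic term \(x^T\Sigma^{-1}x\) leaves \(x^T\Sigma^{-1}(\mu_M-\mu_F) = \frac{1}{2}\left(\mu_M^T\Sigma^{-1}\mu_M - \mu_F^T\Sigma^{-1}\mu_F\right)\), which is linear in \(x\).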
With two features, the Bayes formula extends to: \(Pr(Male|Height,Weight) = \frac{Pr(Height,Weight|Male)}{Pr(Height,Weight|Male) + Pr(Height,Weight|Female)}\)
# estimate parameters of distributions, substitute them into the Bayes formula
library(mvtnorm)
mu_f <- colMeans(life_data[life_data$Gender=="Female",c(2,3)])
mu_m <- colMeans(life_data[life_data$Gender=="Male",c(2,3)])
cov_f <- cov(life_data[life_data$Gender=="Female",c(2,3)])
cov_m <- cov(life_data[life_data$Gender=="Male",c(2,3)])
cov_avg <- (cov_f + cov_m)/2
# Again, compute the posterior for the samples (simplified by the fact that P(Male)=P(Female))
postM <- dmvnorm(life_data[c(2,3)], mu_m, cov_avg) /
  (dmvnorm(life_data[c(2,3)], mu_m, cov_avg) + dmvnorm(life_data[c(2,3)], mu_f, cov_avg))
head(postM)
## 1 2 3 4 5 6
## 0.99999416 0.27675255 0.99815610 0.99985189 0.99912096 0.09998952
gender_pred <- postM > 0.5
my_accuracy(gender_pred, life_data$Gender)
##
## Accuracy: 0.9193
# The train accuracy of Logistic regression was 0.9194
## QDA
# run QDA
qda_res <- qda(Gender ~ .,life_data)
qda_res
## Call:
## qda(Gender ~ ., data = life_data)
##
## Prior probabilities of groups:
## Female Male
## 0.5 0.5
##
## Group means:
## Height Weight
## Female 161.8203 61.68048
## Male 175.3269 84.90736
gender_pred <- predict(qda_res, life_data)$class
# Evaluate accuracy
confusion_mat <- table(gender_pred, life_data$Gender)
(confusion_mat %>% diag %>% sum) / sum(confusion_mat) # almost the same train accuracy as LDA, more freedom does not help here
## [1] 0.9192
decisionplot(qda_res, life_data, class = "Gender", main = "QDA", resolution=200) # decision boundary close to linear
qda_res$scaling # both the genders scale nearly equally (scaling transforms the observations so that within-group covariance matrices are spherical)
## , , Female
##
## 1 2
## Height 0.1460161 -0.2352167
## Weight 0.0000000 0.2195462
##
## , , Male
##
## 1 2
## Height -0.137496 -0.2348510
## Weight 0.000000 0.2203915
var(as.matrix(subset(life_data, Gender=="Male",2:3)) %*% qda_res$scaling[,,"Male"]) # after scaling, the group variances equal 1 and covariances 0
## 1 2
## 1 1.000000e+00 -1.795308e-15
## 2 -1.795308e-15 1.000000e+00
# The decision boundary of logistic regression seems to be indistinguishable from that of LDA
# The decision boundary is linear!
decisionplot(lr_hw, life_data, class = "Gender", main = "Logistic regression", resolution=200)
# However, look at the coefficients of Logistic regression and LDA. Why are they different and what meaning do they have?
print("Logistic regression coefficients:")
## [1] "Logistic regression coefficients:"
coef(lr_hw)
## (Intercept) Height Weight
## 0.6925431 -0.1939449 0.4368732
print("LDA coefficients:")
## [1] "LDA coefficients:"
coef(lda_hw)
## LD1
## Height -0.06766107
## Weight 0.15646959
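One way to see the difference (a sketch, not from the original): logistic-regression coefficients live on the log-odds scale, while lda() reports the weights of the discriminant direction LD1, scaled so that the pooled within-group variance of the LD1 scores is 1; hence only their relative direction is directly comparable.
# (sketch) per-group variances of the LD1 scores should be approximately 1 by construction
ld1_scores <- as.matrix(life_data[, c("Height", "Weight")]) %*% lda_hw$scaling
tapply(as.vector(ld1_scores), life_data$Gender, var)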