Personal code snippets of @tmasjc

Site powered by Hugo + Blogdown

Image by Mads Schmidt Rasmussen from unsplash.com

Minimal Bootstrap Theme by Zachary Betz

Pseudo R Squared

Nov 26, 2018 #descr

library(tidyverse)
library(descr)

# make reproducible 
set.seed(1234)
# number of points
nn <- 100

# An Simulated Model --------------------------------------------------------

# simulate binary response
y <- rbinom(nn, 1, prob = .5)
# single predictor
x <- y + rnorm(nn, mean = 1, sd = .5)
cor(x, y)
## [1] 0.7404515
# null model (without any predictor)
null_mod <- glm(y ~  1, family = "binomial")
# we know x is inheritly random
some_mod <- glm(y ~ x, family = "binomial")

# glance 
summary(some_mod)
## 
## Call:
## glm(formula = y ~ x, family = "binomial")
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.09191  -0.31476  -0.06342   0.43491   1.85033  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -7.9100     1.5971  -4.953 7.32e-07 ***
## x             4.9034     0.9643   5.085 3.68e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 137.63  on 99  degrees of freedom
## Residual deviance:  59.99  on 98  degrees of freedom
## AIC: 63.99
## 
## Number of Fisher Scoring iterations: 6
# the same result
1 - logLik(some_mod) / logLik(null_mod)
## 'log Lik.' 0.5641121 (df=2)
1 - some_mod$deviance / some_mod$null.deviance
## [1] 0.5641121
# from package
descr::LogRegR2(some_mod)
## Chi2                 77.63748 
## Df                   1 
## Sig.                 0 
## Cox and Snell Index  0.5399292 
## Nagelkerke Index     0.7223295 
## McFadden's R2        0.5641121
# Pseudo Rsq ~ Predictor Power ------------------------------------------

# vary predictor power by adding various degree of deviation
# return a list of int vector
ints <- map(seq(.1, .8, .01), ~ y + rnorm(nn, mean = 1, sd = .x))

# fit and extract pseudo rsq
get_pseudo_rsq <- function(vec) {
  fit <- glm(y ~ vec, family = "binomial")
  pseudo_rsq <- LogRegR2(fit)
  # return
  with(pseudo_rsq, list(
    corr = cor(vec, y),
    `McFadden` = RL2,
    `Cox & Snell` = CoxR2,
    `Nagelkerke` = NagelkerkeR2
  ))
} 
rsqs <- map(ints, get_pseudo_rsq)

# visualize corresponding pseudo rsq by correlation
rsqs %>% 
  bind_rows() %>% 
  gather(met, rsq, -corr) %>% 
  ggplot(aes(corr, rsq, col = met)) + 
  geom_line() + 
  geom_hline(yintercept = .5, col = "darkgray", lty = 3) + 
  theme_minimal(base_family = "Menlo") + 
  labs(x = "Correlation", y = "Pseudo R Squared", col = "Variant")

For more info, see https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faq-what-are-pseudo-r-squareds/