Code Monkey home page Code Monkey logo

simulate_fstat's Introduction

Simulating F-statistics

library(tidyverse)
## ── Attaching packages ───────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ──────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Let’s create 5000 data points from a multi-group comparison with no difference in group means

n_data <- 5000
n_per_group <- 10
n_groups <- 3
overall_mean <- 0
overal_sd <- 1
set.seed(1)
sim_df <- bind_rows(lapply(seq_len(5000), function(x){
  tibble(
    id = x,
    value = rnorm(n = n_per_group * n_groups, 
                  mean = overall_mean,
                  sd = overal_sd),
    group = rep(letters[seq_len(n_groups)], each = 10)
  )
}))

As an example we will inspect one of the instances:

filter(sim_df, id == 20) %>% 
  ggplot(aes(x = group, y = value)) +
  geom_dotplot(binaxis='y', stackdir='center') +
  stat_summary(fun=mean, geom="point", 
               shape=18, color="red")
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

Now we will compute, both the group mean and the overall mean for each instance (unique id):

sim_df <- sim_df %>% 
  group_by(id) %>% 
  mutate(obs_overall_mean = mean(value)) %>% 
  group_by(id, group) %>% 
  mutate(obs_group_mean = mean(value)) %>% 
  ungroup()

sim_df
## # A tibble: 150,000 x 5
##       id  value group obs_overall_mean obs_group_mean
##    <int>  <dbl> <chr>            <dbl>          <dbl>
##  1     1 -0.626 a               0.0825          0.132
##  2     1  0.184 a               0.0825          0.132
##  3     1 -0.836 a               0.0825          0.132
##  4     1  1.60  a               0.0825          0.132
##  5     1  0.330 a               0.0825          0.132
##  6     1 -0.820 a               0.0825          0.132
##  7     1  0.487 a               0.0825          0.132
##  8     1  0.738 a               0.0825          0.132
##  9     1  0.576 a               0.0825          0.132
## 10     1 -0.305 a               0.0825          0.132
## # … with 149,990 more rows

And now we’ll compute the residuals within groups and globally:

sim_df <- sim_df %>% 
  mutate(overall_residuals = value - obs_overall_mean,
         group_residuals = value - obs_group_mean) 

sim_df
## # A tibble: 150,000 x 7
##       id  value group obs_overall_mean obs_group_mean overall_residua…
##    <int>  <dbl> <chr>            <dbl>          <dbl>            <dbl>
##  1     1 -0.626 a               0.0825          0.132           -0.709
##  2     1  0.184 a               0.0825          0.132            0.101
##  3     1 -0.836 a               0.0825          0.132           -0.918
##  4     1  1.60  a               0.0825          0.132            1.51 
##  5     1  0.330 a               0.0825          0.132            0.247
##  6     1 -0.820 a               0.0825          0.132           -0.903
##  7     1  0.487 a               0.0825          0.132            0.405
##  8     1  0.738 a               0.0825          0.132            0.656
##  9     1  0.576 a               0.0825          0.132            0.493
## 10     1 -0.305 a               0.0825          0.132           -0.388
## # … with 149,990 more rows, and 1 more variable: group_residuals <dbl>

Next, we will compute residual sum of squares

rss_df <- sim_df %>% 
  group_by(id) %>% 
  summarise(RSS_0 = sum(overall_residuals^2),
            RSS_1 = sum(group_residuals^2)) %>% 
  ungroup
## `summarise()` ungrouping output (override with `.groups` argument)
hist(rss_df$RSS_0, probability = TRUE)
lines(seq(min(rss_df$RSS_0), max(rss_df$RSS_0), 0.1),
      dchisq(seq(min(rss_df$RSS_0), max(rss_df$RSS_0), 0.1), df = n_groups * n_per_group - 1), col = "red")

hist(rss_df$RSS_1, probability = TRUE)
lines(seq(min(rss_df$RSS_1), max(rss_df$RSS_1), 0.1),
      dchisq(seq(min(rss_df$RSS_1), max(rss_df$RSS_1), 0.1), df = n_groups * n_per_group - n_groups), col = "red")

hist(rss_df$RSS_0 - rss_df$RSS_1, probability = TRUE)
lines(seq(min(rss_df$RSS_0 - rss_df$RSS_1), 
          max(rss_df$RSS_0 - rss_df$RSS_1), 0.1),
      dchisq(seq(min(rss_df$RSS_0 - rss_df$RSS_1), 
                 max(rss_df$RSS_0 - rss_df$RSS_1), 0.1), df = n_groups - 1), col = "red")

d1 <- n_groups - 1
d2 <- n_groups * n_per_group - n_groups
Fstat <- (rss_df$RSS_0 - rss_df$RSS_1)/rss_df$RSS_1 * d2/d1

hist(Fstat, probability = TRUE)
lines(seq(min(Fstat), max(Fstat), 0.1),
      df(seq(min(Fstat), max(Fstat), 0.1), 
         df1 = d1, df2 = d2), col = "red")

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.