# Just in case we get errors asking that the package repositories be explicitly set when installing new packages:
options(repos = list(CRAN = "http://cran.rstudio.com/"))
# Install and load required packages
# We can use the {pacman} package to easily install and load several packages:
# ({pacman} itself may need installing if not yet installed)
::p_load(
pacman
tidyverse, sjlabelled, easystats,
ggformula, ggeffects, marginaleffects,
modelsummary, gtsummary,# for some further model dataframe/tibble management functions
modelr, # for the function `mvrnorm` to generate variables with pre-specified covariance structure
MASS )
Week 8 Worksheet Notes
Study design: Simulation-based power analysis for study design
Aims
By the end of the session you will learn how to perform:
- Simulation of random variables
- Power analysis via simulation
R packages
The uses of simulation methods
Simulating data uses generating random data sets with known properties using code (or some other method). This can be useful in various contexts.
- To better understand our models. Probability models mimic variation in the world, and the tools of simulation can help us better understand this variation. Patterns of randomness are contrary to normal human thinking and simulation helps in training our intuitions about averages and variation
- To run statistical analyses (e.g., simulating a null distribution against which to compare a sample)
- To approximate the sampling distribution of data and propagate this to the sampling distribution of statistical estimates and procedures
Distribution functions
Base R functions:
rnorm()
: sampling from a normal distributionrunif()
: sampling from a uniform distributionrbinom()
: sampling from a binomial distributionrpois()
: sampling from a Poisson distribution
(Other distributions are also available)
sample()
: sampling elements from an R object with or without replacementreplicate()
: often plays a role in conjunction with sampling functions; it is used to evaluate an expressionN
number of times repeatedly
Not base-R packages:
MASS::mvtnorm()
: multivariate normal; sampling multiple variables with a known correlation structure (i.e., we can tell R how variables should be correlated with one another) and normally distributed errors
Sampling from a uniform distribution
The runif
function returns some number (n
) of random numbers from a uniform distribution with a range from \(a\) (min
) to \(b\) (max
) such that \(X\sim\mathcal U(a,b)\) (verbally, \(X\) is sampled from a uniform distribution with the parameters \(a\) and \(b\)), where \(-\infty < a < b < \infty\) (verbally, \(a\) is greater than negative infinity but less than \(b\), and \(b\) is finite). The default is to draw from a standard uniform distribution (i.e., \(a = 0\) and \(b = 1\)):
# Sample a vector of ten numbers and store the results in the object `rand_unifs`
# Note that the numbers will be different each time we re-run the `runif` function above.
# If we want to recreate the same sample, we should set a `seed` number first
<- runif(n = 10000, min = 0, max = 1); rand_unifs
The first 40 numbers from the sample are:
[1] 0.37502992 0.68982595 0.98590044 0.71059641 0.06132830 0.17670731
[7] 0.79833820 0.34533205 0.85024223 0.63665065 0.63472925 0.81505794
[13] 0.88980845 0.09416752 0.55637083 0.99175606 0.21588906 0.71801451
[19] 0.41727368 0.63044807 0.55003150 0.44468624 0.77229814 0.84770886
[25] 0.23830448 0.89719279 0.91995817 0.97298726 0.16592863 0.50980450
[31] 0.60880849 0.17703089 0.30092050 0.65307591 0.96833831 0.47484581
[37] 0.06716160 0.23575399 0.96125830 0.54294399 0.14238520 0.69111469
[43] 0.51297232 0.06054244 0.58162491 0.23383184 0.70888075 0.10920273
[49] 0.47519353 0.83015849
To visualise the entire sample, we can plot it on a histogram:
Sampling from a normal distribution
The rnorm
function returns some number (n
) of randomly generated values given a set mean (\(\mu\); mean
) and standard deviation (\(\sigma\); sd
), such that \(X\sim\mathcal N(\mu,\sigma^2)\). The default is to draw from a standard normal (a.k.a., “Gaussian”) distribution (i.e., \(\mu = 0\) and \(\sigma = 1\)):
<- rnorm(n = 10000, mean = 0, sd = 1)
rand_norms_10000
print(rand_norms_10000[1:20])
[1] 0.04627100 1.03334482 -1.84240888 1.62460072 1.60980100 1.65705963
[7] 2.15433314 0.09817849 -0.41576874 -2.78007368 0.31180445 -0.40322909
[13] 1.58767242 0.52701584 -0.28336089 -1.14416504 0.15796135 0.13720856
[19] 0.22746198 -0.88482968
- Histograms allow us to check how samples from the same distribution might vary.
Sampling from a Poisson distribution
A Poisson process describes events happening with some given probability over an area of time or space such that \(X\sim Poisson(\lambda)\), where the rate parameter \(\lambda\) is both the mean and variance of the Poisson distribution (note that by definition, \(\lambda > 0\), and although \(\lambda\) can be any positive real number, data are always integers, as with count data).
Sampling from a Poisson distribution can be done in R with rpois
, which takes only two arguments specifying the number of values to be returned (n
) and the rate parameter (lambda
). There are no default values for rpois
.
<- rpois(n = 10, lambda = 1.5)
rand_poissons
print(rand_poissons)
[1] 0 7 1 3 1 2 1 0 3 4
A histogram of a large number of values to see the distribution when \(\lambda = 4.5\):
<- rpois(n = 10000, lambda = 4.5) rand_poissons_10000
Sampling from a binomial distribution
A binomial distribution describes the number of ‘successes’ for some number of independent trials (\(\Pr(success) = p\)).
The
rbinom
function returns the number of successes aftersize
trials, in which the probability of success in each trial isprob
.Sampling from a binomial distribution in R with
rbinom
is a bit more complex than usingrunif
,rnorm
, orrpois
.Like those previous functions, the
rbinom
function returns some number (n
) of random numbers, but the arguments and output can be slightly confusing at first.For example, suppose we want to simulate the flipping of a fair coin 1000 times, and we want to know how many times that coin comes up heads (‘success’). We can do this with the following code:
<- rbinom(n = 1, size = 1000, prob = 0.5)
coin_flips
coin_flips
[1] 515
The above result shows that the coin came up heads 515 times. But note the (required) argument
n
. This allows us to set the number of sequences to run.If we instead set
n = 2
, then this could simulate the flipping of a fair coin 1000 times once to see how many times heads comes up, then repeating the whole process a second time to see how many times heads comes up again (or, if it is more intuitive, the flipping of two separate fair coins 1000 times at the same time).
<- rbinom(n = 2, size = 1000, prob = 0.5)
coin_flips_2
coin_flips_2
[1] 483 504
A coin was flipped 1000 times and returned 483 heads, and then another fair coin was flipped 1000 times and returned 504 heads.
As with the
rnorm
andrunif
functions, we can check to see what the distribution of the binomial function looks like if we repeat this process.Suppose that we want to see the distribution of the number of times heads comes up after 1000 flips. We can simulate the process of flipping 1000 times in a row with 10000 different coins:
<- rbinom(n = 10000, size = 1000, prob = 0.5) coin_flips_10000
Random sampling using sample
Sometimes it is useful to sample a set of values from a vector or list. The R function
sample
is very flexible for sampling a subset of numbers or elements from some structure (x
) in R according to some set probabilities (prob
).Elements can be sampled from
x
some number of times (size
) with or without replacement (replace
), though an error will be returned if thesize
of the sample is larger thanx
butreplace = FALSE
(default).Suppose we want to ask R to pick a random number from one to ten with equal probability:
<- sample(x = 1:10, size = 1)
rand_number_1
print(rand_number_1)
[1] 5
- We can increase the
size
of the sample to10
:
<- sample(x = 1:10, size = 10)
rand_number_10 print(rand_number_10)
[1] 8 6 9 7 2 5 1 3 4 10
Note that all numbers from 1 to 10 have been sampled, but in a random order. This is because the default is to sample without replacement, meaning that once a number has been sampled for the first element in
rand_number_10
, it is no longer available to be sampled again.We can change this and allow for sampling with replacement:
<- sample(x = 1:10, size = 10, replace = TRUE)
rand_number_10_r
print(rand_number_10_r)
[1] 4 9 3 8 5 3 10 2 5 9
Note that the numbers {3, 5, 9} are now repeated in the set of randomly sampled values above.
So far, because we have not specified a probability vector
prob
, the function assumes that every element in1:10
is sampled with equal probabilityHere’s an example in which the numbers 1-5 are sampled with a probability of 0.05, while the numbers 6-10 are sampled with a probability of 0.15, thereby biasing sampling toward larger numbers; we always need to ensure that these probabilities need to sum to 1.
<- c( rep(x = 0.05, times = 5), rep(x = 0.15, times = 5))
prob_vec
<- sample(x = 1:10, size = 10, replace = TRUE, prob = prob_vec)
rand_num_bias
print(rand_num_bias)
[1] 6 10 9 8 8 9 7 6 6 9
Sampling random characters from a list
We can also sample characters from a list of elements; it is no different than sampling numbers
For example, if we want to create a simulated data set that includes three different species of some plant or animal, we could create a vector of species identities from which to sample:
<- c("species_A", "species_B", "species_C"); species
- We can then sample from these three possible categories. For example:
<- sample(x = species, size = 24, replace = TRUE,
sp_sample prob = c(0.5, 0.25, 0.25))
- What did the code above do?
[1] "species_B" "species_C" "species_C" "species_B" "species_B" "species_C"
[7] "species_B" "species_C" "species_C" "species_C" "species_C" "species_A"
[13] "species_A" "species_B" "species_A" "species_C" "species_C" "species_C"
[19] "species_B" "species_A" "species_A" "species_B" "species_C" "species_A"
Simulating data with known correlations
We can generate variables \(X_{1}\) and \(X_{2}\) that have known correlations \(\rho\) with with one another.
For example: two standard normal random variables with a sample size of 10000, and with correlation between them of 0.3:
<- 10000
N <- 0.3
rho <- rnorm(n = N, mean = 0, sd = 1)
x1 <- (rho * x1) + sqrt(1 - rho*rho) * rnorm(n = N, mean = 0, sd = 1) x2
- These variables are generated by first simulating the sample \(x_{1}\) (
x1
above) from a standard normal distribution. Then, \(x_{2}\) (x2
above) is calculated as
\(x_{2} = \rho x_{1} + \sqrt{1 - \rho^{2}}x_{rand}\),
where \(x_{rand}\) is a sample from a normal distribution with the same variance as \(x_{1}\).
We can generate variables \(X_{1}\) and \(X_{2}\) that have known correlations \(\rho\) with with one another.
For example: two standard normal random variables with a sample size of 10000, and with correlation between them of 0.3:
<- 10000
N <- 0.3
rho <- rnorm(n = N, mean = 0, sd = 1)
x1 <- (rho * x1) + sqrt(1 - rho*rho) * rnorm(n = N, mean = 0, sd = 1) x2
- Does the correlation equal
rho
(with some sampling error)?
cor(x1, x2)
[1] 0.2968118
There is a more efficient way to generate any number of variables with different variances and correlations to one another.
We need to use the MASS library, which can be installed and loaded as below:
In the MASS library, the function
mvrnorm
can be used to generate any number of variables for a pre-specified covariance structure.
Statistical power
Statistical power is defined as the probability, before a study is performed, that a particular comparison will achieve “statistical significance” at some predetermined level (typically a p-value below 0.05), given some assumed true effect size
If a certain effect of interest exists (e.g. a difference between two groups) power is the chance that we actually find the effect in a given study
A power analysis is performed by first hypothesizing an effect size, then making some assumptions about the variation in the data and the sample size of the study to be conducted, and finally using probability calculations to determine the chance of the p-value being below the threshold
The conventional view is that you should avoid low-power studies because they are unlikely to succeed
There are several problems with this view, but it’s often required by research funding bodies
Example 1: Simulating a simple randomized experiment
This example comes from Gelman, Hill, and Vehtari (2020)[Ch. 16]. They demonstrate data simulation for power analysis with an artificial example of a randomized experiment on 100 students designed to test an intervention for improving final exam scores.
Data
set.seed(862)
<- function(n = 100) {
sim_1 <- rnorm(n, mean = 60, sd = 20)
y_if_control <- y_if_control + 5
y_if_treated tibble(
z = rep(0:1, n / 2) |> sample(),
y = if_else(z == 1, y_if_treated, y_if_control)
)
}
<- sim_1() data_1a1
Having simulated the data, we can now compare treated to control outcomes and compute the standard error for the difference.
|>
data_1a1 summarize(
diff = mean(y[z == 1]) - mean(y[z == 0]),
diff_se =
sqrt(sd(y[z == 0])^2 / sum(z == 0) + sd(y[z == 1])^2 / sum(z == 1))
)
# A tibble: 1 × 2
diff diff_se
<dbl> <dbl>
1 4.66 3.80
Equivalently, we can run the regression:
set.seed(619)
<- lm(y ~ z, data = data_1a1, refresh = 0) fit_1a1
Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
extra argument 'refresh' will be disregarded
fit_1a1
Call:
lm(formula = y ~ z, data = data_1a1, refresh = 0)
Coefficients:
(Intercept) z
59.008 4.656
To give a sense of why it would be a mistake to focus on the point estimate, we repeat the above steps for a new batch of 100 students simulated from the model. Here is the result:
set.seed(827)
<- sim_1() data_1a2
set.seed(619)
<- lm(y ~ z, data = data_1a2, refresh = 0) fit_1a2
Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
extra argument 'refresh' will be disregarded
fit_1a2
Call:
lm(formula = y ~ z, data = data_1a2, refresh = 0)
Coefficients:
(Intercept) z
56.19 12.19
A naive read of this table would be that the design with 100 students is just fine, as the estimate is well over two standard errors away from zero. But that conclusion would be a mistake, as the coefficient estimate here is too noisy to be useful.
Including a pre-treatment predictor
Add a pre-test variable simulated independently of the potential outcomes for the final test score.
set.seed(134)
<-
data_1b |>
data_1a1 mutate(x = rnorm(n(), mean = 50, sd = 20))
We can then adjust for pre-test in our regression:
set.seed(619)
<- lm(y ~ z + x, data = data_1b, refresh = 0) fit_1b
Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
extra argument 'refresh' will be disregarded
fit_1b
Call:
lm(formula = y ~ z + x, data = data_1b, refresh = 0)
Coefficients:
(Intercept) z x
62.18784 4.44862 -0.06444
Because the pre-test variable was simulated independently of the potential outcomes for the final test score, the standard error for the coefficient of z
wasn’t reduced.
To perform a realistic simulation, we will now simulate both test scores in a correlated way.
set.seed(822)
<- 100
n
<- rnorm(n, mean = 50, sd = 16)
true_ability <- true_ability + rnorm(n, mean = 0, sd = 12) + 10
y_if_control <- y_if_control + 5
y_if_treated
<-
data_2 tibble(
x = true_ability + rnorm(n, mean = 0, sd = 12),
z = rep(0:1, n / 2) |> sample(),
y = if_else(z == 1, y_if_treated, y_if_control)
)
The simple comparison is equivalent to a regression on the treatment indicator:
set.seed(619)
<- lm(y ~ z, data = data_2, refresh = 0) fit_2a
Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
extra argument 'refresh' will be disregarded
fit_2a
Call:
lm(formula = y ~ z, data = data_2, refresh = 0)
Coefficients:
(Intercept) z
59.671 6.363
And the estimate adjusting for pre-test:
set.seed(619)
<- lm(y ~ z + x, data = data_2, refresh = 0) fit_2b
Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
extra argument 'refresh' will be disregarded
fit_2b
Call:
lm(formula = y ~ z + x, data = data_2, refresh = 0)
Coefficients:
(Intercept) z x
20.673 7.357 0.795
In this case, with a strong dependence between pre-test and post-test, this adjustment has reduced the residual standard deviation by about a quarter.
Example 2: simulating a regression design
We can use simulation to test rather complex study designs
Imagine you are interested in students attitude towards smoking and how it depends on the medium of the message and the focus of the message
We want to know whether people’s attitude is different after seeing a visual anti-smoking message (these pictures on the package) vs. a text-message (the text belonging to that picture)
We are interested in whether the attitude that people report is different after seeing a message that regards the consequences on other people (e.g. smoking can harm your loved ones) as compared to yourself (smoking can cause cancer)
Study design
DV: attitude towards smoking (0-100) IV1: medium (text vs. visual) IV2: focus (internal vs. external)
This is, there are 4 groups:
group_TI will receive text-messages that are internal
group_TE will receive text-messages that are external
group_VI will receive visual messages that are internal
group_VE will receive visual messages that are external
assume that we expect that people’s attitude will be more negative after seeing a visual rather than text message if the focus is internal (i.e. the message is about yourself) because it might be difficult to imagine that oneself would get cancer after reading a text but seeing a picture might cause fear regardless
for the external focus on the other hand, we expect a more negative attitude after reading a text as compared to seeing a picture, as it might have more impact on attitude to imagine a loved one get hurt than seeing a stranger in a picture suffering from the consequences of second-hand smoking
we expect that the internal focus messages will be related to lower attitudes compared to the external focus messages on average but we expect no main-effect of picture vs. text-messages
visualize some rough means that show the desired behavior that we described in words earlier and see where we are going
we could make the overall mean of the internal focus groups (group_TI and group_VI) 20 and the mean of the external groups (group_TE and group_VE) 50 (this would already reflect the main-effect but also a belief that the smoking-attitudes are on average quite negative as we assume both means to be on the low end of the scale)
assume that the mean of group_TI is 30 while the mean of group_VI is 10 and we could assume that the mean of group_TE is 40 and the mean of group_VE is 60
<- rep(c("internal", "external"), each = 2)
focus <- rep(c("text", "visual"), times = 2)
media <- 50
mean_TI <- 20
mean_VI <- 30
mean_TE <- 60
mean_VE
<- data.frame(score = c(mean_TI, mean_VI, mean_TE, mean_VE), focus = focus, media = media)
pd
interaction.plot(pd$focus, pd$media, pd$score, ylim = c(0,100))
<- rep(c("internal", "external"), each = 2)
focus <- rep(c("text", "visual"), times = 2)
media <- 43
mean_TI <- 40
mean_VI <- 45
mean_TE <- 47
mean_VE
<- data.frame(score = c(mean_TI, mean_VI, mean_TE, mean_VE), focus = focus, media = media)
pd
interaction.plot(pd$focus, pd$media, pd$score, ylim = c(0,100))
in the new example there is a difference between the two media groups on average but it is only .50 points, so arguably it is small enough to represent the assumption of “no” effect, as in real-life “no” effect in terms of a difference being actually 0 is rather rare
come up with some reasonable standard-deviation; if we start at 50 and we want most people to be < 80, we can set the 2-SD bound at 80 to get a standard-deviation of 15 (80-50)/2.
let’s assume that each of our groups has a standard-deviation of 15 points.
group_TI = normal(n, 43, 15)
group_VI = normal(n, 40, 15)
group_TE = normal(n, 45, 15)
group_VE = normal(n, 47, 15)
<- 1e5
n <- rnorm(n, 43, 15)
group_TI <- rnorm(n, 40, 15)
group_VI <- rnorm(n, 45, 15)
group_TE <- rnorm(n, 47, 15)
group_VE
<- c(1:(n*4))
participant <- rep(c("internal", "external"), each = n*2)
focus <- rep(c("text", "visual"), each = n, times = 2)
media
<- data.frame(participant = participant, focus = focus, media = media, score = c(group_TI, group_VI, group_TE, group_VE))
data
summary(data)
participant focus media score
Min. :1e+00 Length:400000 Length:400000 Min. :-36.36
1st Qu.:1e+05 Class :character Class :character 1st Qu.: 33.51
Median :2e+05 Mode :character Mode :character Median : 43.77
Mean :2e+05 Mean : 43.77
3rd Qu.:3e+05 3rd Qu.: 54.01
Max. :4e+05 Max. :113.93
Power-analysis
Some additional assumptions: suppose we have enough funding for a sizeable data collection and the aim is to ensure that we do not draw unwarranted conclusions from the research
We should then set the alpha-level at a more conservative value (\(\alpha = .001\)); with this, we expect to draw non-realistic conclusions in the interaction effect in only about 1 in every 1,000 experiments
We also want to be sure that we do detect an existing effect and keep our power high at 95%; with this, we expect that if there is an interaction effect, we would detect it in 19 out of 20 cases (only miss it in 1 out of 20, or 5%)
Running the power-simulation can be very memory-demanding and the code can run a very long time to complete; it’s advised to start from various “low-resolution” sample-sizes (e.g. n = 10, n = 100, n = 200, etc.) to get a rough idea of where we can expect our loop to end. Then, the search can be made more specific in order to identify a more precise sample size.
set.seed(1)
<- 1000 # we want 1000 simulations
n_sims <- c()
p_vals <- c(0) # this vector will contain the power for each sample-size (it needs the initial 0 for the while-loop to work)
power_at_n <- 100 # sample-size and start at 100 as we can be pretty sure this will not suffice for such a small effect
n <- 100 # by which stepsize should n be increased
n_increase <- 2
i
<- .95
power_crit <- .001
alpha
while(power_at_n[i-1] < power_crit){
for(sim in 1:n_sims){
<- rnorm(n, 43, 15)
group_TI <- rnorm(n, 40, 15)
group_VI <- rnorm(n, 45, 15)
group_TE <- rnorm(n, 47, 15)
group_VE
<- c(1:(n*4))
participant <- rep(c("internal", "external"), each = n*2)
focus <- rep(c("text", "visual"), each = n, times = 2)
media
<- data.frame(participant = participant, focus = focus, media = media, score = c(group_TI, group_VI, group_TE, group_VE))
data $media_sum_num <- ifelse(data$media == "text", 1, -1) # apply sum-to-zero coding
data$focus_sum_num <- ifelse(data$focus == "external", 1, -1)
data<- lm(score ~ 1 + focus_sum_num + media_sum_num + focus_sum_num:media_sum_num, data = data) # fit the model with the interaction
lm_int <- lm(score ~ 1 + focus_sum_num + media_sum_num, data = data) # fit the model without the interaction
lm_null <- anova(lm_int, lm_null)$`Pr(>F)`[2] # put the p-values in a list
p_vals[sim]
}print(n)
<- mean(p_vals < alpha) # check power (i.e. proportion of p-values that are smaller than alpha-level of .10)
power_at_n[i] names(power_at_n)[i] <- n
<- n+n_increase # increase sample-size by 100 for low-resolution testing first
n <- i+1 # increase index of the while-loop by 1 to save power and cohens d to vector
i }
[1] 100
[1] 200
[1] 300
[1] 400
[1] 500
[1] 600
[1] 700
[1] 800
[1] 900
<- power_at_n[-1] # delete first 0 from the vector power_at_n
We can plot the results form the power-simulation:
- At roughly 900 participants we observe sufficient power