Understanding the t-distribution
When performing a t-test, you first calculate your t-statistic using the familiar formula:
$$ t = \frac{X - M}{SE} $$
\(X\) is the observed value, \(M\) is the expected value under the null hypothesis (or population mean), and \(SE\) is the standard error. Once you've computed the t-statistic, you then compare it to the so-called critical value, which comes from the relevant t-distribution.
The shape of a t-distribution, and thus the critical value, is determined entirely by its degrees of freedom. To demonstrate this, let's draw some density plots for t-distributions using different degrees of freedom.
Instructions
- Create a vector `x` that contains a sequence of length 100 between -4 and 4. See `?seq` for help.
- Use `dt()` to generate t-distributions with 4, 6, 8, 10, and 12 degrees of freedom (in that order). The first argument to `dt()` is the vector of values at which to evaluate the t-distribution (`x` from above) and the second argument (`df`) is the degrees of freedom.
- Plot each of the t-distributions. Once the initial `plot()` is created, you'll use `lines()` to plot each additional distribution. The two arguments to `lines()` are the same as the first two arguments to `plot()`, except that you'll have to substitute the appropriate y-values. Use the color black for 4 degrees of freedom, red for 6, orange for 8, green for 10, and blue for 12.
- Add a `legend()` to your plot. The legend should be situated at the top right corner of your plot and should have the title "t-distributions". This is done by setting the first argument to `"topright"` and the `title` argument to `"t-distributions"`.
# Generate a vector of 100 values between -4 and 4
x <- seq(___, ___, length = ___)

# Simulate the t-distributions for 4, 6, 8, 10, and 12 degrees of freedom
y_1 <- dt(x, df = ___)
y_2 <- dt(x, df = ___)
y_3 <- dt(x, df = ___)
y_4 <- dt(x, df = ___)
y_5 <- dt(x, df = ___)

# Plot the t-distributions, starting with df = 4 in black
plot(x, y_1, type = "l", lwd = 2, xlab = "t-value", ylab = "Density",
     main = "Comparison of t-distributions", col = "black")
lines(___, ___, col = "red")
lines(___, ___, col = "orange")
lines(___, ___, col = "green")
lines(___, ___, col = "blue")

# Add a legend in the top right corner, titled "t-distributions"
legend(___, c("df = 4", "df = 6", "df = 8", "df = 10", "df = 12"),
       col = c("black", "red", "orange", "green", "blue"),
       title = ___, lty = 1)
# Generate a vector of 100 values between -4 and 4
x <- seq(-4, 4, length = 100)

# Simulate the t-distributions for 4, 6, 8, 10, and 12 degrees of freedom
y_1 <- dt(x, df = 4)
y_2 <- dt(x, df = 6)
y_3 <- dt(x, df = 8)
y_4 <- dt(x, df = 10)
y_5 <- dt(x, df = 12)

# Plot the t-distributions, starting with df = 4 in black
plot(x, y_1, type = "l", lwd = 2, xlab = "t-value", ylab = "Density",
     main = "Comparison of t-distributions", col = "black")

# Overlay the remaining distributions with lines()
lines(x, y_2, col = "red")
lines(x, y_3, col = "orange")
lines(x, y_4, col = "green")
lines(x, y_5, col = "blue")

# Add a legend in the top right corner, titled "t-distributions"
legend("topright", c("df = 4", "df = 6", "df = 8", "df = 10", "df = 12"),
       col = c("black", "red", "orange", "green", "blue"),
       title = "t-distributions", lty = 1)
# Submission-correctness tests (testwhat SCT) for the t-distribution exercise
test_object("x",
            undefined_msg = "It seems that you did not define <code>x</code>. Remember that <code>x</code> is a vector that contains 100 values between -4 and 4.",
            incorrect_msg = "Make sure that the variable <code>x</code> contains 100 numbers between -4 and 4.")
test_object("y_1",
            undefined_msg = "It seems that you did not define <code>y_1</code>. <code>y_1</code> should be a t-distribution with 4 degrees of freedom.",
            incorrect_msg = "Make sure that the variable <code>y_1</code> is a t-distribution with 4 degrees of freedom.")
test_object("y_2",
            undefined_msg = "It seems that you did not define <code>y_2</code>. <code>y_2</code> should be a t-distribution with 6 degrees of freedom.",
            incorrect_msg = "Make sure that the variable <code>y_2</code> is a t-distribution with 6 degrees of freedom.")
test_object("y_3",
            undefined_msg = "It seems that you did not define <code>y_3</code>. <code>y_3</code> should be a t-distribution with 8 degrees of freedom.",
            incorrect_msg = "Make sure that the variable <code>y_3</code> is a t-distribution with 8 degrees of freedom.")
test_object("y_4",
            undefined_msg = "It seems that you did not define <code>y_4</code>. <code>y_4</code> should be a t-distribution with 10 degrees of freedom.",
            incorrect_msg = "Make sure that the variable <code>y_4</code> is a t-distribution with 10 degrees of freedom.")
test_object("y_5",
            undefined_msg = "It seems that you did not define <code>y_5</code>. <code>y_5</code> should be a t-distribution with 12 degrees of freedom.",
            incorrect_msg = "Make sure that the variable <code>y_5</code> is a t-distribution with 12 degrees of freedom.")
test_function("dt", not_called_msg = "Please use the dt function to make the vectors <code>y_1</code>, <code>y_2</code>, <code>y_3</code>, <code>y_4</code>, and <code>y_5</code>. Type <code>?dt</code> in the console to get the help file.")
test_function("plot", not_called_msg = "Please use the plot function to make a graph of the t-distributions. Type <code>?plot</code> in the console to get the help file.")
test_function("lines", not_called_msg = "Please use the lines function to add lines to your plot of the t-distributions. Type <code>?lines</code> in the console to get the help file.")
test_function("plot", args = c("x", "y", "lwd"))
test_function("lines", args = c("x", "y", "col"), index = 1)
test_function("lines", args = c("x", "y", "col"), index = 2)
test_function("lines", args = c("x", "y", "col"), index = 3)
test_function("lines", args = c("x", "y", "col"), index = 4)
test_function("legend", args = c("x"),
              not_called_msg = "Please use the legend function to add a legend to your plot of the t-distributions. Type <code>?legend</code> in the console to get the help file.",
              incorrect_msg = "Make sure that the legend is positioned in the top right of the graph. Do this by typing topright, in quotation marks, as the first argument. Type <code>?legend</code> to bring up the help file for this function.")
success_msg("Fantastic! Notice that the peaks and tails of the distributions are different for different degrees of freedom.")
- As an example, the command `dt(x, df = 8)` evaluates the t-distribution with 8 degrees of freedom at every value of `x`.
- For the legend, you only have to fill in the position `"topright"` and `title = "t-distributions"`. Don't forget the quotation marks!
The working memory dataset
In the following exercises, you will conduct a dependent (or paired) t-test on the "working memory" dataset. This dataset consists of the intelligence scores for subjects before and after training, as well as for a control group. Our goal is to assess whether intelligence training results in significantly different intelligence scores for the individuals.
The observations of individuals before and after training are two samples from the same group at different points in time, which calls for a dependent t-test. This will test whether or not the difference in mean intelligence scores before and after training is significant.
The working memory dataset has been loaded into your workspace as the object `wm`. It contains the data for both the group who received training and the group who did not.
Instructions
- Print `wm` to the console to get a feel for the data.
- Create a subset of `wm` that includes only the training group and store the result in `wm_t`. A value of `1` in the `train` column indicates that a subject received training, while a value of `0` indicates that they did not.
- View summary statistics for `wm_t` with the `describe()` function.
- Use the `boxplot()` function to create a boxplot of the `pre` and `post` columns of `wm_t`. Give the x-axis the label "Pre and Post-Training" and the y-axis the label "Intelligence Score".
library(psych)
library(car)
library(lsr)

# Read data into a dataframe called wm
wm <- read.table(url("http://assets.datacamp.com/course/Conway/Lab_Data/Stats1.13.Lab.08.txt"), header = TRUE)

# Take a look at the dataset
wm

# Create the training subset of wm (train == 1 means the subject trained)
wm_t <- ___

# Summary statistics

# Create a boxplot with pre- and post-training groups
boxplot(___, ___, main = "Boxplot",
        xlab = ___, ylab = ___,
        col = c("red", "green"))
# Take a look at the dataset
wm

# Create the training subset of wm (train == 1 means the subject trained)
wm_t <- subset(wm, wm$train == "1")

# Summary statistics for the training subset
describe(wm_t)

# Create a boxplot with pre- and post-training groups
boxplot(wm_t$pre, wm_t$post, main = "Boxplot",
        xlab = "Pre and Post-Training", ylab = "Intelligence Score",
        col = c("red", "green"))
# Submission-correctness tests (testwhat SCT) for the subsetting exercise
test_predefined_objects("wm")
test_output_contains("wm", incorrect_msg = "Did you examine the data?")
test_object("wm_t", eval = FALSE,
            undefined_msg = "It seems that you did not create a subset of the data.")
test_function("subset", not_called_msg = "Please use the <code>subset()</code> function to create a subset of the data that contains only those subjects who trained.")
test_object("wm_t", incorrect_msg = "Make sure that you create a subset that contains only those subjects who trained. The subjects who trained are represented in the <code>train</code> column with <code>1</code>.")
test_function("describe", args = "x",
              not_called_msg = "Please use the <code>describe()</code> function to view summary statistics for your subset. Type <code>?describe</code> in the console to get the help file.",
              incorrect_msg = "Did you look at the summary statistics for the subset <code>wm_t</code>?")
test_function("boxplot", args = c("x", "main", "xlab", "ylab"),
              not_called_msg = "Please use the <code>boxplot()</code> function to create boxplots of the pre- and post-training groups. Type <code>?boxplot</code> in the console to get the help file.")
success_msg("Good job! The boxplot shows a difference in the means of the two groups but is this difference significant or simply due to chance? Let us find out.")
Perform a dependent t-test
Conducting a dependent t-test, also known as a paired t-test, requires the following steps:
- Define null and alternative hypotheses
- Decide significance level \(\alpha\)
- Compute observed t-value
- Find critical value
- Compare observed value to critical value
We're performing a Null Hypothesis Significance Test (NHST), so our null hypothesis is that there's no effect (i.e. training has no impact on intelligence scores). The alternative hypothesis is that training results in significantly different intelligence scores. We'll use a significance level of 0.05, which is very common in statistics. That takes care of the first two steps!
In this exercise, we'll focus on computing the observed t-value, which is computed as follows:
$$ t = \frac{\bar{x}_D}{s_D / \sqrt{n}} $$
\(n\) is just the sample size, or the number of individuals in our sample. \(\bar{x}_D\) is the mean of the difference scores, or the sum of the difference scores divided by the sample size. Finally, \(s_D\) is the standard deviation of the difference scores:
$$ s_D = \sqrt{\frac{\sum{(x_D - \bar{x}_D)^2}}{n - 1}} $$
In the formula for \(s_D\), \(x_D\) are the individual difference scores and should not be confused with \(\bar{x}_D\), which is the mean of the difference scores.
Instructions
- Use the code provided to assign the sample size to `n`.
- Calculate the mean of the difference scores by summing up the differences with `sum()` and dividing by `n`. The differences are contained in the `gain` column of `wm_t`.
- Compute the standard deviation of the difference scores as defined above. Use `n` and `mean_diff` in your calculation and be careful with your brackets! Save the result to `sd_diff`.
- Compute the observed t-value by combining `mean_diff`, `sd_diff`, and `n`. Store the result in `t_obs`.
# Read data into a dataframe called wm
wm <- read.table(url("http://assets.datacamp.com/course/Conway/Lab_Data/Stats1.13.Lab.08.txt"), header = TRUE)

# Create the training subset of wm (train == 1 means the subject trained)
wm_t <- subset(wm, wm$train == "1")

## The training subset, wm_t, is available in your workspace

# Define the sample size
n <- nrow(wm_t)

# Mean of the difference scores
mean_diff <- ___

# Standard deviation of the difference scores
sd_diff <- sqrt(sum((___ - ___)^2) / (___))

# Observed t-value
t_obs <- ___ / (___ / sqrt(___))

# Print the observed t-value
## The training subset, wm_t, is available in your workspace

# Define the sample size
n <- nrow(wm_t)

# Mean of the difference scores: sum of the gains divided by n
mean_diff <- sum(wm_t$gain) / n

# Standard deviation of the difference scores (sample SD, hence n - 1)
sd_diff <- sqrt(sum((wm_t$gain - mean_diff)^2) / (n - 1))

# Observed t-value: mean difference over its standard error
t_obs <- mean_diff / (sd_diff / sqrt(n))

# Print the observed t-value
t_obs
# Submission-correctness tests (testwhat SCT) for the manual t-test exercise
test_object("n", incorrect_msg = "Did you use the `nrow()` function on `wm_t` to define the sample size?")
test_object("mean_diff", incorrect_msg = "Did you correctly define the mean of the difference scores?")
test_object("sd_diff", incorrect_msg = "Did you correctly define the standard deviation of the difference scores?")
test_object("t_obs", incorrect_msg = "Did you correctly define the observed t-value?")
test_output_contains("t_obs", incorrect_msg = "Did you print `t_obs`?")
test_error()
success_msg("Great work!")
Refer to the formulas above if you get stuck. To calculate the standard deviation of the difference scores, you'll need to subtract the mean of the difference scores (`mean_diff`) from the vector of difference scores (`wm_t$gain`). This results in a vector of numbers that gets squared and summed up, then divided by `n - 1`. Finally, the square root of the whole thing gives you the standard deviation!
Perform a dependent t-test (2)
Now that we've determined our null and alternative hypotheses, decided on a significance level, and computed our observed t-value, all that remains is to calculate the critical value for this test and compare it to our observed t-value. This will tell us whether we have sufficient evidence to reject our null hypothesis. We'll even go one step further and compute an effect size with Cohen's d!
The critical value is the point on the relevant t-distribution that determines whether the value we observed is extreme enough to warrant rejecting the null hypothesis. Recall that a t-distribution is defined by its degrees of freedom, which in turn is equal to the sample size minus 1. In this example, we have 80 subjects, so the relevant t-distribution has 79 degrees of freedom.
We're performing a two-tailed t-test in this situation since we care about detecting a significant effect in either the positive or negative direction. In other words, we want to know if training significantly increases or decreases intelligence; however, given that our observed t-value is positive (14.49), the right-hand critical value is the only relevant one here.
Furthermore, since our desired significance level (i.e. alpha) is 0.05, our critical value is the point on our t-distribution at which 0.025 (0.05 / 2) of its total area of 1 is to the right and thus 0.975 (1 - 0.025) of its total area is to the left.
This point is called the 0.975 quantile and is computed for a t-distribution in R using the `qt()` function.
Instructions
- Compute the right critical value and store the result in `t_crit`. Use `qt()` with two arguments (in order):
  - The desired area to the left of the critical value (i.e. quantile value)
  - The relevant degrees of freedom for our t-distribution
- Print the critical value to the console.
- Print the observed t-value (`t_obs`) to compare with the critical value.
- Compute Cohen's d, which is the mean of the difference scores divided by the standard deviation of the difference scores. `mean_diff` and `sd_diff` from the previous exercise are still available in your workspace. Save the result to `cohens_d`.
- Print `cohens_d` to the console.
# Read data into a dataframe called wm
wm <- read.table(url("http://assets.datacamp.com/course/Conway/Lab_Data/Stats1.13.Lab.08.txt"), header = TRUE)

# Create the training subset of wm (train == 1 means the subject trained)
wm_t <- subset(wm, wm$train == "1")

# Define the sample size
n <- nrow(wm_t)

# Mean of the difference scores
mean_diff <- sum(wm_t$gain) / n

# Standard deviation of the difference scores (sample SD, hence n - 1)
sd_diff <- sqrt(sum((wm_t$gain - mean_diff)^2) / (n - 1))

# Observed t-value
t_obs <- mean_diff / (sd_diff / sqrt(n))

## The variables from the previous exercise are preloaded

# Compute the critical value
t_crit <- ___

# Print the critical value

# Print the observed t-value to compare

# Compute Cohen's d

# View Cohen's d
## The variables from the previous exercise are preloaded

# Compute the critical value: the 0.975 quantile of a t-distribution
# with n - 1 = 79 degrees of freedom (two-tailed test at alpha = 0.05)
t_crit <- qt(0.975, df = 79)

# Print the critical value
t_crit

# Print the observed t-value to compare
t_obs

# Compute Cohen's d: mean difference divided by SD of the differences
cohens_d <- mean_diff / sd_diff

# View Cohen's d
cohens_d
# Submission-correctness tests (testwhat SCT) for the critical-value exercise
test_object("t_crit", incorrect_msg = "Did you correctly define `t_crit`? Use the `qt()` function.")
test_output_contains("t_crit", incorrect_msg = "Did you print `t_crit`?")
test_output_contains("t_obs", incorrect_msg = "Did you print `t_obs`?")
test_object("cohens_d", incorrect_msg = "Did you correctly define `cohens_d`?")
test_output_contains("cohens_d", incorrect_msg = "Did you print `cohens_d`?")
test_error()
success_msg("Great work!")
As an example, `qt(0.75, df = 20)` computes the 0.75 quantile of a t-distribution with 20 degrees of freedom.
Let R do the dirty work
Thanks to the handy `t.test()` function in R, it's not necessary to manually compute the t-statistic every time you want to do a t-test. Similarly, the `cohensD()` function from the `lsr` package automates the process of computing Cohen's d. We'll continue with the working memory example to illustrate.
In the case of our dependent t-test, we need to specify these arguments to `t.test()`:
- `x`: Column of `wm_t` containing post-training intelligence scores
- `y`: Column of `wm_t` containing pre-training intelligence scores
- `paired`: Whether we're doing a dependent (i.e. paired) t-test or independent t-test. In this example, it's `TRUE`.
Note that `t.test()` carries out a two-sided t-test by default, which is exactly what we want here, but this behavior can be altered with the `alternative` argument. See `?t.test` for more info.
For `cohensD()`, we'll need to specify three arguments:
- `x`: Column of `wm_t` containing post-training intelligence scores
- `y`: Column of `wm_t` containing pre-training intelligence scores
- `method`: Version of Cohen's d to compute, which should be `"paired"` in this case
If you're successful, you'll get the same values you got when you did everything "by hand" in the previous exercises.
Instructions
The `wm_t` dataset has been loaded into your workspace.
- Apply `t.test()` to `wm_t` to test whether there's a significant difference between pre- and post-training intelligence scores. Post-training scores are contained in `wm_t$post` and pre-training scores in `wm_t$pre`. Don't save the result.
- Apply `cohensD()` to `wm_t` to compute Cohen's d. Use the same columns as inputs for `x` and `y`. Don't save the result.
library(lsr)

# Read data into a dataframe called wm
wm <- read.table(url("http://assets.datacamp.com/course/Conway/Lab_Data/Stats1.13.Lab.08.txt"), header = TRUE)

# Create the training subset of wm (train == 1 means the subject trained)
wm_t <- subset(wm, wm$train == "1")

## The lsr package has been loaded for you

# Conduct a paired t-test using the t.test function

# Calculate Cohen's d

## The lsr package has been loaded for you

# Conduct a paired t-test using the t.test function
# (post first, then pre, so a positive t-value means a gain from training)
t.test(wm_t$post, wm_t$pre, paired = TRUE)

# Calculate Cohen's d for the paired design
cohensD(wm_t$post, wm_t$pre, method = "paired")
# Submission-correctness tests (testwhat SCT) for the t.test()/cohensD() exercise
test_error()

msg1 <- "Use the <code>t.test()</code> function to execute a paired t-test. Type <code>?t.test</code> in the console to get the help file."
msg2 <- "Make sure to pass the correct data to the <code>t.test()</code> function."
test_function("t.test", "x", not_called_msg = msg1, incorrect_msg = msg2)
test_output_contains("t.test(wm_t$post, wm_t$pre, paired = TRUE)",
                     incorrect_msg = paste("Make sure that the <code>t.test()</code> function is executed with <code>wm_t$post</code> as the first",
                                           "argument and that a paired t-test is specified. The order of the variables determines the sign of the t-value."))
test_function("cohensD", args = c("x", "y", "method"),
              not_called_msg = "Please use the cohensD function to calculate Cohen's d for this t-test. Type <code>?cohensD</code> in the console to get the help file.",
              incorrect_msg = paste("Make sure that the <code>cohensD()</code> function is executed with <code>wm_t$post</code>",
                                    "as the first argument and <code>wm_t$pre</code> as the second argument. The order changes",
                                    "the direction of the test. Also make sure that the Cohen's d statistic is calculated for a <i>paired</i> t-test."))
test_output_contains("cohensD(wm_t$post, wm_t$pre, method = 'paired')",
                     incorrect_msg = "Did you use the correct arguments within the `cohensD()` function?")
success_msg("Excellent! Have a look at the output of functions <code>t.test()</code> and <code>cohensD()</code> to confirm that they are indeed the same as the results from the previous exercises.")
Remember that R is case sensitive. Do not forget the capital 'D' in the function `cohensD()`. Also, make sure you specify the `post` column first and the `pre` column second in both functions.