R-Code for Exam, Modules 9-13 Flashcards
query_X = DATASET_NAME$CAT_VARIABLE == "X"
index_X = which(query_X)
quantities_X = DATASET_NAME$QUANT_VARIABLE[index_X]
m_quantities_X = mean(quantities_X)
sum_squares_X = sum((quantities_X - m_quantities_X)^2)
we have a data set whose categorical variable has three categories, X, Y and Z, and we want to find the sum of squares associated with the quantities for category X
query_Y = DATASET_NAME$CAT_VARIABLE == "Y"
index_Y = which(query_Y)
quantities_Y = DATASET_NAME$QUANT_VARIABLE[index_Y]
m_quantities_Y = mean(quantities_Y)
sum_squares_Y = sum((quantities_Y - m_quantities_Y)^2)
we have a data set whose categorical variable has three categories, X, Y and Z, and we want to find the sum of squares associated with the quantities for category Y
query_Z = DATASET_NAME$CAT_VARIABLE == "Z"
index_Z = which(query_Z)
quantities_Z = DATASET_NAME$QUANT_VARIABLE[index_Z]
m_quantities_Z = mean(quantities_Z)
sum_squares_Z = sum((quantities_Z - m_quantities_Z)^2)
we have a data set whose categorical variable has three categories, X, Y and Z, and we want to find the sum of squares associated with the quantities for category Z
within_group_ss = sum_squares_X + sum_squares_Y + sum_squares_Z
calculates the within-group sum of squares ("within_group_ss") for the three categories (X, Y and Z) whose individual sums of squares have already been calculated
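a compact alternative sketch (not the modules' method, just an equivalent shortcut using the same placeholder names): tapply() computes all three per-category sums of squares at once
ss_by_group = tapply(DATASET_NAME$QUANT_VARIABLE,
                     DATASET_NAME$CAT_VARIABLE,
                     function(v) sum((v - mean(v))^2))   # one SS per category
within_group_ss = sum(ss_by_group)                       # same within-group SS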
grand_mean = mean(DATASET_NAME$QUANT_VARIABLE)
calculates the grand mean of the quantitative variable of the dataset (saved as "grand_mean"), the mean value across all observations regardless of category (X, Y, Z, etc.)
n_X = length(quantities_X)   # number of replicates in category X
n_Y = length(quantities_Y)
n_Z = length(quantities_Z)
among_group_ss =
  n_X * (m_quantities_X - grand_mean)^2 +
  n_Y * (m_quantities_Y - grand_mean)^2 +
  n_Z * (m_quantities_Z - grand_mean)^2
code to find the among-group sum of squares for the three categories (X, Y and Z), given the number of replicates in each category, the grand mean, and each category's mean quantity
total_ss = within_group_ss + among_group_ss
the total sum of squares equals the sum of the within group and the among group sum of squares
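a quick sanity check with the objects already defined: the total SS computed directly from the grand mean should give the same number
total_ss_direct = sum((DATASET_NAME$QUANT_VARIABLE - grand_mean)^2)
all.equal(total_ss, total_ss_direct)   # TRUE, up to floating-point error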
table_ss =
  as.table(rbind(c("total_ss",
                   "within_group_ss",
                   "among_group_ss"),
                 c(total_ss,
                   within_group_ss,
                   among_group_ss)))
code which establishes a table to compare the within, among, and total sum of squares for a set of data
anova_model =
aov(QUANTITATIVE~CATEGORICAL,
data = DATASET_NAME)
code to establish an ANOVA model ("anova_model") relating a quantitative variable ("QUANTITATIVE") to a categorical one ("CATEGORICAL") within a dataset ("DATASET_NAME")
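once the model object exists, summary() prints the ANOVA table for it
summary(anova_model)   # Df, Sum Sq, Mean Sq, F value, Pr(>F) for CATEGORICAL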
plot(factored_variable, DATASET_NAME$QUANT_VARIABLE,
     ylim = c(LOW, HIGH), ylab = "BLAH BLAH",
     xlab = "BLAH BLAH",
     col = "red")   # or "blue", "green", etc.
creates a box-plot of a categorical variable which has been coded as a factor, letting one see whether the categories appear to differ from one another
anova_model_1 = aov(quantitative_variable ~
factored_variable,
data = DATASET_NAME)
first method to establish an ANOVA model of a dataset, assuming we have already created the object "factored_variable" for our categories "X", "Y" and "Z"
anova_model_2 = aov(quantitative_variable ~
                      factor(DATASET_NAME$CAT_VARIABLE,
                             levels = c("X", "Y", "Z")),
                    data = DATASET_NAME)
second method to establish an ANOVA model of a dataset, assuming we have not yet created the object "factored_variable" for the categories "X", "Y" and "Z"
residuals = quantitative_variable - predict.lm(anova_model)
plot(predict.lm(anova_model), residuals, cex.lab = 1.15,
     ylim = c(LOW, HIGH),
     xlim = c(LOW, HIGH),
     ylab = "BLAH BLAH",
     xlab = "YADA YADA")
abline(a = 0, b = 0, lwd = 2, lty = "dashed", col = "red")   # lwd = 2 or 3
code to plot the residuals versus the predicted values for the data set, used to check for homogeneity of variance
stdRes = rstandard(anova_model)
calculates the standardized residuals for the ANOVA model ("anova_model") and saves them as the object "stdRes"
qqnorm(stdRes,
       ylab = "Standardized Residuals",
       xlab = "Theoretical Quantiles")
qqline(stdRes, col = "red", lwd = 2)
establishes a normal Q-Q plot for the standardized residuals, along with a reference line passing through the quartiles of the data
TukeyHSD(anova_model, ordered = TRUE)
conducts a Tukey’s Honestly Significant Difference test on the ANOVA model in order to determine which groups in the ANOVA vary significantly from one another
plot(TukeyHSD(anova_model, ordered = TRUE))
establishes a plot of the Tukey's test of the ANOVA model; pairs whose confidence intervals do not cross the vertical dashed line at x = 0 are significantly different, while pairs whose intervals do cross it are not
quant_var = DATASET_NAME$QUANTITATIVE_VARIABLE
fact_var1 = factor(DATASET_NAME$CATEGORY_VARIABLE_1)
fact_var2 = factor(DATASET_NAME$CATEGORY_VARIABLE_2)
par(mfrow = c(1, 2))
plot(fact_var1, quant_var,
     ylim = c(LOW, HIGH),
     ylab = "BLAH BLAH",
     xlab = "YADA YADA",
     col = "red")
plot(fact_var2, quant_var,
     ylim = c(LOW, HIGH),
     ylab = "BLAH BLAH",
     xlab = "YADA YADA",
     col = "blue")
code which factors the two categorical variables, stores the quantitative variable as an object, and then sets up two side-by-side box-plots comparing each factored variable against the quantitative variable
twoway_anova_model = aov(quant_var ~ fact_var1 + fact_var2)
code to produce a two-way ANOVA model, given we have the quantitative variable and have factored the two categorical variables already
summary(twoway_anova_model)
# Pr(>F): fact_var1 < 0.05, fact_var2 < 0.05 (both factors significant)
summary(oneway_anova_model)
# Pr(>F): factored_variable > 0.05 (not significant)
describes the possibility that, when running a block ANOVA, the P-values for both factor variables may be significant, while a simpler one-factor ANOVA model that ignores one of those variables produces a P-value which is not significant
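a minimal runnable sketch of that situation using simulated data (every name and number here is made up for illustration)
set.seed(1)
block = factor(rep(1:10, each = 3))         # 10 blocks of 3 observations
treat = factor(rep(c("A", "B", "C"), 10))   # 3 treatment levels per block
y = rnorm(30) + rep(rnorm(10, sd = 3), each = 3) + (treat == "C") * 1.5
summary(aov(y ~ treat))           # block variance sits in the residual and can mask the treatment effect
summary(aov(y ~ treat + block))   # modeling the block pulls that variance out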
library(HSAUR2)
data("USairpollution")
loads the "USairpollution" dataset from the "HSAUR2" package; "HSAUR2" is an add-on package from CRAN (not built into RStudio), so once it is installed you can load its data sets without having to rely on the files provided by the professors
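if the package isn't installed yet, a one-time install from CRAN has to come first
install.packages("HSAUR2")   # one-time download; library(HSAUR2) still loads it each session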
plot(USairpollution$predays, USairpollution$SO2,
     xlab = "days with precipitation",
     ylab = "sulfur emissions")
creates a scatterplot which puts the days with precipitation on the x-axis and the sulfur emissions on the y-axis
pollution_model = lm(USairpollution$SO2 ~ USairpollution$predays)
abline(pollution_model,
       col = "red", lwd = 3)
graphs a red line of best fit with a width of three for the relationship between sulfur emissions and days with precipitation from the "USairpollution" data set
predicted = predict.lm(pollution_model)
residuals = USairpollution$SO2 - predicted
plot(predicted, residuals,
     xlab = "Predicted Y",
     ylab = "Residuals")
abline(a = 0, b = 0,
       col = "red", lwd = 3, lty = "dashed")
commands to establish a plot of the residuals versus the predicted values for the sulfur emissions given the number of days with precipitation (variance appears heteroscedastic).
log_pollute_mod = lm(log(USairpollution$SO2) ~ USairpollution$predays)
plot(USairpollution$predays, log(USairpollution$SO2),
     xlab = "days with precipitation",
     ylab = "sulfur emissions (log-transformed)")
abline(log_pollute_mod,
       col = "red", lwd = 3)
graphs a red line of best fit with a width of three for the relationship between the log-transformed sulfur emissions and days with precipitation from the "USairpollution" data set
log_predicted = predict.lm(log_pollute_mod)
log_residuals = log(USairpollution$SO2) - log_predicted
plot(log_predicted, log_residuals,
     xlab = "Log-Trans Predicted Y", ylab = "Log-Trans Residuals")
abline(a = 0, b = 0, col = "red", lwd = 3, lty = "dashed")
commands to establish a plot of the log-transformed residuals versus the predicted values for the sulfur emissions given the number of days with precipitation (variance still appears heteroscedastic).
rainydays_sulfur = lm(USairpollution$SO2 ~ USairpollution$predays)
stdRes = rstandard(rainydays_sulfur)
qqnorm(stdRes, xlab = "Theoretical Quantiles", ylab = "Standardized Residuals")
qqline(stdRes, col = "red", lwd = 2, lty = "dashed")
creates a QQ-plot for the relationship between SO2 emissions and days with precipitation
log_stdRes = rstandard(log_pollute_mod)
qqnorm(log_stdRes, xlab = "Theoretical Quantiles (Log-Transformed)",
       ylab = "Standardized Residuals (Log-Transformed)")
qqline(log_stdRes, col = "red", lwd = 2, lty = "dashed")
establishes the qq-plot for the log-transformed data from the USairpollution data set
plot(x = DATASET_NAME$VARIABLE_1, y = DATASET_NAME$VARIABLE_2)
mean_var1 = mean(DATASET_NAME$VARIABLE_1)
mean_var2 = mean(DATASET_NAME$VARIABLE_2)
abline(v = mean_var1, col = "gray")
abline(h = mean_var2, col = "gray")
commands relevant to simple regression, used to see whether two quantitative variables (VARIABLE_1 and VARIABLE_2) have a positive, negative, or no correlation
cor(DATASET_NAME)
gives the correlation between the different (numeric) variables in a dataset: each variable has a correlation of exactly 1 with itself, but can have a correlation anywhere between -1 and 1 with another variable
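for example, with the USairpollution data loaded earlier (all of its columns are numeric, which cor() requires)
cor(USairpollution)   # square correlation matrix with 1s on the diagonal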
devi_x = DATASET_NAME$VARIABLE_1 - mean_var1
devi_y = DATASET_NAME$VARIABLE_2 - mean_var2
Pearson_numerator = sum(devi_x*devi_y)
manually calculates the numerator of the Pearson coefficient for VARIABLE_1 and VARIABLE_2
sosq_devi_x = sum(devi_x^2)
sosq_devi_y = sum(devi_y^2)
Pearson_denominator = sqrt(sosq_devi_x*sosq_devi_y)
manually calculates the denominator of the Pearson coefficient for VARIABLE_1 and VARIABLE_2
Pearson_coefficient = Pearson_numerator/Pearson_denominator
manually derives the Pearson coefficient, which is above zero if there is a positive correlation and below zero if there is a negative correlation
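a sanity check: the hand-built coefficient should match R's built-in cor()
cor(DATASET_NAME$VARIABLE_1, DATASET_NAME$VARIABLE_2)   # equals Pearson_coefficient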
query_neg_Pearson = devi_x*devi_y < 0
index_neg_Pearson = which(query_neg_Pearson)
length(index_neg_Pearson)/nrow(DATASET_NAME)
prints in the console window the proportion of points whose x and y deviation products are negative, i.e. the points that pull the correlation negative
query_pos_Pearson = devi_x*devi_y > 0
index_pos_Pearson = which(query_pos_Pearson)
length(index_pos_Pearson)/nrow(DATASET_NAME)
prints in the console window the proportion of points whose x and y deviation products are positive, i.e. the points that pull the correlation positive
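since the mean of a logical vector is the proportion of TRUEs, both proportions can also be written in one line each (assuming no missing values)
mean(devi_x*devi_y < 0)   # proportion of negative products
mean(devi_x*devi_y > 0)   # proportion of positive products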
x = seq(from = -10, to = 10, length.out = 100)
y = x^2
plot(x, y)
mean_x = mean(x)
mean_y = mean(y)
abline(v = mean_x, col = "grey")
abline(h = mean_y, col = "grey")
generates a graph of the quadratic relationship y = x^2 for x between -10 and 10; while there is clearly a strong relationship here, a manually calculated Pearson coefficient comes out to zero because the positive and negative deviation products cancel exactly
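checking this numerically with the x and y just defined
cor(x, y)   # essentially 0; the positive and negative deviation products cancel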
dataset_narrowed = dataset_general[, c('COLUMN_1', 'COLUMN_2', etc.)]
for datasets with a crap-ton of columns, subsetting with brackets and "c('COLUMN_1', 'COLUMN_2', etc.)" keeps only the columns you list
query_elim = !is.na(DATASET_NAME$BAD_COLUMN_1) &
  !is.na(DATASET_NAME$BAD_COLUMN_2) &
  DATASET_NAME$BAD_COLUMN_1 < HIGH &
  DATASET_NAME$BAD_COLUMN_1 > LOW &
  DATASET_NAME$BAD_COLUMN_2 < HIGH &
  DATASET_NAME$BAD_COLUMN_2 > LOW
index_elim = which(query_elim)
dataset_retained = dataset_narrowed[index_elim, ]
if the narrowed dataset has bogus values, this set of commands keeps only the rows whose values are non-missing and fall within reasonable bounds
lowess_retained = lowess(x = dataset_retained$COLUMN_1,
                         y = dataset_retained$COLUMN_2, f = 1/NUMBER)
lines(x = lowess_retained$x, y = lowess_retained$y, col = 'red', lwd = 3)
adds a lowess (locally weighted) smoothing line for the values on top of an existing scatterplot
plot(x = DATASET_NAME$VARIABLE_X, y = DATASET_NAME$VARIABLE_Y,
     xlab = "VARIABLE X", ylab = "VARIABLE Y")
regression_results = lm(VARIABLE_Y~VARIABLE_X, data = DATASET_NAME)
lines(x = DATASET_NAME$VARIABLE_X,
      y = regression_results$fitted.values)
plots the simple regression model between two variables, X and Y
regression_summary = summary(regression_results)
establishes the summary of the simple regression for two variables
slope = regression_summary$coefficients["VARIABLE_X", "Estimate"]
extracts the slope estimate of the simple regression from the summary's coefficients table
slope_standard_error =
  regression_summary$coefficients["VARIABLE_X", "Std. Error"]
extracts the standard error of the slope from the summary's coefficients table
slope_tvalue = slope/slope_standard_error
calculates the t-value of the slope of the simple regression line
intercept = regression_summary$coefficients["(Intercept)", "Estimate"]
intercept_standard_error =
  regression_summary$coefficients["(Intercept)", "Std. Error"]
intercept_tvalue = intercept/intercept_standard_error
code used to find the intercept t-value
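both hand-computed t-values can be checked against the "t value" column that summary() already stores
regression_summary$coefficients["VARIABLE_X", "t value"]    # matches slope_tvalue
regression_summary$coefficients["(Intercept)", "t value"]   # matches intercept_tvalue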