STAT 2 - LAB 1 Flashcards
Load the Boston data set from the MASS library
library(MASS)
data("Boston")
Retrieve description of Boston dataset
help(Boston)
or
?Boston
get a glimpse of structure of data
str(census_tracts)
glimpse(census_tracts)
names(census_tracts)
change int variable into factor
census_tracts$chas <- as.factor(census_tracts$chas)
check levels of a factor variable
levels(census_tracts$chas)
compute basic statistics for each variable
summary(census_tracts)
take all numeric variables and exclude categorical ones
var_interest_numeric <- colnames(census_tracts)[!(colnames(census_tracts) %in% c("chas", "rad"))]
plot a side-by-side histogram analysing multiple columns
census_tracts %>%
select(crim, zn, age) %>%
gather(cols, value) %>%
ggplot(aes(x = value)) + geom_histogram(bins = 20) + facet_wrap(.~ cols, ncol = 3)
compute correlation between quantitative variables
cor(census_tracts[,var_interest_numeric])
rounded:
round(cor(census_tracts[,var_interest_numeric]), 2)
plot the correlation matrix (first store it: corr_matrix <- cor(census_tracts[,var_interest_numeric]))
heatmap(corr_matrix)
library(corrplot)
corrplot::corrplot(corr_matrix)
(you can plot it like this and read only the upper triangle, or, to visualize only the upper triangle, run this before plotting:
corr_matrix[lower.tri(corr_matrix)] <- 0)
scatterplots of relationships between quantitative variables
with function pairs:
pairs(census_tracts[, c("medv", "lstat", "dis")])
fit a simple linear regression model
lm.fit.simple <- lm(formula = medv ~ lstat, data = census_tracts)
Calculate 95% confidence intervals for β0 and β1 (the intercept and slope coefficients) using the function confint.
confint(lm.fit.simple)
Calculate 95% confidence intervals for medv when lstat is 5, 10 and 15, using the function predict.
predict(lm.fit.simple,
newdata = data.frame(lstat = (c(5,10,15))),
interval = "confidence")
Calculate 95% prediction intervals for medv when lstat is 5, 10 and 15, using the function predict.
predict(lm.fit.simple,
newdata = data.frame(lstat = (c(5,10,15))),
interval = "prediction")
Fit a multiple linear regression model with medv as the response variable and lstat and dis as the predictors
lm.fit.multiple <- lm(formula = medv ~ lstat + dis, data = census_tracts)
compare multiple and simple linear regression model by using anova
anova(lm.fit.simple,lm.fit.multiple)
Fit the multiple linear regression model with medv as the response variable, and all quantitative variables as predictors
lm.fit.full <- lm(formula = medv ~ . -chas -rad, data = census_tracts)