R notes Flashcards
library(dplyr)
pipes, group by, summarise
group_by(factor column)
summarise(Means = mean(column), Medians = median(column), Freq = n())
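A minimal runnable sketch of the pattern, assuming the built-in iris data with Species as the factor column and Sepal.Length as the numeric column:
library(dplyr)
iris %>%
  group_by(Species) %>%                      # the factor column
  summarise(Means = mean(Sepal.Length),
            Medians = median(Sepal.Length),
            Freq = n())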
Combining levels
library(plyr)
mapvalues(dataset$column, levels(dataset$column), c("name level1", etc.))
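A hedged example on the built-in iris data, collapsing the three Species levels into two (the new level names are illustrative):
library(plyr)
iris2 <- iris
iris2$Species <- mapvalues(iris2$Species,
                           from = levels(iris2$Species),
                           to = c("setosa", "other", "other"))  # duplicate names combine levels
levels(iris2$Species)   # "setosa" "other"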
Unique values
length(unique(dataset$column))
Create Factor of intervals
cut(dataset$column,
    breaks = c(0, 15, 35, 65, 100),
    labels = c("Child", "Young Adult", "Middle.aged", "Elderly"))
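A quick runnable check of the same breaks and labels on a made-up age vector:
ages <- c(5, 22, 47, 80)   # illustrative values
cut(ages,
    breaks = c(0, 15, 35, 65, 100),
    labels = c("Child", "Young Adult", "Middle.aged", "Elderly"))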
Binary Target - Factor to Numeric
ifelse(factor == "success level", 1, 0)
Relevel
levels(factor)
dataset$column <- relevel(dataset$column, ref = "new reference level")
Scatterplot variation
ggplot(dataset, aes(x = variable, y = target)) +
geom_count()
Regularization- code
requires matrix object for x: Xmat <- model.matrix(target ~ predictors, data = dataset)
library(glmnet)
rid.mod1 <- glmnet(x = Xmat[, -1], y = dataset$target,
                   family = "gaussian",
                   lambda = ?,            # chosen lambda value
                   alpha = ?,             # 0 = ridge, 1 = lasso
                   standardize = TRUE)    # default, so not needed
coef(rid.mod1) #all estimates in one place
plot(cv.glmnet(...)) - produces a CV plot; the left dashed line indicates where CV error is minimized; the right dashed line indicates the one-standard-error rule; the numbers at the top indicate the number of predictors in the model
Regularization - notes
hyperparameters: alpha and lambda
-alpha = 0 -> Ridge; alpha = 1 -> Lasso
-lambda = 0 -> no shrinkage
predict(rid.mod4, newx = Xmat[, -1])
training RMSE: sqrt(sum((dataset$target - predict(rid.mod4, newx = Xmat[, -1]))^2) / nrow(dataset))
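A minimal runnable sketch of the fit/predict/RMSE steps above, assuming the built-in mtcars data with mpg as the target; the lambda value is arbitrary and would normally come from CV:
library(glmnet)
Xmat <- model.matrix(mpg ~ ., data = mtcars)
rid.mod <- glmnet(x = Xmat[, -1], y = mtcars$mpg,
                  family = "gaussian",
                  lambda = 1,    # arbitrary illustrative value
                  alpha = 0)     # 0 = ridge
coef(rid.mod)                                       # shrunken coefficient estimates
preds <- predict(rid.mod, newx = Xmat[, -1])        # predictions on the training data
sqrt(sum((mtcars$mpg - preds)^2) / nrow(mtcars))    # training RMSE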
Symbols
~ = 'explained by'
+ = 'include'
. = 'all remaining variables'
- = 'exclude'
: = 'interact'
* = 'cross' -> include and interact
Regularization - CV code
set.seed(161)
rid.cv <- cv.glmnet(x = Xmat[, -1], y = dataset$target,
                    family = "gaussian",
                    alpha = ?,             # 0 = ridge, 1 = lasso
                    lambda = ??,           # optional; see note below
                    standardize = TRUE)    # default, so not needed
-autogenerates lambda if not specified
-nfolds is 10 by default
rid.cv$lambda # shows the lambda values tried (100 by default)
rid.cv$lambda.min #returns best lambda OR rid.cv$lambda.1se
->use rid.cv$lambda.min as the lambda in the regular glmnet() model
coef(rid.mod1)
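A runnable CV sketch on the same assumed mtcars setup (alpha = 0 for ridge; the lambda sequence is auto-generated):
library(glmnet)
set.seed(161)
Xmat <- model.matrix(mpg ~ ., data = mtcars)
rid.cv <- cv.glmnet(x = Xmat[, -1], y = mtcars$mpg,
                    family = "gaussian",
                    alpha = 0)
plot(rid.cv)          # CV curve with the lambda.min and one-standard-error dashed lines
rid.cv$lambda.min     # lambda with the lowest CV error
rid.mod <- glmnet(x = Xmat[, -1], y = mtcars$mpg,
                  family = "gaussian", alpha = 0,
                  lambda = rid.cv$lambda.min)   # refit at the chosen lambda
coef(rid.mod)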
Train function pt 1
library(caret)
set.seed(161)
tree.cv1 <- train(
target ~ predictors,
data = dataset,
method = "rpart",
trControl = trainControl(method = "cv", number = 10),
metric = "RMSE",
tuneGrid = expand.grid(cp = seq(0.0, 0.005, 0.0005)),
control = rpart.control(minsplit = , minbucket = , maxdepth = ),
na.action = na.pass
)
plot(tree.cv1)
tree.cv1$results
tree.mod# <- tree.cv1$finalModel #extract the tree with the lowest CV error
rpart.plot(tree.mod#)
tree.cv1$bestTune #extract the optimal cp value
control is optional; defaults are used if it is omitted
na.action shouldn't be needed if we adequately handle missing data at the start
can replace the first two arguments with:
y = dataset$column, x = dataset[, c("predictor1", "predictor2", etc.)]
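A runnable sketch of the regression-tree CV above, assuming mtcars with mpg as the target and an illustrative cp grid (rpart.plot assumed installed):
library(caret)
library(rpart.plot)
set.seed(161)
tree.cv <- train(
  mpg ~ .,
  data = mtcars,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10),
  metric = "RMSE",
  tuneGrid = expand.grid(cp = seq(0.0, 0.05, 0.005))
)
plot(tree.cv)                    # CV RMSE by cp
tree.cv$bestTune                 # optimal cp value
tree.mod <- tree.cv$finalModel   # tree refit on all the data at the best cp
rpart.plot(tree.mod)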
Train pt 2
need a binary factor target
library(caret)
set.seed(161)
tree.cv1 <- train(
factor(target) ~ predictors,
data = dataset,
method = "rpart",
trControl = trainControl(method = "cv", number = 10),
metric = "Accuracy",
tuneGrid = expand.grid(cp = seq(0.0, 1, 0.01)),
na.action = na.pass,
parms = list(split = "gini")   # or "information"
)
plot(tree.cv1) #the higher the accuracy, the better
tree.mod# <- tree.cv1$finalModel #extract the model with the highest cv accuracy
rpart.plot(tree.mod#, extra = 4) #makes tree
tree.cv1$bestTune #extract optimal cp value
"Accuracy" - percentage of correct predictions = 1 - classification error rate
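A runnable classification sketch, assuming a binary target built from iris (virginica vs. not); the cp grid is narrowed for illustration:
library(caret)
library(rpart.plot)
dat <- iris
dat$virginica <- factor(ifelse(dat$Species == "virginica", 1, 0))
dat$Species <- NULL
set.seed(161)
tree.cv1 <- train(
  virginica ~ .,
  data = dat,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10),
  metric = "Accuracy",
  tuneGrid = expand.grid(cp = seq(0.0, 0.1, 0.01)),
  parms = list(split = "gini")
)
plot(tree.cv1)                             # CV accuracy by cp
tree.cv1$bestTune
rpart.plot(tree.cv1$finalModel, extra = 4) # class probabilities at each node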
Other
poly(variable, 2, raw = TRUE) = variable + I(variable^2); the default poly() uses orthogonal polynomials, which give the same fit but different coefficients
Train pt 3
RF plots
library(pdp)
partial(rf.mod#, pred.var = "variable", plot = T)
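A hedged partial dependence sketch; it assumes the randomForest package for the fitted model and mtcars with mpg as the target:
library(randomForest)
library(pdp)
set.seed(161)
rf.mod <- randomForest(mpg ~ ., data = mtcars, ntree = 200)
partial(rf.mod, pred.var = "wt", plot = TRUE)   # partial dependence of predicted mpg on wt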
PCA
base package
pca1 <- prcomp(~ . - factor, data = dataset, center = T, scale. = T)
OR pca1 <- prcomp(dataset[, colnames(dataset) != "factor"], center = T, scale. = T)
-exclude factors because this function does not work with factors
-center default = T
pca1$rotation #access the loadings
pca1$x #obtain scores of the PCs (for each observation)
summary(pca1) # proportion of variance explained
plot(pca1, type = "line") #scree plot
OR screeplot(pca1, type = "line", npcs = ) # npcs = number of PCs to plot
pca1$x[, 1] OR pca1$x[, "PC1"] #scores of PC1
dataset.new <- dataset
dataset.new$PC1 <- pca1$x[, “PC1”]
dataset.new$variable1 <- NULL #repeat for variable2, etc. (the variables the PC replaces)
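A runnable PCA sketch on the numeric columns of iris (Species is the excluded factor):
pca1 <- prcomp(iris[, colnames(iris) != "Species"], center = TRUE, scale. = TRUE)
pca1$rotation              # loadings
head(pca1$x)               # PC scores for each observation
summary(pca1)              # proportion of variance explained
plot(pca1, type = "line")  # scree plot
iris.new <- iris
iris.new$PC1 <- pca1$x[, "PC1"]   # attach the first PC as a feature
iris.new$Sepal.Length <- NULL     # drop the variables the PC replaces
iris.new$Sepal.Width <- NULL
iris.new$Petal.Length <- NULL
iris.new$Petal.Width <- NULL
head(iris.new)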
k-means clustering
set.seed(161)
dataset.scaled <- data.frame(scale(dataset[, colnames(dataset) != "factor"]))
km1 <- kmeans(
  x = dataset.scaled[, c("variable1", "variable2")], # Required - must be scaled
  centers = 5, # Required - k = the number of clusters
  iter.max = , # max iterations; default = 10, can try higher (30, etc.)
  nstart = , # number of random initial cluster assignments; default = 1 -> 'best' result kept
  algorithm = # not needed; the Lloyd algorithm was covered, which is not the default
)
km1$cluster # assignments for each obs
km1$tot.withinss #extract total within-cluster variation
km1$betweenss / km1$totss #proportion of total variation that is between clusters
library(ggplot2)
ggplot(dataset.scaled, aes(x = variable1, y = variable2, color = factor(km1$cluster))) +
geom_point()
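A runnable k-means sketch on two scaled iris variables (the variable choice and k are illustrative):
library(ggplot2)
set.seed(161)
iris.scaled <- data.frame(scale(iris[, colnames(iris) != "Species"]))
km1 <- kmeans(x = iris.scaled[, c("Sepal.Length", "Petal.Length")],
              centers = 3,    # k = 3 clusters
              nstart = 20)    # 20 random starts; the best result is kept
km1$tot.withinss            # total within-cluster variation
km1$betweenss / km1$totss   # proportion of variation between clusters
ggplot(iris.scaled, aes(x = Sepal.Length, y = Petal.Length,
                        color = factor(km1$cluster))) +
  geom_point()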
hierarchical clustering
hc1 <- hclust(dist(dataset.scaled)) #requires a distance matrix, e.g. from dist()
plot(hc1) #dendrogram
cutree(hc1, h = 4) # cut at a height of 4
OR cutree(hc1, k = 3) # cut into 3 clusters
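A runnable hierarchical clustering sketch on the same scaled iris data:
iris.scaled <- scale(iris[, colnames(iris) != "Species"])
hc1 <- hclust(dist(iris.scaled))   # hclust() needs a distance matrix
plot(hc1)                          # dendrogram
table(cutree(hc1, k = 3))          # cut into 3 clusters
table(cutree(hc1, h = 4))          # cut at a height of 4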
R code
lm = linear models
mlr.mod1 <- lm(target ~ predictors, data = dataset)
summary(mlr.mod1)
predict(mlr.mod1) -> y-hats (fitted values)
predict(mlr.mod1, newdata = )
->newdata must contain the predictor columns, but the model only uses the ones in the model formula
RMSE: sqrt(sum((dataset$target - predict(mlr.mod1, newdata = dataset))^2) / nrow(dataset))
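A runnable lm sketch, assuming mtcars with mpg as the target and two illustrative predictors:
mlr.mod1 <- lm(mpg ~ wt + hp, data = mtcars)
summary(mlr.mod1)
preds <- predict(mlr.mod1, newdata = mtcars)        # y-hats
sqrt(sum((mtcars$mpg - preds)^2) / nrow(mtcars))    # training RMSE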
R code - Binarization
bin.mod <- dummyVars(~factor, data = dataset, fullRank = T)
predict(bin.mod, newdata = dataset)
new.dataset <- cbind(dataset[, colnames(dataset) != "factor"], predict(bin.mod, newdata = dataset))
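A runnable binarization sketch using the Species factor in iris:
library(caret)
bin.mod <- dummyVars(~ Species, data = iris, fullRank = TRUE)
head(predict(bin.mod, newdata = iris))   # dummy columns; reference level dropped
iris.bin <- cbind(iris[, colnames(iris) != "Species"],
                  predict(bin.mod, newdata = iris))
head(iris.bin)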
R Code - Stratified Sampling
library(caret)
set.seed(161)
ind <- createDataPartition(dataset$target, p = 0.7, list = F)
train <- dataset[ind, ]
test <- dataset[-ind, ]
rm(ind) #remove the index object -> not necessary
mean(train$target)
mean(test$target)
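A runnable stratified-split sketch on iris, stratifying on Sepal.Length:
library(caret)
set.seed(161)
ind <- createDataPartition(iris$Sepal.Length, p = 0.7, list = FALSE)
train <- iris[ind, ]
test <- iris[-ind, ]
mean(train$Sepal.Length)   # the two means should be similar
mean(test$Sepal.Length)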
R Code: Stepwise Selection
Null model: mlr.mod0 <- lm(target ~ 1, data = dataset)
Biggest model: mlr.mod4 <- lm(target ~ ., data = dataset) <-largest model you want to consider
library(MASS)
AIC:
stepAIC(Null model, direction = "forward", scope = list(upper = largest model), k = 2)
AIC -> k = 2; BIC -> k = log(nrow(dataset))
Backward -> no scope needed; if scope is not included, the default for direction is "backward"
k = 2 (default)
direction = “backward” (default)
drop1() -> shows the AIC that would result from dropping each predictor individually, i.e., the first round of backward selection with AIC
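A runnable stepwise sketch on mtcars with mpg as the target (the model choices are illustrative):
library(MASS)
mlr.mod0 <- lm(mpg ~ 1, data = mtcars)    # null model
mlr.mod4 <- lm(mpg ~ ., data = mtcars)    # largest model considered
fwd <- stepAIC(mlr.mod0, direction = "forward",
               scope = list(upper = mlr.mod4), k = 2)   # k = 2 -> AIC
bwd <- stepAIC(mlr.mod4, direction = "backward",
               k = log(nrow(mtcars)))                   # k = log(n) -> BIC
drop1(mlr.mod4)   # AIC impact of dropping each predictor (first backward step)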
R code
function: glm
glm.mod1 <- glm(target ~ predictors, family = distribution(link = "link"), data = dataset, weights = column of weights in dataset)
->if using weights, the target must be on a per-weight basis; also exclude the weights column from the predictors; weights appear only with the normal, gamma, or inverse Gaussian distributions on the exam
predict(glm.mod1, type = "response") -> returns mu-hat (the predicted mean), not the linear predictor (1/mu-hat, etc.)
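A runnable GLM sketch with a gamma distribution and log link, assuming mtcars with mpg as a positive target:
glm.mod1 <- glm(mpg ~ wt + hp, family = Gamma(link = "log"), data = mtcars)
summary(glm.mod1)
head(predict(glm.mod1, type = "response"))   # predictions on the mean (mu-hat) scale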