R notes Flashcards
library(dplyr)
pipes, group by, summarise
group_by(factor column)
summarise(Means = mean(column), Medians = median(column), Freq = n())
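Example - a minimal sketch using the built-in iris data (column names come from iris, not these notes):
library(dplyr)
iris %>%
  group_by(Species) %>%                      # Species is the factor column
  summarise(Means = mean(Sepal.Length),
            Medians = median(Sepal.Length),
            Freq = n())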
Combining levels
library(plyr)
mapvalues(dataset$column, levels(dataset$column), c("name level1", etc.))
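Example - a sketch collapsing two iris Species levels into one (iris and the level names are illustrative):
library(plyr)
iris2 <- iris
iris2$Species <- mapvalues(iris2$Species,
                           from = levels(iris2$Species),        # "setosa" "versicolor" "virginica"
                           to   = c("setosa", "other", "other"))
table(iris2$Species)                                            # versicolor and virginica combined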
Unique values
length(unique(dataset$column))
Create Factor of intervals
cut(dataset$column,
    breaks = c(0, 15, 35, 65, 100),
    labels = c("Child", "Young Adult", "Middle.aged", "Elderly"))
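Example - a sketch with a hypothetical age vector:
age <- c(5, 22, 40, 70, 16)                   # made-up ages
age.group <- cut(age,
                 breaks = c(0, 15, 35, 65, 100),
                 labels = c("Child", "Young Adult", "Middle.aged", "Elderly"))
table(age.group)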
Binary Target - Factor to Numeric
ifelse(factor == "success level", 1, 0)
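Example - iris again, treating "setosa" as the success level:
target.num <- ifelse(iris$Species == "setosa", 1, 0)
table(target.num)                             # 1 = setosa, 0 = everything else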
Relevel
levels(factor)
dataset$column <- relevel(dataset$column, ref = "new reference level")
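Example - iris (relevel works on unordered factors):
levels(iris$Species)                                      # first level ("setosa") is the reference
iris$Species <- relevel(iris$Species, ref = "virginica")
levels(iris$Species)                                      # "virginica" is now the reference level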
Scatterplot variation
ggplot(dataset, aes(x = variable, y = target)) +
geom_count()
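Example - a sketch with mtcars; geom_count sizes each point by how many observations overlap there:
library(ggplot2)
ggplot(mtcars, aes(x = cyl, y = gear)) +
  geom_count()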
Regularization- code
requires matrix object for x: Xmat <- model.matrix(target ~ predictors, data = dataset)
library(glmnet)
rid.mod1 <- glmnet(x = Xmat[, -1], y = dataset$target,
    family = "gaussian",
    lambda = ?,
    alpha = ?,
    standardize = TRUE  # default, so not needed
)
coef(rid.mod1) #all estimates in one place
plot(cv.glmnet()) - produces a CV plot; the left dashed line indicates where CV error is minimized; the right dashed line indicates the one-standard-error rule; the numbers at the top indicate the number of predictors in the model
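Example - a sketch fitting a ridge model to mtcars (the lambda value is purely illustrative; normally it comes from CV):
library(glmnet)
Xmat <- model.matrix(mpg ~ ., data = mtcars)     # design matrix incl. intercept column
rid.mod1 <- glmnet(x = Xmat[, -1], y = mtcars$mpg,
                   family = "gaussian",
                   lambda = 0.5,                 # illustrative value
                   alpha = 0)                    # 0 = ridge
coef(rid.mod1)                                   # all estimates in one place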
Regularization - notes
hyperparameters: alpha and lambda
-alpha = 0 -> Ridge; alpha = 1 -> Lasso
-lambda = 0 -> no shrinkage
predict(rid.mod4, newx = Xmat[, -1])
training RMSE <- sqrt(sum((dataset$target - predict(rid.mod4, newx = Xmat[, -1]))^2) / nrow(dataset))
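Example - continuing the mtcars ridge sketch above (rid.mod1 and Xmat as built there):
pred <- predict(rid.mod1, newx = Xmat[, -1])
sqrt(sum((mtcars$mpg - pred)^2) / nrow(mtcars))   # training RMSE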
Symbols
~ = 'explained by'
+ = 'include'
. = 'all remaining variables'
- = 'exclude'
: = 'interact'
* = 'cross' -> include and interact
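Examples of the symbols in formulas (mtcars and lm are used only for illustration):
lm(mpg ~ wt + hp, data = mtcars)          # include wt and hp
lm(mpg ~ . - carb, data = mtcars)         # all remaining variables except carb
lm(mpg ~ wt:hp, data = mtcars)            # interaction term only
lm(mpg ~ wt * hp, data = mtcars)          # wt + hp + wt:hp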
Regularization - CV code
set.seed(161)
rid.cv <- cv.glmnet(x = Xmat[, -1], y = dataset$target,
    family = "gaussian",
    alpha = ?,
    lambda = ??,
    standardize = TRUE  # default, so not needed
)
-autogenerates lambda if not specified
-nfold is 10 by default
rid.cv$lambda # the lambda values tried (100 by default)
rid.cv$lambda.min #returns best lambda OR rid.cv$lambda.1se
--> use rid.cv$lambda.min as the lambda in the regular glmnet model
coef(rid.mod1)
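Example - continuing the mtcars ridge sketch (Xmat as built above; rid.best is an illustrative name):
set.seed(161)
rid.cv <- cv.glmnet(x = Xmat[, -1], y = mtcars$mpg,
                    family = "gaussian",
                    alpha = 0)                   # lambda grid auto-generated
plot(rid.cv)                                     # CV curve with the two dashed lines
rid.cv$lambda.min
rid.best <- glmnet(x = Xmat[, -1], y = mtcars$mpg,
                   family = "gaussian", alpha = 0,
                   lambda = rid.cv$lambda.min)
coef(rid.best)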
Train function pt 1
library(caret)
set.seed(161)
tree.cv1 <- train(
target ~ predictors,
data = dataset,
method = "rpart",
trControl = trainControl(method = "cv", number = 10),
metric = "RMSE",
tuneGrid = expand.grid(cp = seq(0.0, 0.005, 0.0005)),
control = rpart.control(minsplit = ?, minbucket = ?, maxdepth = ?),
na.action = na.pass
)
plot(tree.cv1)
tree.cv1$results
tree.mod# <- tree.cv1$finalModel # extract the tree with the lowest CV error
rpart.plot(tree.mod#)
tree.cv1$bestTune #extract the optimal cp value
control is optional; rpart defaults are used if it is omitted
na.action shouldn't be needed if we adequately handle missing data at the start
can replace the first two arguments with:
y = dataset$column, x = dataset[, c("predictor1", "predictor2", etc.)]
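Example - a sketch on mtcars mirroring the card above (control omitted to use rpart defaults):
library(caret)
library(rpart.plot)
set.seed(161)
tree.cv1 <- train(
  mpg ~ .,
  data = mtcars,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10),
  metric = "RMSE",
  tuneGrid = expand.grid(cp = seq(0.0, 0.005, 0.0005))
)
plot(tree.cv1)                    # RMSE vs. cp
tree.cv1$bestTune                 # optimal cp
rpart.plot(tree.cv1$finalModel)   # plot the selected tree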
Train pt 2
need a binary factor target
library(caret)
set.seed(161)
tree.cv1 <- train(
factor(target) ~ predictors,
data = dataset,
method = "rpart",
trControl = trainControl(method = "cv", number = 10),
metric = "Accuracy",
tuneGrid = expand.grid(cp = seq(0.0, 1, 0.01)),
na.action = na.pass,
parms = list(split = "gini")  # or "information"
)
plot(tree.cv1) # the higher the accuracy, the better
tree.mod# <- tree.cv1$finalModel # extract the model with the highest CV accuracy
rpart.plot(tree.mod#, extra = 4) #makes tree
tree.cv1$bestTune # extract the optimal cp value
"Accuracy" = percentage of correct predictions = 1 - classification error rate
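Example - a sketch on mtcars, converting am to a labelled binary factor up front instead of using factor() in the formula (mt and tree.cv2 are illustrative names):
library(caret)
library(rpart.plot)
mt <- mtcars
mt$am <- factor(mt$am, labels = c("auto", "manual"))   # binary factor target
set.seed(161)
tree.cv2 <- train(
  am ~ .,
  data = mt,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10),
  metric = "Accuracy",
  tuneGrid = expand.grid(cp = seq(0.0, 1, 0.01)),
  parms = list(split = "gini")
)
tree.cv2$bestTune                              # optimal cp
rpart.plot(tree.cv2$finalModel, extra = 4)     # class probabilities at each node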
Other
poly(variable, 2) fits the same model as variable + I(variable^2) (orthogonal basis by default; add raw = TRUE to get identical coefficients)
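Example - mtcars; raw = TRUE reproduces the wt + I(wt^2) coefficients exactly:
coef(lm(mpg ~ poly(wt, 2, raw = TRUE), data = mtcars))
coef(lm(mpg ~ wt + I(wt^2), data = mtcars))       # same coefficients, different names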
Train pt 3