R notes Flashcards

1
Q

library(dplyr)

A

Pipes, group_by, summarise:

dataset %>%
  group_by(factor_column) %>%
  summarise(Means = mean(target), Medians = median(target), Freq = n())
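
For example, a minimal runnable sketch on the built-in iris data (illustrative only; column choices assumed):

library(dplyr)
iris %>%
  group_by(Species) %>%
  summarise(Means = mean(Sepal.Length),
            Medians = median(Sepal.Length),
            Freq = n())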

1
Q

Combining levels

A

library(plyr)
dataset$column <- mapvalues(dataset$column,
                            from = levels(dataset$column),
                            to = c("name level1", etc.))
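
For example, a hedged sketch collapsing iris$Species to two levels (data choice assumed; duplicate "to" values merge levels):

library(plyr)
iris2 <- iris
iris2$Species <- mapvalues(iris2$Species,
                           from = c("setosa", "versicolor", "virginica"),
                           to = c("setosa", "non-setosa", "non-setosa"))
levels(iris2$Species)  # "setosa" "non-setosa"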

2
Q

Unique values

A

length(unique(dataset$column))

3
Q

Create Factor of intervals

A

cut(dataset$column,
    breaks = c(0, 15, 35, 65, 100),
    labels = c("Child", "Young Adult", "Middle.aged", "Elderly"))
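
For example, a quick illustrative sketch with a made-up age vector (values assumed):

ages <- c(5, 20, 40, 70)
cut(ages,
    breaks = c(0, 15, 35, 65, 100),
    labels = c("Child", "Young Adult", "Middle.aged", "Elderly"))
# Child  Young Adult  Middle.aged  Elderly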

4
Q

Binary Target - Factor to Numeric

A

ifelse(dataset$column == "success level", 1, 0)

5
Q

Relevel

A

levels(dataset$column)

dataset$column <- relevel(dataset$column, ref = "new reference level")

6
Q

Scatterplot variation

A

ggplot(dataset, aes(x = variable, y = target)) +
geom_count()

7
Q

Regularization- code

A

Requires a matrix object for x: Xmat <- model.matrix(target ~ predictors, data = dataset)
library(glmnet)
rid.mod1 <- glmnet(x = Xmat[, -1], y = dataset$target,
                   family = "gaussian",
                   lambda = ?,
                   alpha = ?,
                   standardize = TRUE  # the default, so not needed
)

coef(rid.mod1)  # all estimates in one place
plot(cv.glmnet(...))  # produces a CV plot; the left dashed line marks where CV error is minimized, the right dashed line marks the one-standard-error rule, and the numbers along the top give the number of predictors in the model

8
Q

Regularization - notes

A

Hyperparameters: alpha and lambda
- alpha = 0 -> ridge; alpha = 1 -> lasso
- lambda = 0 -> no shrinkage

predict(rid.mod4, newx = Xmat[, -1])
Training RMSE: sqrt(sum((dataset$target - predict(rid.mod4, newx = Xmat[, -1]))^2) / nrow(dataset))

9
Q

Symbols

A

~ = 'explained by'
+ = 'include'
. = 'all remaining variables'
- = 'exclude'
: = 'interact'
* = 'cross' -> include and interact
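
For example (illustrative formulas on mtcars; variable choices assumed):

lm(mpg ~ . - cyl, data = mtcars)   # include all remaining variables except cyl
lm(mpg ~ wt * hp, data = mtcars)   # same as wt + hp + wt:hp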

10
Q

Regularization - CV code

A

set.seed(161)
rid.cv <- cv.glmnet(x = Xmat[, -1], y = dataset$target,
                    family = "gaussian",
                    alpha = ?,
                    lambda = ??,
                    standardize = TRUE  # the default, so not needed
)
- auto-generates lambda if not specified
- nfolds is 10 by default

rid.cv$lambda  # shows the lambda values tried (100 by default)
rid.cv$lambda.min  # returns the best lambda, OR rid.cv$lambda.1se
--> use rid.cv$lambda.min for lambda in the regular glmnet model

coef(rid.mod1)

11
Q

Train function pt 1

A

library(caret)
set.seed(161)
tree.cv1 <- train(
  target ~ predictors,
  data = dataset,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10),
  metric = "RMSE",
  tuneGrid = expand.grid(cp = seq(0.0, 0.005, 0.0005)),
  control = rpart.control(minsplit = , minbucket = , maxdepth = ),
  na.action = na.pass
)

plot(tree.cv1)
tree.cv1$results
tree.mod# <- tree.cv1$finalModel  # extract the tree with the lowest CV error
rpart.plot(tree.mod#)
tree.cv1$bestTune  # extract the optimal cp value

- control is optional; defaults are used if omitted
- na.action shouldn't be needed if missing data is handled adequately at the start
- the first two arguments can be replaced with:
  y = dataset$target, x = dataset[, c("predictor1", "predictor2", etc.)]

12
Q

Train pt 2

A

Need a binary factor target.
library(caret)
set.seed(161)
tree.cv1 <- train(
  factor(target) ~ predictors,
  data = dataset,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10),
  metric = "Accuracy",
  tuneGrid = expand.grid(cp = seq(0.0, 1, 0.01)),
  na.action = na.pass,
  parms = list(split = "gini" or "information")
)

plot(tree.cv1)  # the higher the accuracy, the better

tree.mod# <- tree.cv1$finalModel  # extract the model with the highest CV accuracy
rpart.plot(tree.mod#, extra = 4)  # plots the tree
tree.cv1$bestTune  # extract the optimal cp value

"Accuracy" = percentage of correct predictions = 1 - classification error rate

13
Q

Other

A

In a model formula, poly(variable, 2) is equivalent to variable + I(variable^2) (poly() uses orthogonal polynomials by default; poly(variable, 2, raw = TRUE) gives the raw terms, but the fitted values are the same either way).
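
For example, a quick check on mtcars (illustrative; variables assumed):

fit1 <- lm(mpg ~ poly(hp, 2), data = mtcars)
fit2 <- lm(mpg ~ hp + I(hp^2), data = mtcars)
all.equal(fitted(fit1), fitted(fit2))  # TRUE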

14
Q

Train pt 3

A
15
Q

RF plots

A

library(pdp)

partial(rf.mod#, pred.var = "variable", plot = T)

16
Q

PCA

A

Base (stats) package:
pca1 <- prcomp(~ . - factor, data = dataset, center = TRUE, scale. = TRUE)
OR pca1 <- prcomp(dataset[, colnames(dataset) != "factor"], center = TRUE, scale. = TRUE)
- exclude factors because this function does not work with factors
- center default = TRUE

pca1$rotation  # access the loadings
pca1$x  # obtain the scores of the PCs (for each observation)
summary(pca1)  # proportion of variance explained
plot(pca1, type = "line")  # scree plot
OR screeplot(pca1, type = "line", npcs = # of PCs to plot)

pca1$x[, 1] or pca1$x[, "PC1"]  # scores of PC1
dataset.new <- dataset
dataset.new$PC1 <- pca1$x[, "PC1"]
dataset.new$variable1 <- NULL  # likewise for variable2, etc.
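
For example, a minimal runnable sketch on iris (illustrative; data choice assumed):

pca1 <- prcomp(iris[, colnames(iris) != "Species"], center = TRUE, scale. = TRUE)
summary(pca1)           # proportion of variance explained by each PC
head(pca1$x[, "PC1"])   # PC1 scores for the first observations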

17
Q

k-means clustering

A

set.seed(161)
dataset.scaled <- data.frame(scale(dataset[, colnames(dataset) != "factor"]))
km1 <- kmeans(
  x = dataset.scaled[, c("variable1", "variable2")],  # required - must be scaled
  centers = 5,  # required - k, the number of clusters
  iter.max = ,  # max iterations; default = 10, can try higher (30, etc.)
  nstart = ,  # number of random initial cluster assignments; default = 1 -> 'best' result chosen
  algorithm =   # irrelevant; we viewed Lloyd, which is not the default
)

km1$cluster  # cluster assignment for each observation
km1$tot.withinss  # extract the total within-cluster variation

km1$betweenss / km1$totss  # proportion of total variation that is between clusters

library(ggplot2)
ggplot(dataset.scaled, aes(x = variable1, y = variable2, color = factor(km1$cluster))) +
  geom_point()
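
For example, a runnable sketch on iris (illustrative; k = 3 assumed):

set.seed(161)
iris.scaled <- data.frame(scale(iris[, colnames(iris) != "Species"]))
km1 <- kmeans(iris.scaled, centers = 3, nstart = 20)
km1$betweenss / km1$totss         # proportion of variation between clusters
table(km1$cluster, iris$Species)  # compare clusters to species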

18
Q

hierarchical clustering

A

hc1 <- hclust(dist(dataset.scaled))  # needs a dissimilarity (dist) object; complete linkage by default

plot(hc1)  # dendrogram
cutree(hc1, h = 4)  # cut at a height of 4
OR cutree(hc1, k = 3)  # cut into 3 clusters
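
For example, a runnable sketch on scaled iris (illustrative; data choice assumed):

iris.scaled <- data.frame(scale(iris[, 1:4]))
hc1 <- hclust(dist(iris.scaled))
plot(hc1)                               # dendrogram
table(cutree(hc1, k = 3), iris$Species)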

19
Q

R code

A

lm = linear models
mlr.mod1 <- lm(target ~ predictors, data = dataset)
summary(mlr.mod1)

predict(mlr.mod1)  -> y-hats
predict(mlr.mod1, newdata = )
-> newdata requires the specific columns, but only the ones included in the model formula are used

RMSE: sqrt(sum((dataset$target - predict(mlr.mod1, newdata = dataset))^2) / nrow(dataset))

20
Q

R code - Binarization

A

library(caret)
bin.mod <- dummyVars(~ factor, data = dataset, fullRank = T)
predict(bin.mod, newdata = dataset)

new.dataset <- cbind(dataset[, colnames(dataset) != "factor"], predict(bin.mod, newdata = dataset))
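
For example, an illustrative sketch binarizing iris$Species (data choice assumed):

library(caret)
bin.mod <- dummyVars(~ Species, data = iris, fullRank = TRUE)
head(predict(bin.mod, newdata = iris))  # indicator columns for versicolor and virginica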

21
Q

R Code - Stratified Sampling

A

R:
library(caret)
set.seed(161)
ind <- createDataPartition(dataset$target, p = 0.7, list = F)
train <- dataset[ind, ]
test <- dataset[-ind, ]

rm(ind) #remove… ->not necessary

mean(train$target)
mean(test$target)

22
Q

R Code: Stepwise Selection

A

Null model: mlr.mod0 <- lm(target ~ 1, data = dataset)
Biggest model: mlr.mod4 <- lm(target ~ ., data = dataset)  <- the largest model you want to consider

library(MASS)
AIC:
stepAIC(null model, direction = "forward", scope = list(upper = biggest model), k = 2)

AIC -> k = 2; BIC -> k = log(nrow(dataset))
Backward -> no scope needed; if scope is not included, the default for direction is "backward"

Defaults: k = 2; direction = "backward" (when scope is not given)

drop1() -> shows what the first round of backward selection with AIC would produce: the AIC if each predictor is dropped
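
For example, a runnable forward-selection sketch on mtcars (illustrative; data choice assumed):

library(MASS)
mod0 <- lm(mpg ~ 1, data = mtcars)
mod.full <- lm(mpg ~ ., data = mtcars)
stepAIC(mod0, direction = "forward", scope = list(upper = mod.full), k = 2)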

23
Q

R code

A

Function: glm

glm.mod1 <- glm(target ~ predictors, family = distribution(link = "link"), data = dataset, weights = column of weights in dataset)

-> If using weights, the target must be on a per-weight basis; also exclude the column of weights from the predictors. On the exam, weights are only used with the normal, gamma, or inverse Gaussian distributions.

predict(glm.mod1, type = "response")  -> this returns mu-hat, not (1/mu-hat), etc.

24
Q

R code

A

Use family = binomial(link = "logit").

exp(coef(modelname))

predict(modelname, type = "response")  # predicted probabilities, NOT predicted odds

To get target predictions: ifelse(predict(modelname, type = "response") > 0.5, 1, 0)
- 0.5 is the cutoff we select to separate the target predictions

25
Q

R code - cont.

A

library(pROC)

target <- target vector
probs <- probs vector
preds <- preds vector

roc <- roc(target, probs)
auc(roc) - returns AUC of the roc object
plot(roc) - graph of roc curve

library(caret)
confusionMatrix(factor(preds), factor(target), positive = "1")

26
Q

R code

A

poi.mod1 <- glm(target ~ predictors, family = poisson(link = "log"), data = dataset)
exp(coef(poi.mod1))

poi.mod2 <- glm(target ~ predictors - w, family = poisson(link = "log"), offset = log(w), data = dataset)
-> w is the exposure column

predict(poi.mod2, type = "response")
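
For example, a hedged sketch on simulated claim-count data (all names and values assumed):

set.seed(161)
df <- data.frame(x = rnorm(200), w = runif(200, 1, 10))
df$claims <- rpois(200, lambda = df$w * exp(0.3 * df$x))
poi.mod2 <- glm(claims ~ x, family = poisson(link = "log"), offset = log(w), data = df)
exp(coef(poi.mod2))                         # multiplicative effects per unit of exposure
head(predict(poi.mod2, type = "response"))  # predicted counts (offset included)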

27
Q

R code

A

rpart() - key hyperparameters:
- minsplit - minimum number of observations a node must have for a split to be attempted
- minbucket - minimum number of observations permitted in a terminal node
- maxdepth - maximum depth of any node of the final tree (root = depth 0)
- cp - complexity parameter (0 most flexible; 1 least flexible)
- xval - number of cross-validations to perform

"anova" is the default for method
maxcompete - maximum number of competitor splits stored for each split
maxsurrogate - maximum number of surrogate splits stored for each split; 0 increases speed
usesurrogate - 0: not used; 1: used in sequence, observation stops if all surrogates are missing; 2: used in sequence, majority direction if all are missing (default)

library(rpart)
set.seed(161)
tree.mod1 <- rpart(target ~ predictors, data = dataset, method = "anova" or "poisson",
                   control = rpart.control(minsplit = , minbucket = , maxdepth = , cp = ,
                                           xval = , maxcompete = , maxsurrogate = , usesurrogate = ))
tree.mod1  # provides data on all nodes in the tree
summary(tree.mod1)

library(rpart.plot)
rpart.plot(tree.mod1, digits = )  # default digits = 2
predict(tree.mod1)

Poisson:
predict(tree.mod1)  -> predicted Poisson rates, not counts
predict(tree.mod1) * train$w  -> obtain predicted Poisson counts

28
Q

R code - pruning

A

rpart already performs CV.

tree.mod1$cptable  # returns the cp table
Columns:
- CP
- nsplit
- rel error (relative error) -> SSE with n splits / SSE with 0 splits
- xerror (CV error)
- xstd (CV standard error)
CP decreases as nsplit increases
As nsplit increases, relative error decreases

cp.min <- tree.mod1$cptable[which.min(tree.mod1$cptable[, "xerror"]), "CP"]

tree.mod1a <- prune.rpart(tree.mod1, cp = cp.min)

plotcp(tree.mod1)  # can select the cp of the leftmost point below the horizontal dotted line (one-standard-error rule)

29
Q

R code

A

Need a binary target.
library(rpart)
set.seed(161)
tree.mod# <- rpart(target ~ predictors, data = dataset, method = "class", parms = list(split = "gini" or "information"))
library(rpart.plot)
rpart.plot(tree.mod#, extra = 4)  # 4 displays the probabilities for each class at each node
predict(tree.mod#, type = "prob" or "class")
# prob - displays class probabilities (the default); class - displays target predictions

Extract only the positive-class probabilities (for use in an ROC curve):
predict(tree.mod#, type = "prob")[, 2] or [, "1"]

tree.mod#
# node), split, n, loss (number of misclassified observations), yval (prediction), (yprob) (probabilities for each class)

30
Q

R code - pruning

A

tree.mod#$cptable  # complexity parameter table
- CP
- nsplit
- rel error (classification error rate with n splits / classification error rate with 0 splits)
- xerror (CV error)
- xstd (CV standard error)

Classification error rate = percentage of observations with wrong predictions
If CV errors are tied, favor the less flexible model (parsimony)
One-standard-error rule -> leftmost point below the horizontal line

plotcp(tree.mod#)
cp.min <- tree.mod#$cptable[which.min(tree.mod#$cptable[, "xerror"]), "CP"]
tree.mod#+1 <- prune.rpart(tree.mod#, cp = cp.min)
rpart.plot(tree.mod#+1, extra = 4)

31
Q

R code - Random Forests

A

library(randomForest)
set.seed(161)
rf.mod1 <- randomForest(
target ~ predictors,
data = dataset,
importance = T, # whether to examine the importance of features
ntree = 101, # number of trees
mtry = 2, # number of candidate predictors for each split
nodesize = 5, # min number of observations permitted for a terminal node
maxnodes = #, # maximum number of terminal nodes
keep.forest = T # if want to obtain predictions for all trees later
)

rf.mod1$importance #variable importance measure
OR
library(caret)
varImp(rf.mod1)

predict(rf.mod1)  # predictions on out-of-bag observations
-> averages over all trees where that observation is out of bag; if an observation is never out of bag, its prediction is NA
predict(rf.mod1, newdata = dataset)  # to get predictions for all observations
predict(rf.mod1, newdata = dataset, predict.all = T)$individual or $aggregate

mtry default (an integer >= 1):
- Factor target (classification) -> floor(sqrt(ncol(x))), i.e. sqrt(number of predictors)
- Non-factor target (regression) -> max(floor(ncol(x)/3), 1), i.e. number of predictors / 3
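
For example, a runnable sketch on iris (illustrative; data choice assumed):

library(randomForest)
set.seed(161)
rf.iris <- randomForest(Species ~ ., data = iris, importance = TRUE, ntree = 101)
rf.iris$mtry        # 2 = floor(sqrt(4)), the classification default
rf.iris$importance  # variable importance measures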

32
Q

R code - Boosting (gbm)

A

library(gbm)
gbm.mod1 <- gbm(
  target ~ predictors,
  data = dataset,
  distribution = "gaussian" or "bernoulli" or "poisson",  # R will guess if empty
  n.trees = 300,  # number of trees
  interaction.depth = 2,  # maximum depth of each tree
  shrinkage = 0.01,  # shrinkage parameter
  n.minobsinnode = 10,  # min number of obs permitted in a terminal node
  bag.fraction = 1  # proportion of observations used for each tree; if < 1, set a seed
)

summary(gbm.mod1)  # variable importance measure
predict(gbm.mod1, n.trees = )  # n.trees = number of trees to use; default = all

33
Q

R code - Boosting (xgboost)

A

library(xgboost)
# requires a special data type for the dataset:
m.f <- model.frame(target ~ predictors, data = head(dataset))  # only the "terms" attribute is needed
m.m <- model.matrix(attr(m.f, "terms"), data = dataset)
d.m <- xgb.DMatrix(m.m, label = dataset$target)

xgb.parm1 <- list(
  booster = "gbtree",  # decision trees
  objective = "reg:squarederror" or "binary:logistic",  # regression / classification
  eval_metric = "rmse" or "auc",  # regression / classification
  max_depth = 2,
  eta = 0.01,  # shrinkage parameter
  gamma = 0,  # minimum reduction of the splitting measure
  subsample = 1,  # proportion of observations used
  colsample_bytree = 1,  # proportion of predictors used
  min_child_weight = 10  # min obs in a node for regression; node purity for classification
)

xgb.mod1 <- xgb.train(data = d.m, nrounds = 300, params = xgb.parm1)
nrounds = xgb.cv1$best_iteration  # extracts the best nrounds (from the CV run below)

set.seed(161)
xgb.cv1 <- xgb.cv(data = d.m, nrounds = 2000, params = xgb.parm1,
                  nfold = 5,
                  prediction = F,
                  print_every_n = 100,
                  early_stopping_rounds = 50,
                  maximize = F
)
# the last printed line gives the optimal value for nrounds

xgb.importance(model = xgb.mod1)
# the Gain column = percentage of contribution to the boosted model

predict(xgb.mod1, newdata = d.m)

34
Q

R code

A

OVERSAMPLING (replicate the positive-class rows k times):
ind.over <- c(which(train$y == 0), rep(which(train$y == 1), times = k))
train.over <- train[ind.over, ]

UNDERSAMPLING (sample the negative class down to the size of the positive class):
set.seed(161)
ind.under <- c(which(train$y == 1), sample(which(train$y == 0), size = sum(train$y == 1)))
train.under <- train[ind.under, ]
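
For example, a toy sketch with simulated imbalanced data (all names and values assumed):

set.seed(161)
train <- data.frame(y = rbinom(100, 1, 0.1), x = rnorm(100))
k <- 3
ind.over <- c(which(train$y == 0), rep(which(train$y == 1), times = k))
train.over <- train[ind.over, ]
table(train$y)       # original class balance
table(train.over$y)  # positive class now appears k times as often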

35
Q

Bivariate - numerical in R

A

cor(dataset$column1, dataset$column2) or cor(dataset[, non-factor columns only])
table(dataset$column1, dataset$column2)

library(dplyr)
dataset %>%
group_by(factor) %>%
summarise(Means = mean(target),
Medians = median(target),
Freq = n())

36
Q

Univariate - graphical in R

A

library(ggplot2)
ggplot(data = dataset, mapping = aes(x = variable)) +
  * geom_histogram/bar/boxplot() +
  labs(x = "label name")

*
histogram - (bins = )  # bins optional but can adjust
bar - (position = "dodge") or (position = "fill")  # fill = area of 1
boxplot - (position = "dodge") or (position = "fill")  # fill = area of 1

37
Q

Bivariate - graphical in R (histogram and bar)

A

library(ggplot2)
ggplot(data = dataset, mapping = aes(x = variable, fill = factor)) +
  * geom_histogram/bar() +
  labs(x = "label name")

*
histogram - (position = "dodge", bins = )  # bins optional but can adjust
bar - (position = "dodge") or (position = "fill")  # fill = area of 1
Consider plotting density instead of counts
-> add y = ..density.. after the fill statement in the aes() on the first line

38
Q

Bivariate - graphical in R (scatterplot and boxplot and facet)

A

library(ggplot2)
ggplot(data = dataset, mapping = aes(x = variable, y = target)) +
  geom_point/count/boxplot() +
  labs(x = "label name", y = "label name")

ggplot(data = dataset, mapping = aes(x = variable, y = target)) +
  geom_point() +
  labs(x = "label name", y = "label name") +
  * facet_wrap(~ factor)

*
scales = "free" - if you don't want the same scaling across facets