R notes Flashcards

1
Q

library(dplyr)

A

pipes (%>%), group_by(), summarise()

dataset %>%
    group_by(factor column) %>%
    summarise(Means = mean(target), Medians = median(target), Freq = n())
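A minimal runnable version of the pattern above, assuming the built-in iris data (Species as the grouping factor, Sepal.Length as the summarised column; these names are illustrative, not from the notes):

library(dplyr)
iris %>%
    group_by(Species) %>%
    summarise(Means = mean(Sepal.Length), Medians = median(Sepal.Length), Freq = n())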

1
Q

Combining levels

A

library(plyr)
dataset$column <- mapvalues(dataset$column, from = levels(dataset$column), to = c("name level1", "name level2", ...))
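A small runnable sketch (hypothetical factor f, not from the notes) showing how mapping two old levels to the same new name combines them:

library(plyr)
f <- factor(c("A", "B", "C", "A"))
f <- mapvalues(f, from = c("A", "B", "C"), to = c("Group1", "Group1", "Group2"))
levels(f) # "Group1" "Group2"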

2
Q

Unique values

A

length(unique(dataset$column))

3
Q

Create Factor of intervals

A

cut(dataset$column,
    breaks = c(0, 15, 35, 65, 100),
    labels = c("Child", "Young Adult", "Middle.aged", "Elderly"))
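A quick runnable check of the call above (hypothetical ages vector, chosen only for illustration):

ages <- c(10, 25, 50, 80)
cut(ages,
    breaks = c(0, 15, 35, 65, 100),
    labels = c("Child", "Young Adult", "Middle.aged", "Elderly"))
# Child, Young Adult, Middle.aged, Elderly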

4
Q

Binary Target - Factor to Numeric

A

ifelse(factor == "success level", 1, 0)

5
Q

Relevel

A

levels(factor)

dataset$column <- relevel(dataset$column, ref = "new reference level")

6
Q

Scatterplot variation

A

ggplot(dataset, aes(x = variable, y = target)) +
geom_count()

7
Q

Regularization- code

A

requires a matrix object for x:
Xmat <- model.matrix(target ~ predictors, data = dataset)

library(glmnet)
rid.mod1 <- glmnet(x = Xmat[, -1], y = dataset$target,
    family = "gaussian",
    lambda = ?,
    alpha = ?,
    standardize = T # the default, so not needed
)

coef(rid.mod1) # all estimates in one place
plot(cv.glmnet()) # produces a CV plot; the left dashed line marks where the CV error is minimized, the right dashed line marks the one-standard-error rule, and the numbers across the top give the number of predictors in the model
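A minimal runnable sketch of the glmnet call, assuming the built-in mtcars data, ridge regression (alpha = 0), and an arbitrary lambda chosen only for illustration:

library(glmnet)
Xmat <- model.matrix(mpg ~ ., data = mtcars) # design matrix; first column is the intercept
rid.mod1 <- glmnet(x = Xmat[, -1], y = mtcars$mpg,
    family = "gaussian",
    lambda = 1, # illustrative value only; normally chosen by CV
    alpha = 0)  # 0 = ridge
coef(rid.mod1) # all estimates in one place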

8
Q

Regularization - notes

A

hyperparameters: alpha and lambda
- alpha = 0 -> ridge; alpha = 1 -> lasso
- lambda = 0 -> no shrinkage

predict(rid.mod4, newx = Xmat[, -1])
training RMSE: sqrt(sum((dataset$target - predict(rid.mod4, newx = Xmat[, -1]))^2) / nrow(dataset))

9
Q

Symbols

A

~ = 'explained by'
+ = 'include'
. = 'all remaining variables'
- = 'exclude'
: = 'interact'
* = 'cross' -> include and interact
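A short illustration of the formula operators, assuming the built-in mtcars data (variable names chosen only for illustration):

lm(mpg ~ ., data = mtcars)       # all remaining variables
lm(mpg ~ . - cyl, data = mtcars) # all remaining variables, excluding cyl
lm(mpg ~ wt + hp, data = mtcars) # include wt and hp
lm(mpg ~ wt:hp, data = mtcars)   # interaction only
lm(mpg ~ wt * hp, data = mtcars) # cross: wt + hp + wt:hp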

10
Q

Regularization - CV code

A

set.seed(161)
rid.cv <- cv.glmnet(x = Xmat[, -1], y = dataset$target,
    family = "gaussian",
    alpha = ?,
    lambda = ??,
    standardize = T # the default, so not needed
)
- auto-generates lambda if not specified
- nfolds is 10 by default

rid.cv$lambda # shows the lambda values tried (100 by default)
rid.cv$lambda.min # returns the best lambda (alternatively, rid.cv$lambda.1se)
-> use rid.cv$lambda.min as the lambda in the regular glmnet model

coef(rid.mod1)
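A runnable sketch of the CV workflow, assuming the same illustrative mtcars design matrix as in the earlier ridge sketch:

library(glmnet)
Xmat <- model.matrix(mpg ~ ., data = mtcars)
set.seed(161)
rid.cv <- cv.glmnet(x = Xmat[, -1], y = mtcars$mpg, family = "gaussian", alpha = 0)
plot(rid.cv)      # CV curve with dashed lines at lambda.min and lambda.1se
rid.cv$lambda.min # best lambda by minimum CV error
rid.mod1 <- glmnet(x = Xmat[, -1], y = mtcars$mpg, family = "gaussian",
    alpha = 0, lambda = rid.cv$lambda.min)
coef(rid.mod1)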

11
Q

Train function pt 1

A

library(caret)
set.seed(161)
tree.cv1 <- train(
    target ~ predictors,
    data = dataset,
    method = "rpart",
    trControl = trainControl(method = "cv", number = 10),
    metric = "RMSE",
    tuneGrid = expand.grid(cp = seq(0.0, 0.005, 0.0005)),
    control = rpart.control(minsplit = , minbucket = , maxdepth = ),
    na.action = na.pass
)

plot(tree.cv1)
tree.cv1$results
tree.mod# <- tree.cv1$finalModel # extract the tree with the lowest CV error
rpart.plot(tree.mod#)
tree.cv1$bestTune # extract the optimal cp value

control is optional; defaults are used if omitted
na.action shouldn't be needed if we adequately handle missing data at the start
the first two arguments can be replaced with:
y = dataset$column, x = dataset[, c("predictor1", "predictor2", ...)]

12
Q

Train pt 2

A

need a binary factor target
library(caret)
set.seed(161)
tree.cv1 <- train(
    factor(target) ~ predictors,
    data = dataset,
    method = "rpart",
    trControl = trainControl(method = "cv", number = 10),
    metric = "Accuracy",
    tuneGrid = expand.grid(cp = seq(0.0, 1, 0.01)),
    na.action = na.pass,
    parms = list(split = "gini" or "information")
)

plot(tree.cv1) # the higher the accuracy, the better

tree.mod# <- tree.cv1$finalModel # extract the model with the highest CV accuracy
rpart.plot(tree.mod#, extra = 4) # plots the tree
tree.cv1$bestTune # extract the optimal cp value

"Accuracy" = percentage of correct predictions = 1 - classification error rate

13
Q

Other

A

poly(variable, 2) in a formula fits the same model as variable + I(variable^2) (same fitted values; add raw = TRUE to get identical coefficients)
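A quick check of the equivalence, assuming the built-in mtcars data (hp chosen only for illustration):

fit1 <- lm(mpg ~ poly(hp, 2), data = mtcars)
fit2 <- lm(mpg ~ hp + I(hp^2), data = mtcars)
all.equal(fitted(fit1), fitted(fit2)) # TRUE - same fitted model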

14
Q

Train pt 3

A
15
Q

RF plots

A

library(pdp)

partial(rf.mod#, pred.var = "variable", plot = T)

16
Q

PCA

A

base package
pca1 <- prcomp(~ . - factor, data = dataset, center = TRUE, scale. = TRUE)
OR pca1 <- prcomp(dataset[, colnames(dataset) != "factor"], center = TRUE, scale. = TRUE)
- exclude factors because this function does not work with factors
- center default = TRUE

pca1$rotation # access the loadings
pca1$x # obtain the scores of the PCs (for each observation)
summary(pca1) # proportion of variance explained
plot(pca1, type = "lines") # scree plot
OR screeplot(pca1, type = "lines", npcs = ) # npcs = number of PCs to plot

pca1$x[, 1] or pca1$x[, "PC1"] # scores of PC1
dataset.new <- dataset
dataset.new$PC1 <- pca1$x[, "PC1"]
dataset.new$variable1 <- NULL # repeat for variable2, etc.
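A runnable sketch of the same steps, assuming the built-in iris data with Species as the factor to exclude:

pca1 <- prcomp(iris[, colnames(iris) != "Species"], center = TRUE, scale. = TRUE)
summary(pca1)             # proportion of variance explained
pca1$rotation             # loadings
head(pca1$x[, "PC1"])     # scores of PC1 for the first few observations
screeplot(pca1, type = "lines") # scree plot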

17
Q

k-means clustering

A

set.seed(161)
dataset.scaled <- data.frame(scale(dataset[, colnames(dataset) != "factor"]))
km1 <- kmeans(
    x = dataset.scaled[, c("variable1", "variable2")], # Required - must be scaled
    centers = 5, # Required - k = the number of clusters
    iter.max = , # max iterations; default = 10, can try higher (30, etc.)
    nstart = , # number of random initial cluster assignments; default = 1 -> 'best' result chosen
    algorithm = # irrelevant; we reviewed Lloyd, which is not the default
)

km1$cluster # cluster assignments for each observation
km1$tot.withinss # extract total within-cluster variation

km1$betweenss / km1$totss

library(ggplot2)
ggplot(dataset.scaled, aes(x = variable1, y = variable2, color = factor(km1$cluster))) +
geom_point()
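A runnable sketch of the workflow, assuming the built-in iris data with Species excluded and the features scaled (k = 3 and the plotted columns are illustrative choices):

set.seed(161)
iris.scaled <- data.frame(scale(iris[, colnames(iris) != "Species"]))
km1 <- kmeans(x = iris.scaled, centers = 3, nstart = 20)
km1$tot.withinss          # total within-cluster variation
km1$betweenss / km1$totss # proportion of variation between clusters

library(ggplot2)
ggplot(iris.scaled, aes(x = Sepal.Length, y = Petal.Length, color = factor(km1$cluster))) +
    geom_point()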

18
Q

hierarchical clustering

A

hc1 <- hclust(dist(dataset.scaled)) # hclust() requires a distance matrix, e.g. from dist()

plot(hc1) # dendrogram
cutree(hc1, h = 4) # cut at a height of 4
OR cutree(hc1, k = 3) # cut into 3 clusters

19
Q

R code

A

lm = linear models
mlr.mod1 <- lm(target ~ predictors, data = dataset)
summary(mlr.mod1)

predict(mlr.mod1) -> fitted values (y-hats)
predict(mlr.mod1, newdata = )
-> newdata must contain the predictor columns, but the model only uses the ones included in the model formula

RMSE: sqrt(sum((dataset$target - predict(mlr.mod1, newdata = dataset))^2) / nrow(dataset))

20
Q

R code - Binarization

A

library(caret)
bin.mod <- dummyVars(~ factor, data = dataset, fullRank = T)
predict(bin.mod, newdata = dataset)

new.dataset <- cbind(dataset[, colnames(dataset) != "factor"], predict(bin.mod, newdata = dataset))

21
Q

R Code - Stratified Sampling

A

R:
library(caret)
set.seed(161)
ind <- createDataPartition(dataset$target, p = 0.7, list = F)
train <- dataset[ind, ]
test <- dataset[-ind, ]

rm(ind) #remove… ->not necessary

mean(train$target)
mean(test$target)

22
Q

R Code: Stepwise Selection

A

Null model: mlr.mod0 <- lm(target ~ 1, data = dataset)
Biggest model: mlr.mod4 <- lm(target ~ ., data = dataset) <- the largest model you want to consider

library(MASS)
AIC:
stepAIC(null model, direction = "forward", scope = list(upper = largest model), k = 2)

AIC -> k = 2; BIC -> k = log(nrow(dataset))
Backward -> no scope needed; if scope is not included, the default for direction is "backward"

k = 2 (default)
direction = "backward" (default)

drop1() -> shows what the first round of backward selection with AIC would produce: the AIC if each predictor is dropped
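A runnable sketch of both directions, assuming the built-in mtcars data (model names follow the notes' convention but are illustrative):

library(MASS)
mlr.mod0 <- lm(mpg ~ 1, data = mtcars) # null model
mlr.mod4 <- lm(mpg ~ ., data = mtcars) # largest model considered
# Forward selection with AIC:
stepAIC(mlr.mod0, direction = "forward", scope = list(upper = formula(mlr.mod4)), k = 2)
# Backward selection with BIC:
stepAIC(mlr.mod4, direction = "backward", k = log(nrow(mtcars)))
drop1(mlr.mod4) # AIC if each predictor is dropped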

23
Q

R code

A

function: glm

glm.mod1 <- glm(target ~ predictors, family = distribution(link = "link"), data = dataset, weights = column of weights in the dataset)

-> if using weights, the target must be on a per-weight basis; also exclude the column of weights from the predictors; weights are only used with the normal, gamma, or inverse Gaussian distributions on the exam

predict(glm.mod1, type = "response") -> this returns mu-hat, not 1/mu-hat, etc.

24
Q

R code

A

use family = "binomial" and link = "logit"

exp(coef(modelname))

predict(modelname, type = "response") -> predicted probabilities, NOT predicted odds

To get target predictions:
ifelse(predict(modelname, type = "response") > 0.5, 1, 0)
-> 0.5 is the cutoff we select to separate the target predictions

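A runnable sketch, assuming the built-in mtcars data with am as the binary target (predictors chosen only for illustration):

log.mod1 <- glm(am ~ wt + hp, family = binomial(link = "logit"), data = mtcars)
exp(coef(log.mod1))                           # odds ratios
probs <- predict(log.mod1, type = "response") # predicted probabilities, not odds
preds <- ifelse(probs > 0.5, 1, 0)            # 0.5 cutoff for class predictions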
25
Q

R code - cont.

A

library(pROC)
target <- target vector
probs <- probs vector
preds <- preds vector

roc <- roc(target, probs)
auc(roc) # returns the AUC of the roc object
plot(roc) # graph of the ROC curve

library(caret)
confusionMatrix(factor(preds), factor(target), positive = "1")

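A self-contained sketch of these steps, reusing the illustrative mtcars logistic model from the previous card:

library(pROC)
library(caret)
log.mod1 <- glm(am ~ wt + hp, family = binomial(link = "logit"), data = mtcars)
probs <- predict(log.mod1, type = "response")
preds <- ifelse(probs > 0.5, 1, 0)
roc1 <- roc(mtcars$am, probs)
auc(roc1)  # AUC of the roc object
plot(roc1) # ROC curve
confusionMatrix(factor(preds), factor(mtcars$am), positive = "1")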
26
Q

R code

A

poi.mod1 <- glm(target ~ predictors, family = poisson(link = "log"), data = dataset)
exp(coef(poi.mod1))

poi.mod2 <- glm(target ~ predictors - w, family = poisson(link = "log"), offset = log(w), data = dataset)
-> w is the exposure column
predict(poi.mod2, type = "response")

27
Q

R code

A

rpart() - key hyperparameters:
- minsplit - minimum number of observations a node must have for a split to be attempted
- minbucket - minimum number of observations permitted in a terminal node
- maxdepth - maximum depth of terminal nodes
- cp - complexity parameter (0 most flexible; 1 least flexible)
- xval - number of cross-validations to perform
- method - "anova" is the default
- maxcompete - maximum number of competitor splits stored for each split
- maxsurrogate - maximum number of surrogate splits stored for each split; 0 increases speed
- usesurrogate - 0 = not used; 1 = used in sequence, observation left behind if all fail; 2 = used in sequence, majority rule if all fail (default)

library(rpart)
set.seed(161)
tree.mod1 <- rpart(target ~ predictors, data = dataset,
    method = "anova" or "poisson",
    control = rpart.control(minsplit = , minbucket = , maxdepth = , cp = , xval = ,
        maxcompete = , maxsurrogate = , usesurrogate = ))

tree.mod1 # provides data on all nodes in the tree
summary(tree.mod1)

library(rpart.plot)
rpart.plot(tree.mod1, digits = ) # default digits = 2

predict(tree.mod1)

Poisson:
predict(tree.mod1) -> predicted Poisson rates, not counts
predict(tree.mod1) * train$w -> predicted Poisson counts

28
Q

R code - pruning

A

rpart already performs CV
tree.mod1$cptable # returns the cp table
columns: CP, nsplit, rel error (relative error -> SSE with n splits / SSE with 0 splits), xerror (CV error), xstd (CV standard error)
CP decreases as nsplit increases
As nsplit increases, relative error decreases

cp.min <- tree.mod1$cptable[which.min(tree.mod1$cptable[, "xerror"]), "CP"]
tree.mod1a <- prune.rpart(tree.mod1, cp = cp.min)

plotcp(tree.mod1) # can select the cp of the leftmost point below the horizontal dotted line (one-standard-error rule)

29
Q

R code

A

need a binary target
library(rpart)
set.seed(161)
tree.mod# <- rpart(target ~ predictors, data = dataset, method = "class",
    parms = list(split = "gini" or "information"))

library(rpart.plot)
rpart.plot(tree.mod#, extra = 4) # 4 displays the probabilities for each class at each node

predict(tree.mod#, type = "prob" or "class") # prob = display probabilities (default); class = display target predictions

Extract only the positive-class probabilities (for use in a ROC curve):
predict(tree.mod#, type = "prob")[, 2] or [, "1"]

tree.mod# output: node), split, n, loss (number of misclassified observations), yval (prediction), (yprob) (probabilities for each class)

30
Q

R code - pruning

A

tree.mod#$cptable # complexity parameter table
- CP
- nsplit
- rel error (classification error rate with n splits / classification error rate with 0 splits)
- xerror (CV error)
- xstd
classification error rate = percentage of observations with wrong predictions

If CV errors are tied, favor the less flexible model due to parsimony
One-standard-error rule -> leftmost point below the horizontal line
plotcp(tree.mod#)

cp.min <- tree.mod#$cptable[which.min(tree.mod#$cptable[, "xerror"]), "CP"]
tree.mod#+1 <- prune.rpart(tree.mod#, cp = cp.min)
rpart.plot(tree.mod#+1, extra = 4)

31
Q

R code - Random Forests

A

library(randomForest)
set.seed(161)
rf.mod1 <- randomForest(
    target ~ predictors,
    data = dataset,
    importance = T, # whether to examine the importance of features
    ntree = 101, # number of trees
    mtry = 2, # number of candidate predictors for each split
    nodesize = 5, # minimum number of observations permitted in a terminal node
    maxnodes = , # maximum number of terminal nodes
    keep.forest = T # needed to obtain predictions for all trees later
)

rf.mod1$importance # variable importance measure
OR library(caret); varImp(rf.mod1)

predict(rf.mod1) # predictions on out-of-bag observations
-> averages over all trees for which that observation is out of bag; if an observation is NOT out of bag for any tree, the prediction is NA
predict(rf.mod1, newdata = dataset) # predictions for all observations
predict(rf.mod1, newdata = dataset, predict.all = T)$individual or $aggregate

mtry default (integer, >= 1):
- factor target (classification) -> floor(sqrt(ncol(x))), i.e. sqrt(number of predictors)
- non-factor target (regression) -> max(floor(ncol(x)/3), 1), i.e. number of predictors / 3

32
Q

R code - Boosting (gbm)

A

library(gbm)
gbm.mod1 <- gbm(
    target ~ predictors,
    data = dataset,
    distribution = "gaussian"/"bernoulli"/"poisson", # R will guess if empty
    n.trees = 300, # number of trees
    interaction.depth = 2, # maximum depth of each tree
    shrinkage = 0.01, # shrinkage parameter
    n.minobsinnode = 10, # minimum number of observations permitted in a terminal node
    bag.fraction = 1 # proportion of observations used for each tree; if < 1, set a seed
)

summary(gbm.mod1) # variable importance measure
predict(gbm.mod1, n.trees = 1) # default n.trees = all

33
Q

R code - Boosting (xgboost)

A

library(xgboost)
*requires a new data type for datasets*
m.f <- model.frame(target ~ predictors, data = dataset)
m.m <- model.matrix(attr(m.f, "terms"), data = dataset)
d.m <- xgb.DMatrix(m.m, label = dataset$target)

xgb.parm1 <- list(
    booster = "gbtree", # decision trees
    objective = "reg:squarederror"/"binary:logistic", # regression/classification
    eval_metric = "rmse"/"auc", # regression/classification
    max_depth = 2,
    eta = 0.01, # shrinkage parameter
    gamma = 0, # minimum reduction of the splitting measure
    subsample = 1, # proportion of data used
    colsample_bytree = 1, # proportion of predictors used
    min_child_weight = 10 # minimum obs in a node for regression; purity of a node for classification
)

xgb.mod1 <- xgb.train(data = d.m, nrounds = 300, params = xgb.parm1)
nrounds = xgb.cv1$best_iteration # extracts the best nrounds

set.seed(161)
xgb.cv1 <- xgb.cv(data = d.m, nrounds = 2000, params = xgb.parm1,
    nfold = 5, prediction = F, print_every_n = 100,
    early_stopping_rounds = 50, maximize = F
) # the last printed line gives the optimal value for nrounds

xgb.importance(model = xgb.mod1) # the gain column = percentage contribution to the boosted model
predict(xgb.mod1, newdata = d.m)

34
Q

R code

A

OVER:
ind.over <- c(which(train$y == 0), rep(which(train$y == 1), times = k))
train.over <- train[ind.over, ]

UNDER:
set.seed(161)
ind.under <- c(which(train$y == 1), sample(which(train$y == 0), size = sum(train$y == 1)))
train.under <- train[ind.under, ]

35
Q

Bivariate - numerical in R

A

cor(dataset$column, dataset$column)
OR cor(dataset[, colnames(dataset) != "factor"]) # numeric columns only

table(dataset$column, dataset$column)

library(dplyr)
dataset %>%
    group_by(factor) %>%
    summarise(Means = mean(target), Medians = median(target), Freq = n())

36
Q

Univariate - graphical in R

A

library(ggplot2)
ggplot(data = dataset, mapping = aes(x = variable)) +
    geom_histogram/bar/boxplot() + *
    labs(x = "label name")

* histogram - (bins = ) # bins optional but can adjust
* bar - (position = "dodge") or (position = "fill") # fill = area of 1
* boxplot - (position = "dodge") or (position = "fill") # fill = area of 1

37
Q

Bivariate - graphical in R (histogram and bar)

A

library(ggplot2)
ggplot(data = dataset, mapping = aes(x = variable, fill = factor)) +
    geom_histogram/bar() + *
    labs(x = "label name")

* histogram - (position = "dodge", bins = ) # bins optional but can adjust
* bar - (position = "dodge") or (position = "fill") # fill = area of 1

Consider density instead of fill -> add "y = ..density.." after the fill statement on line 1

38
Q

Bivariate - graphical in R (scatterplot and boxplot and facet)

A

library(ggplot2)
ggplot(data = dataset, mapping = aes(x = variable, y = target)) +
    geom_point/count/boxplot() +
    labs(x = "label name", y = "label name")

ggplot(data = dataset, mapping = aes(x = variable, y = target)) +
    geom_point() +
    labs(x = "label name", y = "label name") +
    facet_wrap(~ factor) *

* scales = "free" - if you don't want the same scaling