tidyverse Flashcards
count
rawmorg05 %>% count(month, sort = TRUE)
# A tibble: 12 × 2
month n
1 1 [January] 27346
2 8 [August] 27217
add_count adds a column with the counts by group
summarize
With grouping:
mtcars %>%
group_by(cyl) %>%
summarise(mean = mean(disp), n = n())
#> # A tibble: 3 × 3
#> cyl mean n
#> <dbl> <dbl> <int>
#> 1 4 105. 11
#> 2 6 183. 7
#> 3 8 353. 14
summarize with group_by
group_by() tells R that we want to look at the dataset in terms of different groups,
instead of just a single block. If we use group_by() beforehand, we are subtly splitting
up the data into different groups. Then, summarize() will work on each group of data. Below, we use group_by() to tell R that we want to calculate the average for each student’s tests.
avg_score_by_student <- student_scores %>%
group_by(names) %>%
summarize(
avg_writing = mean(new_writing_score)
)
across
graphtrain %>%
summarize(across(where(is.numeric), mean))
remember to give the function without parentheses
We can find that out by supplying two functions to across(): one to compute the median and the other to count the missing values. You supply multiple functions by using a named list to .fns:
df_miss |>
summarize(
across(a:d, list(
median = function(x) median(x, na.rm = TRUE),
n_miss = function(x) sum(is.na(x))
)),
n = n()
)
#> # A tibble: 1 × 9
#> a_median a_n_miss b_median b_n_miss c_median c_n_miss d_median d_n_miss
#> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int>
#> 1 0.139 1 -1.11 1 -0.387 2 1.15 0
#> # ℹ 1 more variable: n <int>
c_across
c_across rowwise operations, e.g. creating a column that is a sum of a few columns
df <- tibble(id = 1:4, w = runif(4), x = runif(4), y = runif(4), z = runif(4))
df %>%
rowwise() %>%
mutate(
sum = sum(c_across(w:z)),
sd = sd(c_across(w:z))
)
#> # A tibble: 4 × 7
#> # Rowwise:
#> id w x y z sum sd
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.790 0.588 0.142 0.225 1.74 0.305
#> 2 2 0.892 0.514 0.781 0.207 2.39 0.305
#> 3 3 0.327 0.317 0.456 0.659 1.76 0.159
#> 4 4 0.351 0.408 0.234 0.715 1.71 0.205
case_when
# Conditions are tested top to bottom and the first TRUE one is used, so the
# most specific test (divisible by both 5 and 7) is listed first.
case_when(
  x %% 35 == 0 ~ "fizz buzz",
  x %% 5 == 0 ~ "fizz",
  x %% 7 == 0 ~ "buzz",
  .default = as.character(x)
)
contains()
# Convert every column whose name contains "SAT" to numeric.
sat_results <- sat_results %>%
  mutate(across(contains("SAT"), as.numeric))
contains with multiple choices, acts like an OR
# Keeps every column whose name contains "m" OR "ar".
mtcars %>%
  select(contains(c("m", "ar")))
This does the same thing as the following:
# Same selection written as a single regular expression.
mtcars %>%
  select(matches("m|ar")) %>%
  head(2)
# mpg am gear carb
#Mazda RX4 21 1 4 4
#Mazda RX4 Wag 21 1 4 4
distinct
distinct set .keep_all to TRUE to keep all the columns, default is FALSE (surprisingly to me)
mtcars |> distinct(gear, .keep_all = TRUE)
filter something containing a certain character string
# Row names are not a regular column, so copy them into one first.
mtcars$type <- rownames(mtcars)
# Keep rows whose type contains "Toyota" or "Mazda" (regex alternation).
mtcars %>%
  filter(str_detect(type, "Toyota|Mazda"))
joins
inner_join(), right_join(), full_join() have the same interface as left_join(). The
difference is which rows they keep: left join keeps all the rows in x, the right join keeps all rows in y, the full join keeps all rows in either x or y, and the inner join only keeps rows that occur in both x and y.
matches
You want to return every column in your data whose name contains a specific string or regular expression.
Solution
# "o.*u" matches names that contain an "o" followed somewhere later by a "u".
table1 %>%
  select(matches("o.*u"))
Variable name in character vector, but no function call issue
for (var in names(mtcars)) {
mtcars %>% count(.data[[var]]) %>% print()
}
Passing name of variable in var, and then putting in tidyverse function
# Summarise one column of `data`: the row count plus that column's min and max.
# `var` is an unquoted column name, forwarded with the embrace operator {{ }}.
# When `data` is grouped, the summary has one row per group.
var_summary <- function(data, var) {
  summarise(
    data,
    n = n(),
    min = min({{ var }}),
    max = max({{ var }})
  )
}
mtcars %>% group_by(cyl) %>% var_summary(mpg)
Generating a variable name programmatically
name <- "susan"
# The glue-style string on the left of := is filled in from `name`,
# so this creates a column called susan.
tibble("{name}" := 2)
Select variables from names in a character vector
vars <- c("mpg", "vs")
# Keep exactly the columns named in `vars`.
mtcars %>% select(all_of(vars))
# Keep everything except the columns named in `vars`.
mtcars %>% select(!all_of(vars))
select(any_of(vars)) would select whichever of them exist, without giving an error if some of them don’t exist.
Imagine you have this simple tibble and you want to count the number of observations, and compute the median of every column
df <- tibble(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
df |> summarize(
n = n(),
across(a:d, median),
)
#> # A tibble: 1 × 5
#> n a b c d
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 10 -0.246 -0.287 -0.0567 0.144
Use two functions with across
df_miss |>
summarize(
across(a:d, list(
median = function(x) {median(x, na.rm = TRUE)},
n_miss = function(x) {sum(is.na(x))}
)),
n = n()
)
#> # A tibble: 1 × 9
#> a_median a_n_miss b_median b_n_miss c_median c_n_miss d_median d_n_miss
#> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int>
#> 1 0.139 1 -1.11 1 -0.387 2 1.15 0
#> # ℹ 1 more variable: n <int>
filter to just rows with at least one of columns a to d being NA
same as df_miss |> filter(is.na(a) | is.na(b) | is.na(c) | is.na(d))
df_miss |> filter(if_any(a:d, is.na))
#> # A tibble: 4 × 4
#> a b c d
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0.434 -1.25 NA 1.60
#> 2 NA -1.43 -0.297 0.776
#> 3 -0.156 -0.980 NA 1.15
#> 4 1.11 NA -0.387 0.704
df_miss |> filter(if_all(a:d, is.na))
#> # A tibble: 0 × 4
#> # ℹ 4 variables: a <dbl>, b <dbl>, c <dbl>, d <dbl>
Function to summarize over all of a set of variables
# Mean of each selected column (ignoring NAs) plus the number of rows.
#
# df:           a data frame; if it is grouped, the result has one row per group.
# summary_vars: tidy-select specification of the columns to average;
#               defaults to every numeric column.
# The result is returned ungrouped (.groups = "drop").
summarize_means <- function(df, summary_vars = where(is.numeric)) {
  df |>
    summarize(
      across({{ summary_vars }}, function(x) {
        mean(x, na.rm = TRUE)
      }),
      n = n(),
      .groups = "drop"
    )
}
diamonds |>
group_by(cut) |>
summarize_means()
#> # A tibble: 5 × 9
#> cut carat depth table price x y z n
#> <ord> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 Fair 1.05 64.0 59.1 4359. 6.25 6.18 3.98 1610
#> 2 Good 0.849 62.4 58.7 3929. 5.84 5.85 3.64 4906
#> 3 Very Good 0.806 61.8 58.0 3982. 5.74 5.77 3.56 12082
#> 4 Premium 0.892 61.3 58.7 4584. 5.97 5.94 3.65 13791
#> 5 Ideal 0.703 61.7 56.0 3458. 5.51 5.52 3.40 21551
diamonds |>
group_by(cut) |>
summarize_means(c(carat, x:z))
#> # A tibble: 5 × 6
#> cut carat x y z n
#> <ord> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 Fair 1.05 6.25 6.18 3.98 1610
#> 2 Good 0.849 5.84 5.85 3.64 4906
#> 3 Very Good 0.806 5.74 5.77 3.56 12082
#> 4 Premium 0.892 5.97 5.94 3.65 13791
#> 5 Ideal 0.703 5.51 5.52 3.40 21551
Compute length of each column using map (just as an example of using map, obviously it’s the same for all of them)
df %>%
map(length)
get type of a vector
typeof(vec)
pivot_longer
songs
track wk1 wk2 wk3
Song A 1 5 9
Song B 4 3 3
# One row per track and week: the wk* column names go into "week",
# their values into "rank".
songs |>
  pivot_longer(
    cols = starts_with("wk"),
    names_to = "week",
    values_to = "rank"
  )
track week rank
Song A wk1 1
Song A wk2 5
Song A wk3 9
Song B wk1 4
pivot_wider
Obs CaseType Accepted
A Single Yes
A Family No
B Single No
B Family Yes
# Fragment: the data frame is expected as the first argument (e.g. via a pipe).
pivot_wider(names_from = "CaseType", values_from = "Accepted")
Obs Single Family
A Yes No
B No Yes
S4 class in R
create a class “Student_Info” with three member variables
# Define the class: each slot is declared with the type it must hold.
setClass(
  "Student_Info",
  slots = list(name = "character", age = "numeric", GPA = "numeric")
)
# Create an instance, giving a value for each slot.
student1 <- new("Student_Info", name = "John", age = 21, GPA = 3.5)
student1
give col sums of a data frame as a vector, as example of map, of course there is an easier way to do this
df |> map_dbl(sum)
pass variables in function to group_by, for example
group_by() https://dplyr.tidyverse.org/reference/group_by.html uses
data-masking, not tidy-selection. We can work around that problem by using
the handy pick() https://dplyr.tidyverse.org/reference/pick.html
function, which allows you to use tidy-selection inside data-masking
functions:
df %>% count(pick(starts_with("z")))
#> # A tibble: 3 × 3
# Count the missing values of one column within groups.
#
# df:         a data frame.
# group_vars: tidy-select specification of the grouping columns; wrapping it
#             in pick() is what lets group_by() accept a selection here.
# x_var:      unquoted name of the column whose NAs are counted.
# The result has one row per group, with the count in n_miss, and is ungrouped.
count_missing <- function(df, group_vars, x_var) {
  df |>
    group_by(pick({{ group_vars }})) |>
    summarize(
      n_miss = sum(is.na({{ x_var }})),
      .groups = "drop"
    )
}
search for function you can’t remember
# Lists the names of objects on the search path that match "replace".
apropos("replace")
apply rows vs. columns
You want apply (see the docs for it). apply(var,1,fun) will apply to rows, apply(var,2,fun) will apply to columns.
a
c.1..2..3. c.10..0..6.
1 1 10
2 2 0
3 3 6
> apply(a,1,min)
[1] 1 0 3
assign with a list of things
paramlist <- list(test.this = 0.20, vec.test = c("hi", "bye"))
# seq_along() gives an empty loop for an empty list, where 1:length() would
# iterate over c(1, 0).
for (ii.param in seq_along(paramlist)) {
  # Create a variable named after the list element, holding its value.
  assign(x = names(paramlist)[[ii.param]], value = paramlist[[ii.param]])
}
assigns test.this to value .20
and vec.test to (“hi”,”bye”)
time a few lines
start.time <- proc.time()
# ... the code being timed goes here ...
# Report the difference between the "elapsed" entries of the two readings.
cat(
  "It took ",
  as.numeric(getElement(proc.time(), "elapsed") - getElement(start.time, "elapsed")),
  "seconds to download the data.\n"
)
lubridate
ymd("2017-01-31"), ymd_hms("2017-01-31 20:11:59"), year(), wday(); use wday(x, label = TRUE) to get the weekday name instead of a number (add abbr = FALSE for the full name, e.g. Tuesday);
# For every Date column in `df`, add three new columns holding its year,
# month and day of the month (named <column>_year, <column>_month, <column>_day).
expand_dates <- function(df) {
  mutate(
    df,
    across(
      where(is.Date),
      list(year = year, month = month, day = mday)
    )
  )
}
lubridate time differences
durations (exact # of seconds), periods (weeks and months), intervals (start and end point)
get number distinct, with summarize
mtcars |> summarize(across(everything(), n_distinct))
break
use break, not break()
formulas
as.formula,
# Builds a formula with weekpay as the response and the entries of
# predictors.including.msas as the right-hand-side terms.
formula.gbm.without.msa <-
  reformulate(response = "weekpay", termlabels = predictors.including.msas)
apply list of functions to something
x2 <- list(c(5, 6, 7))
x2
#> [[1]]
#> [1] 5 6 7
# do.call() takes the function by name and its arguments as a list.
for (f2 in c("mean", "min")) {
  print(do.call(f2, x2))
}
#> [1] 6
#> [1] 5
ggplot2 aes basic example
mtcars %>% ggplot(mapping=aes(x=wt, y=mpg)) +
geom_point()
with colors for different values of cyl:
mtcars %>% ggplot(mapping=aes(x=wt, y=mpg, color=cyl)) + geom_point()
labels and title
# Added to a plot with +; sets the axis labels and the title.
labs(x = "mpg", y = "hwy", title = "A Nice title")
bar plot
ggplot(mpg, aes(x = drv, fill = drv)) +
geom_bar()
Note that fill is here set to be equal to the same variable as x.
bar plot, side by side
# position = "dodge" puts the bars for each cut next to each other
# instead of stacking them.
diamonds %>%
  ggplot(aes(x = color, fill = cut)) +
  geom_bar(position = "dodge")
frequency polygon
geom_freqpoly
facet_wrap
facet wrap draws 5 plots, one for each level of marstat:
graphtrain %>%
ggplot(aes(x=age, y=weekpay)) +
geom_point() +
facet_wrap(~ marstat)
facet_grid
facet grid, draws grid for levels of 2 variables
flipping axes ggplot2
ggplot(data = diamonds, mapping = aes(x = cut)) +
geom_bar() +
coord_flip()
counts over 2 continuous variables
ggplot(smaller, aes(x = carat, y = price)) +
geom_bin2d()
counts over 2 discrete variables
ggplot(diamonds, aes(x = cut, y = color)) +
geom_count()
limit y axis
ggplot(diamonds, aes(x = y)) +
geom_histogram(binwidth = 0.5) +
coord_cartesian(ylim = c(0, 50))
save ggplot
defaults to the last plot
ggsave(filename, device = "pdf")
also can do pdf() with print(ggplot() …)
grep
x <- c("hid", "hai", "hirsute", "bla")
# By default grep() returns the positions of the matching elements.
grep(pattern = "hi", x)
#> [1] 1 3
# value = TRUE returns the matching elements themselves.
grep(pattern = "hi", x, value = TRUE)
#> [1] "hid" "hirsute"
grep(pattern,
x, value = TRUE) to return the values
flatten lists
list_flatten, in purrr
get first element of each element of a list, or element labeled x of each element
# "[" is the subsetting function, so this takes element 1 of each list entry.
sapply(templist[1:3], FUN = "[", 1)
# Pull the element named id out of each list entry.
sapply(tt.cases, FUN = function(x) {
  x$id
})
eval(parse())
x <- list(CA = c(1, 2), NV = c(3, 4), AZ = c(5, 6))
state <- "CA"
# Builds the text "y<- x$CA" and runs it as code. For a plain element lookup,
# y <- x[[state]] gives the same result without evaluating a string.
eval(parse(text = paste0("y<- x$", state)))
This actually does (and evaluates):
y <- x$CA
REMEMBER to include the ‘text=’ part!!
pick
use pick() within something like mutate, so you can use the things you can do with
select()
(mutate uses data-masking, select uses tidy-selection)
pick() provides a way to easily select a subset of columns from your data using
select() semantics while inside a “data-masking” function like mutate() or
summarise(). pick() returns a data frame containing the selected columns for the
current group.
# group_by() wrapper whose `cols` argument takes a tidy-select specification
# (column names, ranges, helpers such as starts_with()).
# pick() turns that selection into the columns group_by() groups on.
my_group_by <- function(data, cols) {
  group_by(data, pick({{ cols }}))
}
# Example: group by column x and every column whose name starts with "z".
df %>% my_group_by(c(x, starts_with("z")))
modular arithmetic
n %% 20
math expressions in plots
or maybe use quote with ggplot2?
# Inside expression(), paste() joins plain text with plotmath terms
# such as hat(x) or z^rho.
plot(x, y,
  xlab = expression(paste("Text here ", hat(x), " here ", z^rho, " and here")),
  ylab = expression(paste("Here is some text of ", phi^{rho})),
  main = "Expressions with Text"
)
pivot longer more complicated
household
#> # A tibble: 5 × 5
#> family dob_child1 dob_child2 name_child1 name_child2
#> <int> <date> <date> <chr> <chr>
#> 1 1 1998-11-26 2000-01-29 Susan Jose
#> 2 2 1996-06-22 NA Mark NA
#> 3 3 2002-07-11 2004-04-05 Sam Seth
#> 4 4 2004-10-10 2009-08-27 Craig Khai
#> 5 5 2000-12-05 2005-02-28 Parker Gracie
Note that we have two pieces of information (or values) for each child: their name and
their dob (date of birth). These need to go into separate columns in the result. Again
we supply multiple variables to names_to, using names_sep to split up each variable name. Note the special name .value: this tells pivot_longer() that that part of the column name specifies the “value” being measured (which will become a variable in the output).
# Column names are split at "_": the first part (dob / name) becomes an
# output column via ".value", the second part goes into "child".
household %>%
  pivot_longer(
    cols = !family,
    names_to = c(".value", "child"),
    names_sep = "_",
    values_drop_na = TRUE
  )
pivot wider
pivot_wider() is the opposite of pivot_longer(): it makes a dataset wider by increasing the number of columns and decreasing the number of rows. It’s relatively rare to need pivot_wider() to make tidy data, but it’s often useful for creating summary tables for presentation, or data in a format needed by other tools.
fish_encounters
#> # A tibble: 114 × 3
#> fish station seen
#> <fct> <fct> <int>
#> 1 4842 Release 1
#> 2 4842 I80_1 1
#> 3 4842 Lisbon 1
#> 4 4842 Rstr 1
gets converted into
fish_encounters %>%
pivot_wider(
names_from = station,
values_from = seen
)
Type 1 error , type 2
False positive, i.e mistaken rejection of null hypothesis
False negative or mistaken lack of rejection
Sensitivity
Specificity
Sensitivity: Rate of getting a positive, given that it’s actually a positive
Specificity: Rate of getting a negative, given that it’s actually a negative
Loops being slow
For loops are slow often because of a copy being made, e.g. subtracting median from every column of a df. But also vectorized functions are written in C code
sample
sample(x, size, replace= FALSE)
rnorm(10)
sample.int(100,10, replace= FALSE)
sort a dataframe by the order of the elements in B
x[order(x$B),]
tilde and .x
We can use either the tilde shorthand with .x or an ordinary anonymous function.
These two are the same:
starwars |> summarize(across(where(is.character), ~ length(unique(.x))))
starwars |> summarize(across(where(is.character), function(x) length(unique(x))))
slice_max
Note that this is similar to using summarize to get the max delay for each group, but this keeps the info for all columns
Also note that it returns two or more rows for a group if there are ties
flights |>
group_by(dest) |>
slice_max(arr_delay, n = 1) |>
relocate(dest)
#> # A tibble: 108 × 19
#> # Groups: dest [105]
#> dest year month day dep_time sched_dep_time dep_delay arr_time
#> <chr> <int> <int> <int> <int> <int> <dbl> <int>
#> 1 ABQ 2013 7 22 2145 2007 98 132
#> 2 ACK 2013 7 23 1139 800 219 1250
#> 3 ALB 2013 1 25 123 2000 323 229
#> 4 ANC 2013 8 17 1740 1625 75 2042
#> 5 ATL 2013 7 22 2257 759 898 121
#> 6 AUS 2013 7 10 2056 1505 351 2347
#> # ℹ 102 more rows
#> # ℹ 11 more variables: sched_arr_time <int>, arr_delay <dbl>, …
Get first number from a string
parse_number() is a handy function that will extract the first number from a string, ignoring all other text.
Example of connecting the points in each group with geom_line
billboard_longer |>
ggplot(aes(x = week, y = rank, group = track)) +
geom_line(alpha = 0.25) +
scale_y_reverse()
Mapping within only one layer
ggplot(mpg, aes(x = displ, y = hwy)) +
geom_point(aes(color = class)) +
geom_smooth()
Different data in one layer
# The last two layers get their own `data`, so they draw only the two-seaters:
# first as red points, then with a larger open red circle around each.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_point(
    data = mpg |> filter(class == "2seater"),
    color = "red"
  ) +
  geom_point(
    data = mpg |> filter(class == "2seater"),
    shape = "circle open", size = 3, color = "red"
  )
rearrange/reorder columns
# Move column a so that it comes right after column c.
df %>% relocate(a, .after = c)
# Move the columns named in a character vector to just before column sleepy.
df %>% relocate(all_of(c("n_group", "gender", "group")), .before = sleepy)