Second Flashcards
Build df from dictionary
pd.DataFrame()
dict = { “country” : [“Nepal”, “India”, “China”], “Capital” : [‘Kathmandu”, “Delhi”, “Bejing”]}
SA = pd.DataFrame(dict)
CSV to DataFrame
df = pd.read_csv(‘path/to/dataframe.csv)
CSV to DataFrame
1. Index the df
index_col = 0
df = pd.read_csv(‘path/to/dataframe.csv, index_col = 0)
Sort from highest to lowest
df.sort_values(‘col’, ascending = False)
Sort by multiple variables
df.sort_values([‘col1’, ‘col2’])
- Sort by the col1
- Then sort by col2
dogs.sort_values([‘weight_kg’, ‘height_cm’], ascending = [True, False])
Subset rows
df[col] > #
0 True
1 False
2 True
Subset rows and get data
df[df[‘col’] > #]
0 Bella Labrador Brown
4 Max Labrador Black
Subset based on a string
df[df[‘col1’] == ‘string’]
Subset based on date
df[df[‘col_date’] > “2015-01-01”]
Subset on multiple condiditons
is_x = df[‘col1’] == ‘x’
is_y = df[‘col2’] == ‘y’
df[is_x & is_y]
col1 col2 number 0 x y 56
Subset using isin()
is_x_or_y = df[‘col1].isin([‘x’, ‘y’])
df[is_x_or_y]
is_black_or_brown = dogs[‘color’].isin([‘black’, ‘brown’])
df[is_black_or_brown]
name color height 0 Bel brown 88 4 Max black 55
Adding new column + mutating df
df[‘new_col’] = df[‘col’] / 100
Individuals per 10K
df[‘per_10k’] = 10000 * df[‘number’] / df[‘total’]
homelessness[‘indv_per_10k’] = 10000 * homelessness[‘individuals’] / homelessness[‘state_pop’]
Summary Stats
df[‘col’].mean()
df[‘col’].median()
df[‘col’].mode()
df[‘col’].min()
df[‘col’].max()
df[‘col’].var()
df[‘col’].std()
df[‘col’].sum()
Quantile
df[‘col’].quantile()
Where a sample is divided into equal-sized, adjacent subgroups
Aggregate summary stats from one column
df[‘col’].agg(‘function’)
df[‘ext price’].agg(‘sum’)
ext price sum 2.1
Aggregate multiple summary stats from multiple columns
df[[‘col1’, ‘col2’]].agg([‘func1’, ‘func2’])
df[[‘price’, ‘quantity’]].agg([‘sum’, ‘mean’])
price quantity sum 2.1 25 mean 1 12
Cumulative Sum
df[‘col1’].cumsum()
Drop duplicates
df.drop_duplicates(subset = ‘col’)
vet_visits.drop_duplicates(subset = ‘breed’)
Drop duplicates multiple columns
df.drop_duplicates(subset = [‘col1’, ‘col2’])
vet_visits.drop_duplicates(subset = [‘name’, ‘breed’])
Count number of occurrences
df[‘col’].value_counts()
unique_dogs[‘breed’].value_counts()
labrador 4
Chow Chow 3
Poodle 2
Count number of occurrences & sort values
df[‘col’].value_counts(sort = True)
Count proportions of Total
df[‘col’].value_counts(normalize = True)
Lab 0.5
Chow 0.25
Poodle 0.25
Grouped Summary Statistics by attribute Pt2
df[df[‘col’] == ‘attribute’][‘measure col’].stat()
dogs[dogs[‘color’] == ‘black’][‘weight’].mean()
dogs[dogs[‘color’] == ‘black’][‘weight’].sum()
dogs[dogs[‘color’] == ‘brown’][‘weight’].sum()
26
24
Grouped Summary Statistics (groupby) Pt2
df. groupby(‘col’)[‘measure col’].stat()
dogs. groupby(‘color’)[‘weight’].mean()
color
black 26
brown 24
Multiple stats groupby summary
df. groupby(‘col’)[‘measure col’].agg([min, max, sum])
dogs. groupby(‘color’)[‘weigth’].agg([min, max, sum])
min max sum color black 24 29 53 brown 24 24 48
Group by multiple variables
df. groupby([‘col1’, ‘col2’])[‘measure col’].stat()
dogs. groupby([‘color’, ‘breed’])[‘weight’].mean()
color breed black chow 25 lab 29 poodle 24 Brown chow 24 lab 25
Group by multiple cols and multiple measures
df.groupby([‘col1’, ‘col2’])[[‘measure col1’], [‘measure col2’]].stat()
Pivot Table
df.pivot_table(values = ‘measure col’, index = ‘group_col’)
By default it takes the mean value of each group
dogs.pivot_table(values = ‘weight’, index = ‘color’)
weight color black 26 brown 24
Pivot Table: Change the statistic
aggfunc =
df.pivot_table(values = ‘measure col’, index = ‘group_col’, aggfunc = np.median)
Pivot Table: Multiple statistics
aggfunc = []
df.pivot_table(values = ‘measure col’, index = ‘group_col’, aggfunc = [np.median, np.mean])
Pivot Table: 2 Variables
columns =
df. pivot_table(values = ‘measure col’, index = ‘group_col’, columns = ‘group_col2’)
dogs. pivot_table(values = ‘weight’, index = ‘color’, columns = ‘breed’)
breed chow lab poodle
color
black NA 29 24
brown 24 24 NA
Pivot table: Fill Missing Values
fill_value =
df. pivot_table(values = ‘measure col’, index = ‘group_col’, fill_value = 0)
dogs. pivot_table(values = ‘weight’, index = ‘color’, columns = ‘breed’, fill_value = 0)
breed chow lab poodle
color
black 0 29 24
brown 24 24 0
Pivot Table: Sum All
df. pivot_table(values = ‘measure col’, index = ‘group_col’, fill_value = 0, margins = True)
dogs. pivot_table(values = ‘weight’, index = ‘color’, columns = ‘breed’, fill_value = 0, margins = True)
breed chow lab poodle All color black 0 29 24 53 brown 24 24 0 48 All 24 53 24