Pandas Basics Flashcards
Import library
pandas
import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns
Import a csv into data frame
pandas
file = "file.csv" df = pd.read_csv(file)
Export a data frame to csv
pandas
df.to_csv("file.csv", sep = "|", index = F
Creating a data frame from a list of lists
pandas
data = [[1, 2, "A"], [3, 4, "B"]] df = pd.DataFrame(data, columns = ["col1", "col2", "col3"])
Creating a data frame from a dictionary
pandas
data = {'col1': [1, 2], 'col2': [3, 4], 'col3': ["A", "B"]} df = pd.DataFrame(data=data)
Get number of rows and columns in a data frame
pandas
df.shape
Viewing top n rows
pandas
df.head(n)
Displaying data type of columns
pandas
df.dtypes
Modifying the data type of a column
pandas
df["col1"] = df["col1"].astype(np.int8)
Display missing value stats and data type
pandas
df.info()
Print descriptive stats
pandas
df.describe()
Filling missing values with a specific value
pandas
df.fillna(0, inplace = True)
Combining data frames: join (merge)
pandas
pd.merge(df1, df2, on = "col3")
Sorting a data frame
pandas
2 alternatives
df.sort_values("col1")) df.sort_values(by='Sales', ascending=False)
Grouping a data frame
pandas
2 alternatives
df.groupby('Region')['Sales'].mean() df.groupby("col3").agg({"col1":sum, "col2":max})
Renaming columns
pandas
df.rename(columns = {"col_A":"col1"})
Deleting columns
pandas
df.drop(columns = ["col1"])
Adding columns (addition method)
pandas
df["col3"] = df["col1"] + df["col2"]
Adding columns (assingment method)
pandas
df = df.assign(col3 = df["col1"] + df["col2"])
Filtering rows: boolean method
pandas
dfx[['b', 'c']] df[df["col2"] > 5] df[(df['Region'] == 'North') & (df['Sales'] > 100)]
Filtering rows: from list
pandas
filter_list = ["A", "C"] df[df["col3"].isin(filter_list)]
Filtering by position
pandas
dfx.iloc[1] #Select single row dfx.iloc[:,1] #Select single column dfx.iloc[1,1] #Select single cell dfx.iloc[:2,:2] #Select group of cells dfx.iloc[1:,1:] #Select group of cells
Filtering: selecting by index
pandas
dfx.loc[1] #Select single row dfx.loc[:,'b'] #Select single column dfx.loc[1,'b'] #Select single cell dfx.loc[:2,['b', 'c']] #Select group of cells dfx.loc['hola'] #Select single row dfx.loc[:,'c'] #Select single column dfx.loc['hola','b'] #Select single cell dfx.loc[:'hola',['b', 'c']] #Select group of cells data.loc[data['condition']]
Set/reset index
pandas
dfx.set_index('d', inplace=True) dfx.reset_index()
Finding unique values (list, count)
pandas
df["col3"].unique() df["col3"].nunique()
Apply a function to a data frame
pandas
def add_cols(row): return row.col1 + row.col2 df["col3"] = df.apply(add_cols, axis=1)
Apply a function to a single column
pandas
def square_col(num): return num**2 df["col3"] = df.col1.apply(square_col) OR data['new_column'] = data['old_column'].apply(lambda x: x * 2)
Mark duplicated rows
pandas
df.duplicated(keep=False)
Drop duplicated rows
pandas
df.drop_duplicates()
Frequency distribution
pandas
df.value_counts("col2")
Reset the index, drop the old index
print(df.reset_index()) df.reset_index(drop=True)
Crosstbulation
pandas
pd.crosstab(df.col1, df.col2)
Pivoting a dataset (to wide format)
pandas
pd.pivot_table(df, index = ["Name"], columns=["Subject"], values='Marks', fill_value=0)
Get the type of an object
pandas
type(df)
Drop rows with missing values
pandas
df.dropna()
Apply a lambda function
pandas
df['Sales'].apply(lambda x: x * 2)
Combining data frames: append
pandas
df2 = pd.concat([df, df])
Get number of row and columns
df.shape
Delete a dataframe
del df del(df)
Add a caption to a dataframe
caption = ‘This is a caption’
df.style.set_caption(caption)
Import from Excel
From Excel
data = pd.read_excel(‘data.xlsx’)
Import from SQL
import sqlite3
conn = sqlite3.connect(‘database.db’)
data = pd.read_sql_query(‘SELECT * FROM table_name’, conn)
Drop rows with missing values
data.dropna()
Trim outliers
Q1 = data[‘column’].quantile(0.25)
Q3 = data[‘column’].quantile(0.75)
IQR = Q3 - Q1
data = data[(data[‘column’] >= Q1 - 1.5 * IQR) & (data[‘column’] <= Q3 + 1.5 * IQR)]
Save data to csv
data.to_csv(‘processed_data.csv’, index=False)
Manipulate dates
data[‘date_column’] = pd.to_datetime(data[‘date_column’])
data[‘month’] = data[‘date_column’].dt.month
Merging dataframes
merged_data = pd.concat([data1, data2], axis=0)
Pivot table
pd.pivot_table(data, values=’value’, index=’category’, columns=’date’, aggfunc=np.sum)
Random sample of data
sample = data.sample(n=100)
Merging data frame based on common column
merged_df = pd.merge(df1, df2, on=’ID’)
Joining based on index
result = df1.join(df2)