manipulate dataframe
A dataframe consists of series. A series is a one-dimensional data structure consisting of an index and values. A dataframe is a two-dimensional structure that has columns in addition to the index and values.
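A minimal sketch of the two structures (values and labels are illustrative):
import pandas as pd
#a Series: one-dimensional, index + values
s = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
#a DataFrame: two-dimensional, index + columns + values
df = pd.DataFrame({'col1': [10, 20, 30], 'col2': [1.5, 2.5, 3.5]}, index=['a', 'b', 'c'])
print(s.index, s.values)
print(df.index, df.columns, df.values)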
query
df.query('(age > 25) & (height > 165) & (gender == "female")')
df.query('(age > @min_age) & (height > @min_height) & (gender == @g)')
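The @-referenced names must exist as Python variables in the local scope; a small sketch with illustrative values:
#variables referenced with @ inside the query string (values are illustrative)
min_age, min_height, g = 25, 165, "female"
df.query('(age > @min_age) & (height > @min_height) & (gender == @g)')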
add row and column names
import numpy as np
import pandas as pd
values = np.random.randint(10, size=(3,7))
df = pd.DataFrame(values, columns=list('ABCDEFG'))
df.insert(0, 'category', ['cat1','cat2','cat3'])
melt
converts a dataframe with a high number of columns into a narrow (long) format. The column specified with the id_vars parameter stays as is, and the other columns are represented under the variable and value columns
df_melted = pd.melt(df, id_vars='category')
stack
df_stacked = df_measurements.stack().to_frame()
#unstack
df_stacked.unstack()
add or drop column
#create a new column from a list
df['city'] = ['Rome','Madrid','Houston']
#inplace parameter is set to True in order to save the changes
df.drop(['E','F','G'], axis=1, inplace=True)
insert
insert a column at a defined location
#insert a column as the first column
df.insert(0, 'first_column', [4,2,5])
add or drop row
new_row = {'A':4, 'B':2, 'C':5, 'D':4, 'city':'Berlin'}
#append is deprecated in newer pandas; pd.concat([df, pd.DataFrame([new_row])], ignore_index=True) is the replacement
df = df.append(new_row, ignore_index=True)
df.drop([3], axis=0, inplace=True)
pivot_table function
#return a dataframe containing the mean value for each cat-city pair
df.pivot_table(index='cat', columns='city', aggfunc='mean')
merge
#merge dataframes on a common id column
pd.merge(df1, df2, on='id')
#if the column names on two dataframes are different
pd.merge(df1, df2, left_on='id', right_on='number')
#select the join method: inner join, left join, right join or outer join
pd.merge(df1, df2, left_on='id', right_on='number', how='left')
join
join merges on the index; it can be more efficient than merge when the join keys are already set as the index
df1.set_index("df1_col1", inplace = True)
df2.set_index("df2_col1", inplace = True)
x = df1.join(df2)
concat
#concatenate along the column axis (side by side)
pd.concat([df1, df2], axis=1)
apply
apply lets you apply a function along an axis of a DataFrame or to a Series.
The axis parameter defines which axis the function is applied along.
If raw=False, each row or column is passed to the function as a Series; if raw=True, ndarray objects are passed instead.
# pandas.DataFrame.apply
DataFrame.apply(func, axis=0, raw=False, result_type=None, args=(), **kwds)
# pandas.Series.apply
Series.apply(func, convert_dtype=True, args=(), **kwds)
#example
import pandas as pd
import numpy as np
df = pd.DataFrame([[np.random.randint(1, 100), np.random.randint(1, 100)]] * 4, columns=['A', 'B'])
print(df)
print(df.apply(lambda x: [5, 10], axis=1))
print(df.apply(lambda x: [5, 10, 15], axis=1, result_type='expand'))
print(df.apply(lambda x: [5, 10], axis=1, result_type='broadcast'))
print(df.apply(lambda x: [5, 10], axis=1, result_type='reduce'))
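A quick check of the raw parameter described above, reusing the same df: with raw=False each row arrives as a Series, with raw=True as a plain ndarray (often faster for purely numerical functions):
print(df.apply(lambda x: type(x).__name__, axis=1, raw=False))  #each row is a Series
print(df.apply(lambda x: type(x).__name__, axis=1, raw=True))   #each row is an ndarray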
#example1
series = pd.Series(np.random.randint(0, 100, 5), name='result')
df = pd.DataFrame(series)
#between_range is a user-defined helper, for example:
def between_range(x, low, high):
    return low <= x <= high
df['in_range'] = df['result'].apply(between_range, args=(25, 75))
print(df)
applymap
applies the specified function to every cell of the DataFrame (in recent pandas versions this method is called DataFrame.map)
df = pd.DataFrame(
{
"A":np.random.randn(5),
"B":np.random.randn(5),
"C":np.random.randn(5),
"D":np.random.randn(5),
"E":np.random.randn(5),
}
)
df.applymap(lambda x:"%.2f" % x)
eval
#create new data
result2 = df.eval('''
years_to_now = 2020 - release_year
new_date_added = @pd.to_datetime(date_added.str.strip(), format='%B %d, %Y', errors='coerce')''')
# use functools.partial to fix the keyword arguments in the expression
from functools import partial
func = partial(pd.to_datetime, format='%B %d, %Y', errors='coerce')
netflix.eval('''
years_to_now = 2020 - release_year
new_date_added = @func(date_added.str.strip())''')
map
Series.map() makes simple element-wise replacement easy
#map with dictionary
data["gender"] = data["gender"].map({"man":1, "women":0})
#map with function
def gender_map(x):
    gender = 1 if x == "man" else 0
    return gender
#function as argument
data["gender"] = data["gender"].map(gender_map)
plot
pandas has built-in visualization features (based on matplotlib)
usage
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.rand(8, 4), columns=['A','B','C','D'])
#since the plots are based on matplotlib, we can change the theme with seaborn
import seaborn as sns
sns.set_palette("pastel", 8)
#sns.set_palette("Blues_r", 8)
#sns.set_palette("magma", 8)
df.plot.bar()
df.plot.barh(stacked=True)
df.plot.area(stacked=True,alpha = 0.9)
df.plot.kde()
df = pd.DataFrame({'a': np.random.randn(1000) + 1,
'b': np.random.randn(1000),
'c': np.random.randn(1000) - 1},
columns=['a', 'b', 'c'])
df.plot.hist(stacked=True, bins=20)
df.plot.hist(alpha=0.5)
df.plot.box()
df['value'].plot()
df.plot.scatter(x='a', y='b') #scatter requires x and y columns
data.plot.hexbin(x='A',y='B')
#subplot
data.plot(subplots=True,layout=(3, 2), figsize=(15, 8))
string methods
StringDtype
strings are stored with the object dtype by default, which has some drawbacks. By specifying dtype='string' or pd.StringDtype() we can use the dedicated StringDtype instead
import pandas as pd
pd.Series(['aaa','bbb','ccc']).dtype
#output: dtype('O')
pd.Series(['aaa','bbb','ccc'], dtype='string').dtype
#output: StringDtype
pd.Series(['aaa','bbb','ccc'], dtype=pd.StringDtype()).dtype
#output: StringDtype
convert text to word series
#explode function to use each separated word as a new item in the series
#after explode all items share the original index (0 here), so the index is reset and dropped
A = pd.Series(text).str.split().explode().reset_index(drop=True)
convert upper-/lowercase
A.str.upper()
A.str.lower()
len
#return length of each string in A
A.str.len()
string series to text
A.str.cat(sep=" ")
replace
it replaces not only whole strings but also matching substrings within each string
A.str.replace('the', 'not-a-word')
regex
#extract part of the string
B = pd.Series(['a1','b4','c3','d4','e3'])
B.str.extract(r'([a-z])([0-9])')
#check whether each string contains the pattern
C = pd.Series(['a1','4b','c3','d4','e3'])
C.str.contains(r'[a-z][0-9]')
count characters in string
B = pd.Series(['aa','ab','a','aaa','aaac'])
B.str.count('a')
filtering
B = pd.Series(['aa','ba','ad'])
B.str.startswith('a')
B.str.endswith('d')
string to categorical
cities = ['New York', 'Rome', 'Madrid', 'Istanbul', 'Rome']
pd.Series(cities).str.get_dummies()
style api
display the dataframe with colored styling
dataframe.style.highlight_null(props='color:white;background-color:black')
dataframe.style.highlight_max(props='color:white;background-color:green')
dataframe.style.highlight_max(props='color:white;background-color:green', axis=1)
dataframe.style.highlight_between(left=100, right=200, props='color:black')
dataframe.style.bar(color='lightblue',height=70,width=70)
sql
create dummy data
df = pd.DataFrame({'name': ['Ann', 'Ann', 'Ann', 'Bob', 'Bob'],
'destination': ['Japan', 'Korea', 'Switzerland',
'USA', 'Switzerland'],
'dep_date': ['2019-02-02', '2019-01-01',
'2020-01-11', '2019-05-05',
'2020-01-11'],
'duration': [7, 21, 14, 10, 14]})
shift()
SELECT name
, destination
, dep_date
, duration
, LEAD(dep_date) OVER(ORDER BY dep_date, name) AS lead1
, LEAD(dep_date, 2) OVER(ORDER BY dep_date, name) AS lead2
, LAG(dep_date) OVER(ORDER BY dep_date, name) AS lag1
, LAG(dep_date, 3) OVER(ORDER BY dep_date, name) AS lag3
FROM df
#the pandas equivalent:
df.sort_values(['dep_date', 'name'], inplace=True)
df=df.assign(lead1 = df['dep_date'].shift(-1),
lead2 = df['dep_date'].shift(-2),
lag1 = df['dep_date'].shift(),
lag3 = df['dep_date'].shift(3))
Date/datetime
SELECT name
, destination
, dep_date
, duration
, DATENAME(WEEKDAY, dep_date) AS day
, DATENAME(MONTH, dep_date) AS month
, DATEDIFF(DAY,
LAG(dep_date) OVER(ORDER BY dep_date, name),
dep_date) AS diff
, DATEADD(DAY, duration, dep_date) AS arr_date
FROM df
#the pandas equivalent:
df['dep_date'] = pd.to_datetime(df['dep_date'])
df['duration'] = pd.to_timedelta(df['duration'], 'D')
df.sort_values(['dep_date', 'name'], inplace=True)
df.assign(day = df['dep_date'].dt.day_name(),
month = df['dep_date'].dt.month_name(),
diff = df['dep_date'] - df['dep_date'].shift(),
arr_date = df['dep_date'] + df['duration'])
Ranking
SELECT name
, destination
, dep_date
, duration
, ROW_NUMBER() OVER(ORDER BY duration, name) AS row_number_d
, RANK() OVER(ORDER BY duration) AS rank_d
, DENSE_RANK() OVER(ORDER BY duration) AS dense_rank_d
FROM df
#the pandas equivalent:
df.sort_values(['duration', 'name']).assign(
row_number_d = df['duration'].rank(method='first').astype(int),
rank_d = df['duration'].rank(method='min').astype(int),
dense_rank_d = df['duration'].rank(method='dense').astype(int))
Aggregate & Partition
SELECT name
, destination
, dep_date
, duration
, MAX(duration) OVER() AS max_dur
, SUM(duration) OVER() AS sum_dur
, AVG(duration) OVER(PARTITION BY name) AS avg_dur_name
, SUM(duration) OVER(PARTITION BY name ORDER BY dep_date
RANGE BETWEEN UNBOUNDED PRECEDING
AND CURRENT ROW) AS cum_sum_dur_name
FROM df
#the pandas equivalent:
df.sort_values(['name', 'dep_date'], inplace=True)
df.assign(max_dur=df['duration'].max(),
sum_dur=df['duration'].sum(),
avg_dur_name=df.groupby('name')['duration']
.transform('mean'),
cum_sum_dur_name=df.sort_values('dep_date')
.groupby('name')['duration']
.transform('cumsum'))
flatten json
import json
#load data using Python JSON module
with open('data/simple.json','r') as f:
data = json.loads(f.read())
import requests
URL = 'http://raw.githubusercontent.com/BindiChen/machine-learning/master/data-analysis/027-pandas-convert-json/data/simple.json'
data = json.loads(requests.get(URL).text)
#flatten a dict, a list of dicts, or a nested dict into a pandas dataframe
df = pd.json_normalize(a_dict)
Flattening JSON with a nested list
#flatten dict
pd.json_normalize(
json_obj,
#record_path is the path to the nested list of records
record_path =['students'],
#meta is data that needs to be included in the flattened results
meta=['school', ['info', 'contacts', 'tel']],
)
#flatten dict list
pd.json_normalize(
json_list,
record_path =['students'],
meta=['class', 'room', ['info', 'teachers', 'math']],
#set the argument errors to 'ignore' so that missing keys will be filled with NaN
#errors='ignore',
#custom Separator using the sep argument
#sep='->'
#add prefix
#meta_prefix='meta-',
#record_prefix='student-'
)
indexing
loc
#select rows and columns by lists of labels
dataframe.loc[[row labels], [column labels]]
#or slice by labels (both the start and stop labels are included)
dataframe.loc[start row label:stop row label, start column label:stop column label]
#or filter rows with a boolean condition, for example
dataframe.loc[dataframe['column_a'] > 0]
iloc
integer-location based indexing by the position of rows and columns.
the end index is not included in the selection
dataframe.iloc[start index row:end index row, start index column:end index column]
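A small sketch, assuming a dataframe df with at least three rows and four columns:
#rows 0-1 and columns 0-2 by position (end index excluded)
df.iloc[0:2, 0:3]
#a single value by position
df.iloc[0, 1]
#specific rows and columns by lists of positions
df.iloc[[0, 2], [1, 3]]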
iterating
DataFrame.iterrows()
for index, row in df.iterrows():
    print(f'Index: {index}, row: {row.values}')
    #only print column_a
    print(f'Index: {index}, column_a: {row.get("column_a", 0)}')
DataFrame.itertuples()
returns an iterator of named tuples containing the index and the column values
# DataFrame.itertuples(index=True, name='Pandas')
for row in df.itertuples():
    print(row)
    print(row.column_name)
Garbage Collector
when processing big data with pandas dataframes, it is important to delete unused references and call gc.collect(), so that the memory is returned to the system
import pandas as pd
import sys #system specific parameters and names
import gc #garbage collector interface
def obj_size_fmt(num):
    if num < 10**3:
        return "{:.2f}{}".format(num, "B")
    elif (num >= 10**3) & (num < 10**6):
        return "{:.2f}{}".format(num/(1.024*10**3), "KB")
    elif (num >= 10**6) & (num < 10**9):
        return "{:.2f}{}".format(num/(1.024*10**6), "MB")
    else:
        return "{:.2f}{}".format(num/(1.024*10**9), "GB")
def memory_usage():
    memory_usage_by_variable = pd.DataFrame({k: sys.getsizeof(v) \
        for (k, v) in globals().items()}, index=['Size'])
    memory_usage_by_variable = memory_usage_by_variable.T
    memory_usage_by_variable = memory_usage_by_variable \
        .sort_values(by='Size', ascending=False).head(10)
    memory_usage_by_variable['Size'] = memory_usage_by_variable['Size'] \
        .apply(lambda x: obj_size_fmt(x))
    return memory_usage_by_variable
#deleting references
del df
del df2
#triggering collection
gc.collect()
#finally check memory usage
memory_usage()
feature engineering
feature engineering is extracting new features from the existing dataset
replace() for Label encoding
dynamically replaces current values with the given values. The new values can be passed as a list, dictionary, Series, str, float, or int
data['Outlet_Location_Type_Encoded'] = data['Outlet_Location_Type'] \
.replace({'Tier 1': 1, 'Tier 2': 2, 'Tier 3': 3})
get_dummies() for One Hot Encoding
converts a categorical variable into one-hot (dummy) variables.
#the drop_first parameter drops the first binary column to avoid perfect multicollinearity
Outlet_Type_Dumm = pd.get_dummies(data=data['Outlet_Type'], columns=['Outlet_Type'], drop_first=True)
pd.concat([data['Outlet_Type'], Outlet_Type_Dumm], axis=1).head()
cut() and qcut() for Binning
groups values of a continuous variable into n bins. qcut divides the values into equal-frequency bins; cut divides them using explicitly defined bin edges
groups = ['Low', 'Med', 'High', 'Exp']
data['Item_MRP_Bin_qcut'] = pd.qcut(data['Item_MRP'], q=4, labels=groups)
data[['Item_MRP', 'Item_MRP_Bin_qcut']].head()
bins = [0, 70, 140, 210, 280]
#or pass an IntervalIndex (note: labels are ignored when bins is an IntervalIndex)
bins = pd.IntervalIndex.from_tuples([(0, 70), (70, 140), (140, 210), (210, 280)])
groups = ['Low', 'Med', 'High', 'Exp']
data['Item_MRP_Bin_cut'] = pd.cut(data['Item_MRP'], bins=bins, labels=groups)
data[['Item_MRP', 'Item_MRP_Bin_cut']].head()
#count how many values fall into each bin
data['Item_MRP_Bin_cut'].value_counts().sort_index()
apply() for Text Extraction
applies a function to every value of a Series (or along an axis of a dataframe)
data['Item_Code'] = data['Item_Identifier'].apply(lambda x: x[0:2])
data[['Item_Identifier', 'Item_Code']].head()
value_counts() and apply() for Frequency Encoding
Frequency Encoding is an encoding technique that encodes categorical feature values as their respective frequencies.
Item_Type_freq = data['Item_Type'].value_counts(normalize=True)
# Mapping the encoded values with original data
data['Item_Type_freq'] = data['Item_Type'].apply(lambda x : Item_Type_freq[x])
print('The sum of Item_Type_freq variable:', sum(Item_Type_freq))
data[['Item_Type', 'Item_Type_freq']].head(6)
groupby() and transform() for Aggregation Features
groupby splits the data into groups, which makes it possible to compute aggregate information that is not visible at row level.
data['Item_Outlet_Sales_Mean'] = data.groupby(['Item_Identifier', 'Item_Type'])['Item_Outlet_Sales']\
.transform(lambda x: x.mean())
data[['Item_Identifier','Item_Type','Item_Outlet_Sales','Item_Outlet_Sales_Mean']].tail()
Series.dt() for date and time based features
data['pickup_year'] = data['pickup_datetime'].dt.year
data['pickup_dayofyear'] = data['pickup_datetime'].dt.dayofyear
data['pickup_monthofyear'] = data['pickup_datetime'].dt.month
data['pickup_hourofday'] = data['pickup_datetime'].dt.hour
data['pickup_dayofweek'] = data['pickup_datetime'].dt.dayofweek
data['pickup_weekofyear'] = data['pickup_datetime'].dt.isocalendar().week #dt.weekofyear is deprecated in newer pandas versions
pipe function
The pipe function takes functions as inputs. These functions need to take a dataframe as input and return a dataframe
def drop_missing(df):
    #drop columns with more than 40% missing values
    thresh = len(df) * 0.6
    df.dropna(axis=1, thresh=thresh, inplace=True)
    return df
def remove_outliers(df, column_name):
    low = np.quantile(df[column_name], 0.05)
    high = np.quantile(df[column_name], 0.95)
    #inclusive='both' replaces the deprecated inclusive=True in newer pandas
    return df[df[column_name].between(low, high, inclusive='both')]
def to_category(df):
    cols = df.select_dtypes(include='object').columns
    for col in cols:
        ratio = len(df[col].value_counts()) / len(df)
        if ratio < 0.05:
            df[col] = df[col].astype('category')
    return df
def copy_df(df):
    return df.copy()
marketing_cleaned = (marketing
    .pipe(copy_df)
    .pipe(drop_missing)
    .pipe(remove_outliers, 'Salary')
    .pipe(to_category))
useful tips
read from clipboard
pd.read_clipboard()
generate test data
pd.util.testing.N = 10
pd.util.testing.K = 5
pd.util.testing.makeDataFrame()
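Note that pd.util.testing is deprecated (and removed in recent pandas versions); a simple alternative sketch that builds random test data with numpy:
import numpy as np
import pandas as pd
df_test = pd.DataFrame(np.random.randn(10, 5), columns=list('ABCDE'))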
save data in compressed file
#compress data to file
df.to_csv('sample.csv.gz', compression='gzip')
#read compressed file to pd
df = pd.read_csv('sample.csv.gz', compression='gzip', index_col=0)
read excel
# callable function to read useful data
# Define a more complex function:
def column_check(x):
    if 'unnamed' in x.lower():
        return False
    if 'priority' in x.lower():
        return False
    if 'order' in x.lower():
        return True
    return True
df = pd.read_excel(src_file, header=1, usecols=column_check)
#or directly with column name list
df = pd.read_excel(
src_file,
header=1,
usecols=['item_type', 'order id', 'order date', 'state', 'priority'])
itertuples instead of iterrows
[sum_square(row[0], row[1]) for _, row in df.iterrows()]
#can be optimized into
[sum_square(a, b) for a, b in df[[0, 1]].itertuples(index=False)]
vectorization
[sum_square(row[0], row[1]) for _, row in df.iterrows()]
#can be optimized into
np.vectorize(sum_square)(df[0], df[1])
#or
np.power(df[0] + df[1], 2)
parallelisation
If your function is I/O bound, meaning it spends most of its time waiting for data (e.g. making API requests over the internet), then multithreading (a thread pool) is the better and faster option
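A minimal sketch of thread-based parallelism for an I/O-bound step; fetch_info and the id column are hypothetical placeholders:
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

def fetch_info(item_id):
    #placeholder for an I/O-bound call, e.g. an HTTP request per id (hypothetical)
    return f'result-for-{item_id}'

df = pd.DataFrame({'id': [1, 2, 3, 4]})  #illustrative data
with ThreadPoolExecutor(max_workers=8) as pool:
    #threads overlap the waiting time of the I/O-bound calls
    df['api_result'] = list(pool.map(fetch_info, df['id']))
print(df)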
Modin
pandas only uses a single CPU core for data processing; for big data it is better to use Modin, which uses Dask or Ray as its backend.
#install
pip install modin[dask]
#usage: change import pandas as pd to import modin.pandas as pd
#other parts are just same as pandas
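A minimal usage sketch (file and column names are illustrative):
import modin.pandas as pd
df = pd.read_csv("large_file.csv")  #same API as pandas, executed on Dask/Ray workers
df.groupby("some_column").mean()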
use dict in pandas
1) specify data type of column
import numpy as np
import pandas as pd
cols =['Price','Landsize','Distance','Type','Regionname']
melb = pd.read_csv(
"/content/melb_data.csv",
usecols = cols,
dtype = {'Price':'int'},
na_values = {'Landsize':9999, 'Regionname':'?'}
)
2) agg data with more options
melb.groupby('Type').agg(
{
'Distance':'mean',
'Price':lambda x: sum(x) / 1_000_000
}
)
melb.groupby('Type').agg(
{'Distance':'mean','Price':'mean'}
).round(
{'Distance':2, 'Price':1}
)
3) replace values in a dataframe
melb.replace(
{
'Type':{'h':'house'},
'Regionname':{'Northern Metropolitan':'Northern'}
}
).head()
reduce pandas dataframe memory
When a column contains strings or mixed data types, it is stored with the object dtype. Converting the object dtype to the categorical dtype reduces memory usage: the categorical dtype assigns each unique value an integer code, stores that code instead of the string, and keeps the unique strings in a lookup table.
#check datatype
df.dtypes
#check memory usage
df.memory_usage(deep=True)
#convert datatype to categorical datatype
df_small = df.copy()
for col in ['Source', 'Target']:
df_small[col] = df_small[col].astype('category')
reduction = df_small.memory_usage(
deep=True).sum() / df.memory_usage(deep=True).sum()
f'{reduction:0.2f}'
run pandas in spark
the pandas API on Spark is supported with Spark 3.2 or above
from pyspark.pandas import read_csv
pdf = read_csv("data.csv")
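A short sketch of moving between the pandas API on Spark and a regular Spark DataFrame (the file name comes from the snippet above):
import pyspark.pandas as ps
psdf = ps.read_csv("data.csv")   #pandas-like API, executed on Spark
sdf = psdf.to_spark()            #convert to a PySpark DataFrame
psdf2 = sdf.pandas_api()         #and back to the pandas API on Spark (Spark 3.2+)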
parallel computing
pandarallel
pandarallel only parallelizes on the driver node, so it is not feasible for distributing work across a Databricks cluster
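pandarallel has to be initialized before the parallel_* methods become available; a minimal setup sketch, after which each pandas call below maps to its parallel counterpart:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)  #spawns worker processes on the local machine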
df.apply(func)
df.parallel_apply(func)
df.applymap(func)
df.parallel_applymap(func)
df.groupby(args).apply(func)
df.groupby(args).parallel_apply(func)
df.groupby(args1).col_name.rolling(args2).apply(func)
df.groupby(args1).col_name.rolling(args2).parallel_apply(func)
df.groupby(args1).col_name.expanding(args2).apply(func)
df.groupby(args1).col_name.expanding(args2).parallel_apply(func)
series.map(func)
series.parallel_map(func)
series.apply(func)
series.parallel_apply(func)
series.rolling(args).apply(func)
series.rolling(args).parallel_apply(func)