Python pandas tutorial
- import pandas and basic set up
- read headers, data types, index, columns value
- print first or last row of file
- slicing records
- sorting/Describing Data
- renaming columns and moving columns
- assignment and Making changes to the data
- save data_frame
- Filtering None
- Filtering Data
- Conditional Changes
- Aggregate Statistics(Group by)
- iterate a data frame
Download “pokemon.csv” from:
https://gist.github.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6
RAW:
https://gist.githubusercontent.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv
Mirror copy (镜像文件):
https://blog.csdn.net/sway_wu/article/details/103590248
import pandas and basic set up
import pandas as pd
import numpy as np
import re
# A width of None asks pandas to detect the terminal width itself instead of
# wrapping wide frames at a fixed number of characters.
pd.options.display.width = None

# Load the data set; the sections below all operate on this frame.
df = pd.read_csv(filepath_or_buffer='pokemon.csv')
read headers, data types, index, columns value
# Column labels of the frame.
column_labels = df.columns
print(column_labels)

# The cell values as a NumPy array.
print(df.values)

# Number of rows, taken from the length of the index.
row_count = len(df.index)
print(row_count)

# Summary statistics produced by describe().
print(df.describe())
print first or last row of file
# First five rows: once via head(), once via a positional row slice.
print(df.head(n=5))
print(df[:5])

# Last six rows: via tail(), via an explicit start/stop slice, and via a
# negative start offset.
total_rows = len(df.index)
print(df.tail(n=6))
print(df[total_rows - 6:total_rows])
print(df[-6:total_rows])
slicing records
# Column selection: a list of labels yields a DataFrame, attribute access
# yields a single column as a Series.
print(df[['Name', 'HP']])
print(df.Name)

hp_attack = ['HP', 'Attack']
print(df[hp_attack])
print(df.loc[:, hp_attack])

# Label-based selection with .loc: the row slice 3:5 includes label 5.
print(df.loc[3:5, ['Total', 'Attack']])

# Position-based selection with .iloc: the stop position is excluded.
print(df.iloc[3:6, [4, 6]])
print(df.iloc[:4])
print(df.iloc[1, 4])

# Boolean mask: keep only the rows whose primary type is "Grass".
is_grass = df['Type 1'] == "Grass"
print(df.loc[is_grass])
sorting/Describing Data
# Sort by name, descending.
by_name_desc = df.sort_values(by='Name', ascending=False)
print(by_name_desc)

# Sort by 'Type 1' ascending, then by 'HP' descending within each type.
by_type_then_hp = df.sort_values(by=['Type 1', 'HP'], ascending=[True, False])
print(by_type_then_hp)
renaming columns and moving columns
# The steps below are demonstrated on derived frames rather than on `df`
# itself: the sections that follow still look columns up by their original
# names ('Sp. Atk', 'Type 2', 'Name', ...) and original positions, so
# renaming or reordering `df` in place would break them.

# Rename individual columns; rename() returns a new frame when inplace is
# not requested.
renamed_df = df.rename(columns={'Sp. Atk': 'sp_atk', 'Sp. Def': 'sp_def'})

# Reorder columns by selecting them in the desired order (here 'Total' is
# moved behind 'Speed').
reordered_df = renamed_df[[
    '#', 'Name', 'Type 1', 'Type 2', 'HP', 'Attack', 'Defense', 'sp_atk',
    'sp_def', 'Speed', 'Total', 'Generation', 'Legendary',
]]

# Replace every header at once; the list must contain exactly one label per
# column. Work on a copy so the assignment does not touch another frame.
relabelled_df = reordered_df.copy()
relabelled_df.columns = ['#', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l']
assignment and Making changes to the data
# Overwrite a single cell (row label 9, column 'HP') and show that row.
df.loc[9, ['HP']] = 110
print(df.iloc[9:10])

# New frame in which every missing value is replaced by the string 'None'.
df1 = df.where(df.notnull(), 'None')

# Two ways of building a total column: adding the stat columns by name ...
df['Total_x'] = (
    df['HP']
    + df['Attack']
    + df['Defense']
    + df['Sp. Atk']
    + df['Sp. Def']
    + df['Speed']
)
# ... and summing the columns at positions 5 through 10 row-wise
# (presumably the same six stat columns -- depends on the CSV's column order).
df['Total_y'] = df.iloc[:, 5:11].sum(axis=1)
print(df.sort_values(by=['Total_x'], ascending=False))

# Remove the hand-built total and the original 'Total' column.
df = df.drop(['Total_x', 'Total'], axis=1)

# Move the last column to position 4, keeping the first four columns in front
# and the columns formerly at positions 4..11 behind it.
cols = df.columns.tolist()
print(cols)
df = df[cols[:4] + cols[-1:] + cols[4:12]]
cols = df.columns.tolist()
print(cols)
save data_frame
# Write the frame as tab-separated text, leaving out the row index.
df.to_csv(path_or_buf='modified.txt', sep='\t', index=False)
Filtering None
# Rows that have a value in 'Type 2'.
print(df[df['Type 2'].notnull()])

# Rows without a value in 'Type 2': invert the boolean mask with the
# vectorized ~ operator instead of negating element by element in Python.
temp_list1 = df['Type 2'].notnull()
temp_list2 = ~temp_list1
print(df[temp_list2])
Filtering Data
# Rows with at least 200 HP.
print(df[df['HP'] >= 200])

# Rows whose primary type is one of the listed values.
print(df[df['Type 1'].isin(['Flying', 'Fairy'])])

# Overwrite 'Speed' with 5: first for row labels 3 through 5, then for
# every row.
df.loc[3:5, 'Speed'] = np.full(3, 5)
df.loc[:, 'Speed'] = np.full(len(df), 5)

# Derived column: mean of the two special stats.
df['av_Sp'] = (df['Sp. Atk'] + df['Sp. Def']) / 2

# Combine several conditions with & (each comparison needs its own
# parentheses), then renumber the surviving rows from 0.
grass_poison_sturdy = (
    (df['Type 1'] == 'Grass')
    & (df['Type 2'] == 'Poison')
    & (df['HP'] >= 70)
)
new_df = df.loc[grass_poison_sturdy].reset_index(drop=True)

# Substring match on the name, and its negation.
has_mega = df['Name'].str.contains('Mega')
print(df.loc[has_mega])
print(df.loc[~has_mega])
Conditional Changes
# Case-insensitive regular-expression match on the primary type.
fire_or_grass = df['Type 1'].str.contains('FIRE|Grass', flags=re.I, regex=True)
print(df.loc[fire_or_grass])

# Names that start with "pi" (case-insensitive).
starts_with_pi = df['Name'].str.contains('^pi[a-z]*', flags=re.I, regex=True)
print(df.loc[starts_with_pi])

# Conditional assignment: relabel every 'Fire' primary type.
df.loc[df['Type 1'] == 'Fire', 'Type 1'] = 'Flamer'

# Set two columns at once for the rows that satisfy the condition.
# The original 'Total' column was dropped in the "making changes" section,
# so the condition uses the recomputed 'Total_y' column that is still present.
df.loc[df['Total_y'] > 500, ['Generation', 'Legendary']] = ['Test 1', 'Test 2']
Aggregate Statistics(Group by)
# Number of non-missing entries per primary type, smallest group first.
print(df.groupby(['Type 1']).count().sort_values('#', ascending=True))

# Mean of the numeric columns per primary type, ordered by mean attack.
# numeric_only=True is required because the frame also holds string columns,
# and averaging those raises a TypeError on current pandas versions.
print(df.groupby(['Type 1']).mean(numeric_only=True).sort_values('Attack', ascending=True))

# Counts grouped by one key, then by two keys (the second result has a
# two-level index). The second assignment replaces the first.
df_count = df.groupby(['Type 1']).count()
df_count = df.groupby(['Type 1', 'Type 2']).count()
iterate a data frame
# Walk the frame row by row; iterrows() yields (index label, row Series).
for index, row in df.iterrows():
    print(index, row['Type 1'], row['HP'])

# Stream the file saved earlier in chunks of five rows. It was written as
# tab-separated 'modified.txt', so it is read back under that name with the
# same separator.
# NOTE: the loop variable is `df`, so after the loop `df` refers to the last
# chunk rather than to the full frame.
for df in pd.read_csv('modified.txt', sep='\t', chunksize=5):
    print(df)

# Empty frame that reuses the column layout of the chunk read last.
new_df = pd.DataFrame(columns=df.columns)
print(new_df)

# Same chunked reading for the original comma-separated file.
for df2 in pd.read_csv('pokemon.csv', chunksize=5):
    print(df2)