一、生成数据表
1 首先导入pandas库,通常会用到numpy库,所以一起导入
import numpy as np
import pandas as pd
2 导入csv或者xlsx文件(包括两种方法)
# Method 1: load a file straight into a DataFrame.
# read_csv / read_excel already return a DataFrame, so no pd.DataFrame() wrapper is needed.
# header=1 takes the column names from the second line of the file; the line before it is skipped.
df = pd.read_csv('name.csv', header=1)
# Alternative for Excel workbooks (rebinds df).
df = pd.read_excel('name.xlsx')
# Method 2: parse a tab-separated text file line by line into records.
import pandas as pd
from collections import namedtuple

Item = namedtuple('Item', 'reply pv')
items = []
# The built-in open() decodes UTF-8 itself via encoding=, so no extra module is required.
with open('reply.pv.07', 'r', encoding='utf-8') as f:
    for line in f:
        # Each line is expected to hold at least two tab-separated fields: reply, pv.
        line_split = line.strip().split('\t')
        items.append(Item(line_split[0].strip(), line_split[1].strip()))
df = pd.DataFrame.from_records(items, columns=['reply', 'pv'])
3 直接用pandas创建数据表
# Build the sample table directly from in-memory data.
# NOTE(review): the dict key is spelled 'data' while `columns` asks for 'date'.
# pandas keeps only the names listed in `columns`, so 'data' is dropped and the
# 'date' column comes out entirely NaN. Rename the key to 'date' if the
# generated dates should actually appear in the table.
df = pd.DataFrame(
    {
        'id': [1001, 1002, 1003, 1004, 1005, 1006],
        'data': pd.date_range('20130102', periods=6),
        'city': ['Beijing', 'SH', 'guangzhou', 'Shenzhen', 'shanghai', 'BEIJING'],
        'age': [23, 44, 54, 32, 34, 32],
        'category': ['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
        'price': [1200, np.nan, 2133, 5433, np.nan, 4432],
    },
    columns=['id', 'date', 'city', 'category', 'age', 'price'],
)
二、数据表信息查看
1 维度查看
# shape is an attribute holding a (rows, columns) tuple, not a method: do not call it.
df.shape
# print(df.shape())  # wrong -> TypeError: 'tuple' object is not callable
print(df.shape)  # correct
解决方案:shape 是属性而不是方法,去掉括号写成 df.shape,即可避免 'tuple' object is not callable 错误。
2 数据表基本信息(维度、列名称、数据格式、所占空间)
# info() prints the summary itself and returns None.
df.info()
print(df.info())  # works, but an extra "None" is printed after the summary
print(df.info)  # no call: prints the bound method's repr instead of the summary
3 每一列数据的格式
# dtypes is an attribute (a Series of per-column dtypes), not a method: do not call it.
df.dtypes
# print(df.dtypes())  # wrong -> TypeError: 'Series' object is not callable
print(df.dtypes)  # correct
4 某一列格式
df['price'].dtype  # dtype of a single column; use a column that exists in df
5 空值
df.isnull()  # boolean DataFrame of the same shape: True where a value is missing
6 查看某一列空值
# Select the column first, then test it for missing values.
df['price'].isnull()
# 0    False
# 1     True
# 2    False
# 3    False
# 4     True
# 5    False
# Name: price, dtype: bool
7 查看某一列的唯一值
df['price'].unique()  # distinct values of one column, in order of first appearance
[1200. nan 2133. 5433. 4432.] # 只保留不重复的值
8 查看数据表的值
df.values  # the table's data as a NumPy ndarray (no index or column labels)
[[1001 nan 'Beijing' '100-A' 23 1200.0]
[1002 nan 'SH' '100-B' 44 nan]
[1003 nan 'guangzhou' '110-A' 54 2133.0]
[1004 nan 'Shenzhen' '110-C' 32 5433.0]
[1005 nan 'shanghai' '210-A' 34 nan]
[1006 nan 'BEIJING' '130-F' 32 4432.0]]# 以数组(numpy.ndarray)的形式返回
9 查看列名称
df.columns  # Index object holding the column labels
Index(['id', 'date', 'city', 'category', 'age', 'price'], dtype='object')
10 查看前5行和后5行数据
# First 5 rows by default; pass n to change the count.
df.head()
# Last 5 rows by default; pass n to change the count.
df.tail()
三、数据表清洗
1 用数字0填充空值
# Returns a new DataFrame with every missing value replaced by 0.
# df itself is unchanged; assign the result to keep it.
df.fillna(value=0)
     id date       city category  age   price
0  1001    0    Beijing    100-A   23  1200.0
1  1002    0         SH    100-B   44     0.0
2  1003    0  guangzhou    110-A   54  2133.0
3  1004    0   Shenzhen    110-C   32  5433.0
4  1005    0   shanghai    210-A   34     0.0
5  1006    0    BEIJING    130-F   32  4432.0
2 使用列price的均值对NAN进行填充
# mean() skips NaN by default, so the fill value is the mean of the known prices.
# Returns a new Series; df['price'] is unchanged unless the result is assigned back.
df['price'].fillna(df['price'].mean())
3 清除city字段的字符空格
df['city']=df['city'].map(str.strip)# pass the function str.strip itself (no parentheses); map calls it on each value
4 大小写转换
df['city']=df['city'].str.lower()# lower() must be called here: keep the parentheses
5 更改列名称
# Returns a new DataFrame with the column renamed.
# df itself is unchanged; assign the result to keep it.
df.rename(columns={'category':'category-size'})
6 更改数据格式
# 'price' contains NaN, which plain 'int' cannot represent (astype('int') raises).
# The nullable 'Int64' dtype converts the numbers and keeps missing entries as <NA>.
df['price'].astype('Int64')
7 删除后出现的重复值
# Keeps the first occurrence of each value (keep='first' is the default) and drops later repeats.
df['city'].drop_duplicates()
0 Beijing
1 shanghai
2 guangzhou
3 Shenzhen
5 BEIJING
Name: city, dtype: object
8 删除先出现的重复值
# keep='last' keeps the final occurrence of each value and drops the earlier ones.
df['city'].drop_duplicates(keep = 'last')
0 Beijing
2 guangzhou
3 Shenzhen
4 shanghai
5 BEIJING
Name: city, dtype: object
9 数据替换
# Replaces entries whose whole value equals 'sh'; substrings inside longer values are not touched.
# Returns a new Series; assign it back to df['city'] to keep the change.
df['city'].replace('sh','shanghai')
四、数据预处理
# Second sample table, keyed by 'id', used for the merge examples.
ids = list(range(1001, 1009))
genders = ['male', 'female'] * 4
pay_flags = ['Y', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y']
points = [10, 12, 20, 40, 40, 40, 30, 20]
df1 = pd.DataFrame(
    {
        'id': ids,
        'gender': genders,
        'pay': pay_flags,
        'm-point': points,
    }
)
1 数据表合并
1.1 merge
# Join df with df1 using each of the four join types. With no `on=` given,
# pandas joins on the column names the two tables have in common.
df_inner = df.merge(df1, how='inner')
df_left = df.merge(df1, how='left')
df_right = df.merge(df1, how='right')
df_outer = df.merge(df1, how='outer')
for merged in (df_inner, df_left, df_right, df_outer):
    print(merged)