数据规整:
~
df2:
fruit color price
0 apple red 8.5
1 banana yellow 6.8
2 orange yellow 5.6
3 apple cyan 7.8
4 banana cyan 6.4
#根据多列名分组
print(df2.groupby(by=['fruit', 'color'])['price'].mean())
fruit color
apple cyan 7.8
red 8.5
banana cyan 6.4
yellow 6.8
orange yellow 5.6
Name: price, dtype: float64
~
df2:
a b c d e
sum -0.792502 -0.968316 -0.617287 0.437361 -0.053810
make -0.475013 -1.281450 0.313900 -1.278580 -0.937860
tom 0.341775 0.796679 -1.321376 0.133645 -0.227819
bob -0.666465 -0.408208 -0.173661 -2.080733 -0.003080
#按字典-columns分组
m = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red'}
print(df2.groupby(by=m, axis=1).sum())
blue red
sum -0.179926 -1.814628
make -0.964680 -2.694323
tom -1.187731 0.910635
bob -2.254394 -1.077752
#按照series,分组求组内数量
s1 = pd.Series(m)
print(df2.groupby(by=s1, axis=1).count())
blue red
sum 2 3
make 2 3
tom 2 3
bob 2 3
#按照函数 长度函数分组
print(df2.groupby(by=len).sum())
a b c d e
3 -1.117192 -0.579845 -2.112324 -1.509727 -0.284709
4 -0.475013 -1.281450 0.313900 -1.278580 -0.937860
文件读取:
#读取文件 如果csv文件本身就没有标题 pd会自动把第一行当作标题, 所以为了避免数据被当成表头的情况 header=None
df = pd.read_csv('房屋信息.csv', header=None)
print(df)
#也可以主动添加表头 names
df = pd.read_csv('房屋信息.csv', names=['a', 'b', 'c', 'd', 'message'])
print(df)
#换索引列 将message当作行索引 col代表列
df = pd.read_csv('房屋信息.csv', names=['a', 'b', 'c', 'd', 'message'], index_col='message')
print(df)
#index_col也可以列表形式 构造多层索引
df = pd.read_csv('房屋信息.csv', names=['a', 'b', 'c', 'd', 'message'], index_col=['a', 'b'])
print(df)
数据清洗:
~
data1:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
#丢弃全部都为nan的行
print(data1.dropna(how='all'))
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
#丢弃掉全部为nan的列
print(data1.dropna(how='all', axis=1))
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
~
df:
0 1 2
0 1.714094 -1.506848 -0.110878
1 0.000021 -0.505049 -0.004595
2 -2.349841 0.638547 -0.087270
3 0.560394 -0.653842 -0.636631
4 0.447427 0.480138 0.196381
5 1.103654 0.899888 -0.717674
6 -0.098616 0.458054 1.213029
#将部分值改为nan
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
print(df)
0 1 2
0 1.714094 NaN NaN
1 0.000021 NaN NaN
2 -2.349841 NaN -0.087270
3 0.560394 NaN -0.636631
4 0.447427 0.480138 0.196381
5 1.103654 0.899888 -0.717674
6 -0.098616 0.458054 1.213029
#thresh=2 保留至少有两个非缺失值的行, 即丢弃非缺失值少于两个的行
print(df.dropna(thresh=2))
0 1 2
2 -2.349841 NaN -0.087270
3 0.560394 NaN -0.636631
4 0.447427 0.480138 0.196381
5 1.103654 0.899888 -0.717674
6 -0.098616 0.458054 1.213029
#填充数据
print(df.fillna(0))
0 1 2
0 1.714094 0.000000 0.000000
1 0.000021 0.000000 0.000000
2 -2.349841 0.000000 -0.087270
3 0.560394 0.000000 -0.636631
4 0.447427 0.480138 0.196381
5 1.103654 0.899888 -0.717674
6 -0.098616 0.458054 1.213029
#分类讨论 把第1列的缺失值改成1, 把第2列的缺失值改成0
print(df.fillna({1:1, 2:0})) #添加inplace=True 原地修改
0 1 2
0 1.714094 1.000000 0.000000
1 0.000021 1.000000 0.000000
2 -2.349841 1.000000 -0.087270
3 0.560394 1.000000 -0.636631
4 0.447427 0.480138 0.196381
5 1.103654 0.899888 -0.717674
6 -0.098616 0.458054 1.213029
~
df2:
0 1 2
0 -0.070229 -0.853853 -0.739137
1 -0.947690 -1.608765 0.059307
2 2.758675 NaN -0.603239
3 -1.165221 NaN -0.762415
4 0.083660 NaN NaN
5 -0.244539 NaN NaN
#向下填充缺失值 同列最近非缺失值 元素填充缺失值
print(df2.fillna(method='ffill')) #记住这个参数 ffill
0 1 2
0 -0.070229 -0.853853 -0.739137
1 -0.947690 -1.608765 0.059307
2 2.758675 -1.608765 -0.603239
3 -1.165221 -1.608765 -0.762415
4 0.083660 -1.608765 -0.762415
5 -0.244539 -1.608765 -0.762415
#限制向下填充的个数
print(df2.fillna(method='ffill', limit=2))
0 1 2
0 -0.070229 -0.853853 -0.739137
1 -0.947690 -1.608765 0.059307
2 2.758675 -1.608765 -0.603239
3 -1.165221 -1.608765 -0.762415
4 0.083660 NaN -0.762415
5 -0.244539 NaN -0.762415
~
data:
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
6 two 4
#返回布尔 判断是否重复
print(data.duplicated())
0 False
1 False
2 False
3 False
4 False
5 False
6 True
#删除重复数据 返回新的DataFrame对象(不修改原数据)
print(data.drop_duplicates())
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
#也可以按照某类进行删除重复数据的操作--k1列如果有重复则删除整行保留最上面的
print(data.drop_duplicates(['k1']))
k1 k2
0 one 1
1 two 1
#当遇到重复值时 保留后面的数值 而 删除前面的数值
print(data.drop_duplicates(keep='last')) #last参数是保留最后的
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
6 two 4
~
datas:
food price
0 apple 1.0
1 bnana 2.0
2 orange 3.5
3 mangguo 2.9
4 Apple 3.5
meat = {
'apple': 'furit',
'bnana': 'veg',
'mangguo': 'fruit',
'orange': 'fru'
}
#map作用于Series, apply作用于DataFrame的每行或每列, applymap作用于所有元素
print(datas['food'].map(meat)) #map用于接收映射关系 == 字典或函数
0 furit
1 veg
2 fru
3 fruit
4 NaN
#把所有值转为小写lower
low = datas['food'].str.lower() #.str-获取列中的字符串
print(low)
0 apple
1 bnana
2 orange
3 mangguo
4 apple
datas['class'] = datas['food'].map(meat)
print(datas)
food price class
0 apple 1.0 furit
1 bnana 2.0 veg
2 orange 3.5 fru
3 mangguo 2.9 fruit
4 Apple 3.5 NaN
#由于Apple 和 apple 都是苹果 但是对应字典中 是全小写的形式 故可以用函数 先转换为小写 然后再对应
print(datas['food'].map(lambda x: meat[x.lower()]))
0 furit
1 veg
2 fru
3 fruit
4 furit
~
df:
a b
0 111万 1
1 22万 2
#将数据中的字符串 提取出数字
df['a'].map(lambda x: print(eval(x[0:-1])))
111
22
~
data:
0 1
1 -999
2 2
3 77777
dtype: int64
#replace方法
print(data.replace([-999, 77777], [np.nan, 0])) #-999替换成nan 77777替换成0
print(data.replace({-999: np.nan, 77777: 0})) #也可以用字典形式替换
0 1.0
1 NaN
2 2.0
3 0.0
dtype: float64
0 1.0
1 NaN
2 2.0
3 0.0
dtype: float64
~
datas = pd.DataFrame(np.arange(12).reshape((3, 4)),
index=['bj', 'tk', 'ny'],
columns=['one', 'two', 'three', 'four'])
print(datas)
one two three four
bj 0 1 2 3
tk 4 5 6 7
ny 8 9 10 11
#重新索引
print(datas.reindex(['tk', 'ny', 'bj']))
one two three four
tk 4 5 6 7
ny 8 9 10 11
bj 0 1 2 3
#重命名 rename
print(datas.rename(index={'tk': 'a', 'ny': 'b', 'bj': 'c'}, columns={'one': '1'}))
1 two three four
c 0 1 2 3
a 4 5 6 7
b 8 9 10 11
~
#数据的离散化和再划分
#分阶段
ages = [20, 22, 25, 27, 30, 31, 35, 37, 41, 46, 51, 67]
#面元bin
bins = [18, 25, 35, 60, 100]
'''
cut 函数 两个参数 第一个是需要切割的数组, 第二个是切割区间
返回两行
第一行是每个数据所属区间 默认皆是左开右闭
第二行是切割的区间
'''
cats = pd.cut(ages, bins)
print(cats)
[(18, 25], (18, 25], (18, 25], (25, 35], (25, 35], ..., (35, 60], (35, 60], (35, 60], (35, 60], (60, 100]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
#打印每个元素所属区间的编号
print(cats.codes)
[0 0 0 1 1 1 1 2 2 2 2 3]
#打印该数据的区间
print(cats.categories)
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')
#降序打印 区间出现数据的次数
print(pd.value_counts(cats))
(25, 35] 4
(35, 60] 4
(18, 25] 3
(60, 100] 1
dtype: int64
names = ['青少年', '青年', '中年', '老年']
print(pd.cut(ages, bins, labels=names))
['青少年', '青少年', '青少年', '青年', '青年', ..., '中年', '中年', '中年', '中年', '老年']
~
data3:
[ 0.27299673 -0.76994187 -0.13676037 1.78912866 0.0323877 -0.26431203
0.47883635 0.31424239 1.16599465 -1.57052829 0.11403125 0.68940353
0.31311914 -1.86112748 -0.31328714 0.77800633 0.00404619 -0.6806554
0.57488494 1.22218326]
#均匀分割 4 份 #限定小数位数
print(pd.cut(data3, 4, precision=2))
[(-0.036, 0.88], (-0.95, -0.036], (-0.95, -0.036], (0.88, 1.79], (-0.036, 0.88], ..., (-0.036, 0.88], (-0.036, 0.88], (-0.95, -0.036], (-0.036, 0.88], (0.88, 1.79]]
Length: 20
Categories (4, interval[float64, right]): [(-1.86, -0.95] < (-0.95, -0.036] < (-0.036, 0.88] <
(0.88, 1.79]]
~
#qcut 按分位数(样本数)分割 各面元内数据个数相等
data4 = np.random.randn(100)
print(pd.qcut(data4, 4))
print(pd.value_counts(pd.qcut(data4, 4)))
[(0.394, 1.686], (0.00123, 0.394], (0.00123, 0.394], (-2.541, -0.725], (0.00123, 0.394], ..., (-0.725, 0.00123], (0.394, 1.686], (-0.725, 0.00123], (0.394, 1.686], (-2.541, -0.725]]
Length: 100
Categories (4, interval[float64, right]): [(-2.541, -0.725] < (-0.725, 0.00123] < (0.00123, 0.394] <
(0.394, 1.686]]
(-2.541, -0.725] 25
(-0.725, 0.00123] 25
(0.00123, 0.394] 25
(0.394, 1.686] 25
dtype: int64
~
#检测和过滤异常值
data5 = pd.DataFrame(np.random.randn(100, 4))
print(data5)
print('-'*50)
#寻找绝对值大于一的
print(np.abs(data5) > 1)
print(data5[(np.abs(data5) > 1).any(1)]) #any的意思!敲重点 只要每行有一个True 则返回True
print(data5.describe())
0 1 2 3
0 0.888472 0.352992 -0.137076 1.734222
1 -0.178694 -0.995964 -0.220334 -0.240809
2 -0.956168 0.056116 -1.161986 0.621710
3 -0.132241 -0.772302 1.752085 -0.198287
4 -0.929681 1.649626 -0.640536 0.218673
.. ... ... ... ...
95 2.107463 0.503272 1.088572 1.518699
96 0.391785 -0.386596 -1.014523 -0.204848
97 -1.149701 -0.007344 0.082031 -0.711486
98 1.757191 -0.072677 -0.232690 -0.061123
99 -0.478114 1.468151 -0.517214 -1.730135
[100 rows x 4 columns]
--------------------------------------------------
0 1 2 3
0 False False False True
1 False False False False
2 False False True False
3 False False True False
4 False True False False
.. ... ... ... ...
95 True False True True
96 False False True False
97 True False False False
98 True False False False
99 False True False True
[100 rows x 4 columns]
0 1 2 3
0 0.888472 0.352992 -0.137076 1.734222
2 -0.956168 0.056116 -1.161986 0.621710
3 -0.132241 -0.772302 1.752085 -0.198287
4 -0.929681 1.649626 -0.640536 0.218673
6 0.683864 2.038768 -0.202977 0.137352
.. ... ... ... ...
95 2.107463 0.503272 1.088572 1.518699
96 0.391785 -0.386596 -1.014523 -0.204848
97 -1.149701 -0.007344 0.082031 -0.711486
98 1.757191 -0.072677 -0.232690 -0.061123
99 -0.478114 1.468151 -0.517214 -1.730135
[79 rows x 4 columns]
0 1 2 3
count 100.000000 100.000000 100.000000 100.000000
mean 0.148266 0.179241 0.003147 0.039621
std 0.958056 1.055200 1.049992 0.949855
min -1.980334 -2.545607 -2.459421 -2.134215
25% -0.481164 -0.381006 -0.655402 -0.518193
50% 0.083503 0.233742 0.046225 -0.047955
75% 0.813610 0.922826 0.716252 0.685000
max 2.573760 3.501705 2.311853 2.250750
Process finished with exit code 0