# 1. Series
# Fix: pandas is imported here -- the original used `pd` before any
# `import pandas as pd` appeared in the file.
import pandas as pd

# A Series is a one-dimensional labelled array.
s1 = pd.Series([1, 2, 3, 4])
print(s1)
# 0    1
# 1    2
# 2    3
# 3    4
# dtype: int64

# Replace the default 0..n-1 index with custom labels.
s2 = pd.Series([1, 2, 3, 4], index=("a", "b", "c", "d"))
print(s2)
# a    1
# b    2
# c    3
# d    4
# dtype: int64

# Label-based access works for both reads and writes.
s2["a"] = 100
print(s2)

# Common attributes.
print(s2.index)   # the index labels
print(s2.values)  # the underlying value array
print(s2.shape)   # (4,) -> four rows
print(s2.ndim)    # 1 -- a Series is one-dimensional
print(s2.size)    # number of elements
print(s2.dtype)   # element dtype
print(s2.head())  # first rows; defaults to the first five

# The index itself can carry a name.
s2.index.name = "ok"
print(s2)
print(s2.index.name)
# ok
# a    100
# b      2
# c      3
# d      4
# dtype: int64
# ok
# 2. DataFrame
# 通过 pd.DataFrame 创建
# Fix: pandas/numpy are imported at the top of this section -- the original
# used `pd` before any `import pandas as pd` appeared in the file, and
# imported numpy in the middle of the section.
import numpy as np
import pandas as pd

# A DataFrame is a two-dimensional labelled table.
# Build from a nested list; row and column labels default to 0..n-1.
df1 = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(df1)
#    0  1  2
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9

# Custom row (index) and column labels.
df2 = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                   index=("a", "b", "c"), columns=("A", "B", "C"))
print(df2)
#    A  B  C
# a  1  2  3
# b  4  5  6
# c  7  8  9

# Build from a numpy array.
df3 = pd.DataFrame(np.random.randn(3, 3), index=("a", "b", "c"), columns=("A", "B", "C"))
print(df3)
#           A         B         C
# a  ...random values...

print("--------打印行或列的信息-----")
print(df3["A"])  # selecting one column yields a Series

# DataFrame.ix is deprecated; loc indexes by label, iloc by integer position.
print(df2)
print(df2.loc["a", "A"])   # 1
print(df2.iloc[0, 1])      # 2
print(df2.loc[:, "A"])     # column "A" as a Series
print(df2.loc["a", :])     # row "a" as a Series

# Update a single cell by label.
df2.loc["a", "A"] = 100

# `del` removes a column (only columns can be deleted this way).
del df2["A"]
print(df2)
#    B  C
# a  2  3
# b  5  6
# c  8  9

# Add a new column by assignment.
df2["D"] = [1, 2, 3]
# 3. 对齐运算 (alignment arithmetic)
# Arithmetic between two Series aligns on the index labels.
print("----------对其运算---------------")
import pandas as pd

s1 = pd.Series(range(5), index=range(5))
s2 = pd.Series(range(10), index=range(10))

# `+` operates over the union of both indexes; labels present in only
# one operand produce NaN in the result.
print(s1 + s2)
# 0    0.0
# 1    2.0
# 2    4.0
# 3    6.0
# 4    8.0
# 5    NaN
# ...
# 9    NaN

# add() with fill_value substitutes 100 for the missing side before adding,
# so labels 5..9 become 100 + s2's value.
print(s1.add(s2, fill_value=100))
# 0      0.0
# ...
# 5    105.0
# 6    106.0
# 7    107.0
# 8    108.0
# 9    109.0
# 4. pandas的函数应用 (function application / missing data)
# Missing-value handling: isnull / fillna / drop.
import numpy as np
import pandas as pd

df1 = pd.DataFrame([np.random.randn(3), [np.nan, 2, 2], [1, np.nan, 3]])
print(df1)
# Row 0 holds three random floats; rows 1 and 2 each contain one NaN.

# isnull(): element-wise boolean mask, True where the cell is missing.
print(df1.isnull())
#        0      1      2
# 0  False  False  False
# 1   True  False  False
# 2  False   True  False

# fillna(): return a copy with every NaN replaced by the given value.
print(df1.fillna(100))
#            0           1         2
# 1  100.00000    2.000000  2.000000
# 2    1.00000  100.000000  3.000000

# drop(): removes rows by label by default...
print(df1.drop([0]))
#      0    1    2
# 1  NaN  2.0  2.0
# 2  1.0  NaN  3.0

# ...and columns when the `columns=` keyword is used.
print(df1.drop(columns=1))
#           0         2
# 1       NaN  2.000000
# 2  1.000000  3.000000
# 5. 层次索引 (hierarchical / MultiIndex indexing)
# Hierarchical (MultiIndex) indexing on a Series.
import numpy as np
import pandas as pd

s1 = pd.Series(np.random.randn(12),
               index=[["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d", "d"],
                      ["1", "2", "3", "1", "2", "3", "1", "2", "3", "1", "2", "3"]])
print(s1)
# a  1    ...
#    2    ...
#    3    ...
# b  1    ...
# ...
# d  3    ...

# Address one element by its full (outer, inner) label pair.
print(s1.loc[("a", "2")])

# BUG FIX: the original wrote `s1["a"][1] = 100` -- chained assignment
# through a temporary copy (SettingWithCopy, unreliable), and an integer
# key on a string-labelled inner index (deprecated positional fallback).
# Position 1 inside group "a" is inner label "2" (confirmed by the recorded
# output below), so assign through .loc with the full label instead.
s1.loc[("a", "2")] = 100
print(s1.loc[("a", "2")])
# 100.0

# swaplevel(): exchange the two index levels (inner becomes outer).
print(s1.swaplevel())
# 1  a    ...
# 2  a    100.000000
# 3  a    ...
# 1  b    ...
# ...
# dtype: float64

# sort_index(): reorder by the (now outer) level.
print(s1.swaplevel().sort_index())
# 1  a    ...
#    b    ...
#    c    ...
#    d    ...
# 2  a    100.000000
#    b    ...
# ...
# dtype: float64
# 6. pandas统计计算 (statistical computation)
# Statistical summaries on a DataFrame of random values.
import numpy as np
import pandas as pd

s1 = pd.DataFrame(np.random.randn(5, 4),
                  index=("A", "B", "C", "D", "E"),
                  columns=("a", "b", "c", "d"))
print(s1)

# Reduction functions: sum / mean / max / min.
#   axis=0 -> collapse rows, one result per column
#   axis=1 -> collapse columns, one result per row
#   skipna (default True) ignores missing values
print(s1.min(axis=0))
# a   ...
# b   ...
# c   ...
# d   ...
# dtype: float64  (one minimum per column)
print(s1.min(axis=1))
# A   ...
# B   ...
# C   ...
# D   ...
# E   ...
# dtype: float64  (one minimum per row)

# describe(): per-column summary --
#   count  number of samples
#   mean   average
#   std    standard deviation
#   min    smallest value
#   25%    first quartile
#   50%    median
#   75%    third quartile
#   max    largest value
print(s1.describe())

# query(): filter rows with a boolean expression over column names
# (columns only; index labels are not expression variables).
print(s1)
print(s1.query("a>b"))
# Only the rows whose value in column "a" exceeds column "b" remain.
# 8. pandas读取文件 (reading files)
import pandas as pd
# NOTE(review): assumes ./SklearnTest.txt is a comma-separated file with
# columns height, house, car, handsome, job, is_date in that order --
# inferred from the column accesses and sample output below; confirm
# against the actual file.
file=pd.read_csv("./SklearnTest.txt")
print(file)
# Select a single column (two equivalent spellings).
print(file["height"])
print(file.height)
# Select a range of columns: loc slices by label (end-inclusive),
# iloc slices by position (end-exclusive) -- both yield the same two columns here.
print(file.loc[:,"height":"house"])
print(file.iloc[:,0:2])
#    height  house
# 0    1.80      1
# 1    1.62      1
# 2    1.71      0
# 3    1.58      1
# 4    1.68      0
# 5    1.63      1
# 6    1.78      0
# 7    1.64      0
# 8    1.65      0
# Select sample rows 0-5.
# NOTE(review): iloc[0:6] is end-exclusive (rows 0..5, six rows) while
# loc[0:6] slices labels inclusively (rows 0..6, seven rows) -- the two
# prints differ by one row.
print(file.iloc[0:6,:])
print(file.loc[0:6,:])
#    height  house  car  handsome  job  is_date
# 0    1.80      1    0       6.5    2        1
# 1    1.62      1    0       5.5    0        1
# 2    1.71      0    1       8.5    1        1
# 3    1.58      1    1       6.3    1        1
# 4    1.68      0    1       5.1    0        0
# 5    1.63      1    0       5.3    1        0
# 6    1.78      0    0       4.5    0        0
# Select the feature columns and the class-label column.
X1=file.loc[:,"height":"job"]
print(X1)
print(type(X1))
#    height  house  car  handsome  job
# # 0    1.80      1    0       6.5    2
# # 1    1.62      1    0       5.5    0
# # 2    1.71      0    1       8.5    1
# # 3    1.58      1    1       6.3    1
# # 4    1.68      0    1       5.1    0
# # 5    1.63      1    0       5.3    1
# # 6    1.78      0    0       4.5    0
# # 7    1.64      0    0       7.8    2
# # 8    1.65      0    1       6.6    0
# # <class 'pandas.core.frame.DataFrame'>
Y1 = file.loc[:,"is_date"]
print(Y1)
print(type(Y1))
# 0    1
# 1    1
# 2    1
# 3    1
# 4    0
# 5    0
# 6    0
# 7    0
# 8   -1
# Name: is_date, dtype: int64
# <class 'pandas.core.series.Series'>
# Partition on is_date: -1 marks the unlabeled row(s).
new_date = file.query("is_date!=-1")
date = file.query("is_date==-1")
print(new_date)
#    height  house  car  handsome  job  is_date
# 0    1.80      1    0       6.5    2        1
# 1    1.62      1    0       5.5    0        1
# 2    1.71      0    1       8.5    1        1
# 3    1.58      1    1       6.3    1        1
# 4    1.68      0    1       5.1    0        0
# 5    1.63      1    0       5.3    1        0
# 6    1.78      0    0       4.5    0        0
# 7    1.64      0    0       7.8    2        0
print(date)
#    height  house  car  handsome  job  is_date
# 8    1.65      0    1       6.6    0       -1
# Split the labelled data into feature matrix X and target vector Y.
X = new_date.iloc[:,0:5]
Y = new_date.iloc[:,5]
print("------------------------")
print(X)
print(type(X))
#    height  house  car  handsome  job
# 0    1.80      1    0       6.5    2
# 1    1.62      1    0       5.5    0
# 2    1.71      0    1       8.5    1
# 3    1.58      1    1       6.3    1
# 4    1.68      0    1       5.1    0
# 5    1.63      1    0       5.3    1
# 6    1.78      0    0       4.5    0
# 7    1.64      0    0       7.8    2
# <class 'pandas.core.frame.DataFrame'>
print("-----------------------")
print(Y)
print(type(Y))
# 0    1
# 1    1
# 2    1
# 3    1
# 4    0
# 5    0
# 6    0
# 7    0
# Name: is_date, dtype: int64
# <class 'pandas.core.series.Series'>
# 9. pandas函数补充 (additional pandas functions)
# 9.1 concat
# pandas.concat() 通常用来连接DataFrame对象。默认情况下是对两个DataFrame对象进行纵向连接, 当然通过设置参数, 也可以通过它实现DataFrame对象的横向连接(axis=1)
# pd.concat(): stack pandas objects along an axis.
import pandas as pd

s1 = pd.Series(['a', 'b'])
s2 = pd.Series(['c', 'd'])
print(s1)
# 0    a
# 1    b
print(s2)
# 0    c
# 1    d

# axis=0 (the default) stacks vertically; each input keeps its own index
# labels, so the result repeats 0, 1.
print(pd.concat([s1, s2], axis=0))
# 0    a
# 1    b
# 0    c
# 1    d

# axis=1 places the inputs side by side as columns.
print(pd.concat([s1, s2], axis=1))
#    0  1
# 0  a  c
# 1  b  d

# ignore_index=True renumbers the concatenated axis 0..n-1 instead of
# keeping the original labels.
df1 = pd.DataFrame([['a', 1], ['b', 2]], columns=['letter', 'number'])
print(df1)
#   letter  number
# 0      a       1
# 1      b       2
df2 = pd.DataFrame([['c', 3], ['d', 4]], columns=['letter', 'number'])
print(df2)
#   letter  number
# 0      c       3
# 1      d       4
print(pd.concat([df1, df2], ignore_index=True))
#   letter  number
# 0      a       1
# 1      b       2
# 2      c       3
# 3      d       4
print(pd.concat([df1, df2], ignore_index=True, axis=1))
#    0  1  2  3
# 0  a  1  c  3
# 1  b  2  d  4
# 9.2 groupby函数
# groupby: split-apply-combine on a DataFrame.
import numpy as np
import pandas as pd

s1 = pd.DataFrame([[1, 2, 3], [1, 3, 3], [4, 5, 6], [4, 3, 6]],
                  columns=["one", "two", "three"])
print(s1)
#    one  two  three
# 0    1    2      3
# 1    1    3      3
# 2    4    5      6
# 3    4    3      6
print("--------------------")

# groupby alone is lazy: it yields a GroupBy object, not data.
print(s1.groupby(by=["one"])["two"])
# <pandas.core.groupby.generic.SeriesGroupBy object at 0x...>
print("0--------------------")

# Group on "one", select column "two", then aggregate with the mean.
print(s1.groupby(by=["one"])["two"].mean())
# one
# 1    2.5   <- (2 + 3) / 2
# 4    4.0

# Equivalent spelling: select the column first, group by the key Series.
print(s1["two"].groupby(by=s1["one"]).mean())
# one
# 1    2.5
# 4    4.0
# Name: two, dtype: float64

# Aggregate every remaining column per group.
print(s1.groupby(["one"]).mean())
#      two  three
# one
# 1    2.5    3.0
# 4    4.0    6.0

# transform broadcasts each group's result back to the original row shape.
print(s1.groupby(["one"]).transform(lambda col: np.mean(col)))
#    two  three
# 0  2.5    3.0
# 1  2.5    3.0
# 2  4.0    6.0
# 3  4.0    6.0

# Per-group z-score; "three" is constant within each group, so its
# within-group std is 0 and the division yields NaN.
print(s1.groupby(["one"]).transform(lambda col: (col - np.mean(col)) / (np.std(col))))
#    two  three
# 0 -1.0    NaN
# 1  1.0    NaN
# 2  1.0    NaN
# 3 -1.0    NaN

# apply runs an arbitrary function on each group's sub-frame.
print(s1.groupby(["one"]).apply(lambda grp: np.mean(grp["two"])))
# one
# 1    2.5
# 4    4.0
# dtype: float64
# TODO: 检查缺失值 (check for missing values)