label = '7_45_same_pay_label' # 将X和Y拆分开
X = data.loc[:, data.columns != label]
y = data.loc[:, data.columns == label]
mean(axis=0)计算的是每一列平均值,
mean(axis=1)计算的是每一行平均值。
特:0跨行 1跨列
drop(0,axis=0)删除行,
drop(['col1'],axis=1)删除列。
-
p=i.strip() #默认删除前后端 ‘\n\t\r’
-
print(p.split(","))
-
l=i.split(",") #按照指定字符分割,以列表方式返回
zero_col_count = dict(df[0].value_counts())#统计第0列元素的值的个数
three_row_count = dict(df.loc[3].value_counts())#统计第3行元素的值的个数
def fun(a65, b73):
    """Choose between two candidate values using a threshold on the first.

    Both arguments are converted with ``float()`` before the comparison, so
    numbers and numeric strings are accepted.

    Args:
        a65: First candidate; returned (as a float) when it is >= 10000.
        b73: Fallback candidate; returned (as a float) otherwise.

    Returns:
        ``float(a65)`` if it is at least 10000, else ``float(b73)``.

    Raises:
        ValueError: If a string argument is not a valid float literal.
        TypeError: If an argument is of a type ``float()`` cannot convert.
    """
    # Convert both up front so a bad fallback value is reported even when
    # the first candidate would have been selected.
    a65 = float(a65)
    b73 = float(b73)
    return a65 if a65 >= 10000 else b73
sub_65['finpre']=sub_65.apply(lambda x:fun(x.prediction_pay_price,x.xgb),axis=1)
userFeature_data=[{'uid': '26325489', 'age': '4', 'gender': '2' }, {'uid': '1184123', 'age': '2', 'gender': '1'}]
user_feature = pd.DataFrame(userFeature_data) #可以这样传入数据
user_feature.to_csv(data_path + 'userFeature.csv', index=False) #去掉index
'''
age gender uid
0 4 2 26325489
1 2 1 1184123
'''
位置选择筛选:train.loc[train['label']==-1,'label']=0 #把label列中值为-1的转化为0
两个数据结构:Series和DataFrame
1、Series
s = pd.Series([1,3,6,np.nan,44,1])
"""
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
"""
2、DataFrame
dates = pd.date_range('20160101',periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
"""
a b c d
2016-01-01 -0.253065 -2.071051 -0.640515 0.613663
2016-01-02 -1.147178 1.532470 0.989255 -0.499761
2016-01-03 1.221656 -2.390171 1.862914 0.778070
2016-01-04 1.473877 -0.046419 0.610046 0.204672
2016-01-05 -1.584752 -0.700592 1.487264 -1.778293
2016-01-06 0.633675 -1.414157 -0.277066 -0.442545
"""
读取数据
- pd.read_csv(filename):从CSV文件读入数据
- pd.read_excel(filename):从Excel文件读入数据
- label=pd.read_csv("./datas/baoxianfanqizha_trainingset/train_id.csv",delimiter="\t")
col_names = ["ID","K1K2驱动信号","电子锁驱动信号","急停信号","门禁信号","THDV-M","THDI-M","label"]
data = pd.read_csv("data_train.csv",names=col_names)
datas=pd.read_csv("./datas/data_test.csv",encoding='gbk',delimiter="\t",names=list)
写入
- df.to_csv(filename):写入CSV文件
- me.to_csv('./datas/baoxianfanqizha_trainingset/train_id.csv',index=False)
信息
- df.head(n):查看前n行
- df.tail(n):查看最后n行
- df.shape:查看形状(行数和列数);shape 是属性,不是方法,不需要加括号
- df.info():查看索引、数据类型和内存信息
- df.describe():查看数值型列的汇总统计
定位
- df[col]:根据列名,并以Series的形式返回列
- df[[col1, col2]]:以DataFrame形式返回多列
- s.iloc[0]:按位置选取数据
- s.loc['index_one']:按索引选取数据
- df.iloc[0,:]:返回第一行
- df.iloc[0,0]:返回第一列的第一个元素
-
print(df['b']) """ 2016-01-01 -2.071051 2016-01-02 1.532470 2016-01-03 -2.390171 2016-01-04 -0.046419 2016-01-05 -0.700592 2016-01-06 -1.414157 Freq: D, Name: b, dtype: float64 """
合并
1、merge:
合并时 how 参数有 4 种取值:how = ['left', 'right', 'outer', 'inner'],默认值为 how='inner'。
import pandas as pd
# Define the two datasets and print them.
# Both frames share the key columns `key1` and `key2`; `left` carries A/B and
# `right` carries C/D.
# NOTE(review): the sample outputs below list the columns alphabetically
# (A, B, key1, key2), which looks like output from an older pandas; newer
# versions presumably keep the dict's insertion order -- confirm locally.
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
# A B key1 key2
# 0 A0 B0 K0 K0
# 1 A1 B1 K0 K1
# 2 A2 B2 K1 K0
# 3 A3 B3 K2 K1
print(right)
# C D key1 key2
# 0 C0 D0 K0 K0
# 1 C1 D1 K1 K0
# 2 C2 D2 K1 K0
# 3 C3 D3 K2 K0
# Merge on the (key1, key2) column pair and print the result for each of the
# four `how` modes: ['left', 'right', 'outer', 'inner'].
# inner: keep only key pairs present in both frames.
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print(res)
# A B key1 key2 C D
# 0 A0 B0 K0 K0 C0 D0
# 1 A2 B2 K1 K0 C1 D1
# 2 A2 B2 K1 K0 C2 D2
# outer: keep key pairs from either frame; missing side is filled with NaN.
res = pd.merge(left, right, on=['key1', 'key2'], how='outer')
print(res)
# A B key1 key2 C D
# 0 A0 B0 K0 K0 C0 D0
# 1 A1 B1 K0 K1 NaN NaN
# 2 A2 B2 K1 K0 C1 D1
# 3 A2 B2 K1 K0 C2 D2
# 4 A3 B3 K2 K1 NaN NaN
# 5 NaN NaN K2 K0 C3 D3
# left: keep every row of `left`; unmatched rows get NaN for C/D.
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(res)
# A B key1 key2 C D
# 0 A0 B0 K0 K0 C0 D0
# 1 A1 B1 K0 K1 NaN NaN
# 2 A2 B2 K1 K0 C1 D1
# 3 A2 B2 K1 K0 C2 D2
# 4 A3 B3 K2 K1 NaN NaN
# right: keep every row of `right`; unmatched rows get NaN for A/B.
res = pd.merge(left, right, on=['key1', 'key2'], how='right')
print(res)
# A B key1 key2 C D
# 0 A0 B0 K0 K0 C0 D0
# 1 A2 B2 K1 K0 C1 D1
# 2 A2 B2 K1 K0 C2 D2
# 3 NaN NaN K2 K0 C3 D3
2、concat:
# Define the datasets: df1 has columns a-d, df2 has columns b-e, and their
# row labels only partly overlap.
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
# Stack the rows (axis=0); join='inner' keeps only the columns both frames
# share (b, c, d).
res = pd.concat([df1, df2], axis=0, join='inner')
# Print the result; the original row labels are kept, so 2 and 3 repeat.
print(res)
# b c d
# 1 0.0 0.0 0.0
# 2 0.0 0.0 0.0
# 3 0.0 0.0 0.0
# 2 1.0 1.0 1.0
# 3 1.0 1.0 1.0
# 4 1.0 1.0 1.0
# Reset the index (ignore_index=True renumbers the rows 0..n-1) and print.
res = pd.concat([df1, df2], axis=0, join='inner', ignore_index=True)
print(res)
# b c d
# 0 0.0 0.0 0.0
# 1 0.0 0.0 0.0
# 2 0.0 0.0 0.0
# 3 1.0 1.0 1.0
# 4 1.0 1.0 1.0
# 5 1.0 1.0 1.0
join_axes (依照 axes 合并;注意:join_axes 参数已在 pandas 1.0 中移除,新版本请改用 reindex 实现同样的效果)
import pandas as pd
import numpy as np
# Define the datasets: df1 has columns a-d, df2 has columns b-e, and their
# row labels only partly overlap.
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
# Concatenate side by side (axis=1), keeping only the rows of `df1.index`.
# The old `join_axes=[df1.index]` keyword was removed in pandas 1.0 and now
# raises TypeError; aligning df2 to df1's index first gives the same result.
res = pd.concat([df1, df2.reindex(df1.index)], axis=1)
# Print the result.
print(res)
# a b c d b c d e
# 1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
# 2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
# 3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
# Without restricting to df1.index: a plain outer join on the row labels,
# so row 4 (only present in df2) appears as well.
res = pd.concat([df1, df2], axis=1)
print(res)
# a b c d b c d e
# 1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
# 2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
# 3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
# 4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
描述和汇总统计
方法 说明
count 非NA值的数量
describe 针对Series或各DataFrame列计算汇总统计
min,max 计算最小值和最大值
argmin,argmax 计算能够获取到最小值和最大值的索引位置(整数)
idxmin,idxmax 计算能够获取到最小值和最大值的索引值
quantile 计算样本的分位数(0到 1)
sum 值的总和
mean 值的平均数, a.mean() 默认对每一列的数据求平均值;若加上参数a.mean(1)则对每一行求平均值
median 值的算术中位数(50%分位数)
mad 根据平均值计算平均绝对离差
var 样本值的方差
std 样本值的标准差
skew 样本值的偏度(三阶矩)
kurt 样本值的峰度(四阶矩)
cumsum 样本值的累计和
cummin,cummax 样本值的累计最小值和累计最大值
cumprod 样本值的累计积
diff 计算一阶差分(对时间序列很有用)
pct_change 计算百分数变化
注:参考自---莫烦Python
待续。。