python 基础 —— pandas

(以下示例均假定已执行:import numpy as np、import pandas as pd、import matplotlib.pyplot as plt)

1. pandas 基本介绍

# A Series is a one-dimensional labelled array; np.nan marks a missing entry.
s = pd.Series([1, 3, 6, np.nan, 44, 1])
print(s)

# Six consecutive days starting 2016-01-01, used as the row index below.
dates = pd.date_range('20160101', periods=6)
print(dates)

# 6x4 frame of standard-normal samples with explicit row and column labels.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=list('abcd'),
)
print(df)

# With no labels given, rows and columns are simply numbered from 0.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(df1)

# Define a frame from a dict: every key becomes a column, and scalar values
# are repeated to the length set by the array-like values (4 rows here).
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train"] * 2),
    'F': 'foo',
})
print(df2)
# dtype of every column
print(df2.dtypes)

# Row labels, column labels and the raw values, one object per line.
print(df2.index, df2.columns, df2.values, sep='\n')
# Summary statistics; non-numeric columns such as 'E' and 'F' are left out.
print(df2.describe())
# Transpose: rows become columns and vice versa.
print(df2.T)
# Sort by column labels, descending.
print(df.sort_index(axis='columns', ascending=False))
# Sort by row labels, descending.
print(df.sort_index(axis='index', ascending=False))
# Sort the rows by the values in column 'E'.
print(df2.sort_values(by='E'))

2. pandas 选择数据

# Demonstrates the main ways of selecting data from a DataFrame.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
# Select a single column, by key or as an attribute.
print(df['B'])
print(df.B)
# Slice rows, by position or by index label.
print(df[0:3])
print(df['20130102':'20130104'])
print("================")
# select by label: loc
print(df.loc['20130102'])
print(df.loc['20130102', ['A', 'B']])
# select by position: iloc
print(df.iloc[2, 3])
print(df.iloc[3:5, 1:3])
print(df.iloc[[1, 3, 5], 1:3])

# mixed selection: rows by position, columns by label.
# DataFrame.ix was removed in pandas 1.0, so the first three positions are
# translated to their index labels and the lookup is done with loc.
first_rows_ac = df.loc[df.index[:3], ['A', 'C']]
print(first_rows_ac)

# boolean indexing: keep the rows where column A is greater than 8
print(df[df.A > 8])

3. pandas设置值

# Demonstrates assigning values into a DataFrame.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
# Assign one cell by position (row 2, column 2).
df.iloc[2, 2] = 111
print(df)
# Assign one cell by label.
df.loc['20130101', 'B'] = 222
print(df)
# Conditional assignment. Use a single .loc call (row mask, column label):
# the chained form `df.A[mask] = value` assigns into an intermediate object,
# which triggers chained-assignment warnings and does not update `df` when
# pandas Copy-on-Write is active.
df.loc[df.A > 4, 'A'] = 999
print(df)
# The mask may come from a different column than the one being written.
df.loc[df.C > 10, 'B'] = 1000
print(df)
# Add a new column filled with NaN.
df['F'] = np.nan
print(df)
# Add a new column from a Series; values are aligned on the index labels.
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130101', periods=6))
print(df)

4. pandas处理丢失数据

# Demonstrates detecting, dropping and filling missing values.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
# NaN is a float value. Cast the two columns that will receive it up front
# rather than relying on pandas silently upcasting an integer column on
# assignment (deprecated behaviour in recent pandas releases).
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)
print(df.dropna(axis=0, how='any'))  # drop rows containing NaN; how = {'any', 'all'}
print(df.dropna(axis=1, how='any'))  # drop columns containing NaN
print(df.fillna(value=0))  # replace NaN with 0
print(df.isna())  # element-wise missing-value mask (df.isnull() is an alias)
# Reduce the mask to one plain bool: True when any value is missing.
has_missing = bool(df.isna().to_numpy().any())
print(has_missing)

5. pandas导入导出

# Load a CSV file into a DataFrame, show it, then save the frame as a pickle.
csv_path = 'C:/Users/lenovo/Desktop/student.csv'
pickle_path = 'C:/Users/lenovo/Desktop/student.pickle'
data = pd.read_csv(csv_path)
print(data)
data.to_pickle(pickle_path)

6. pandas 合并concat

# Concatenation: three 3x4 frames filled with 0.0, 1.0 and 2.0 that share
# the same column labels.
_columns = ['a', 'b', 'c', 'd']
df1 = pd.DataFrame(np.zeros((3, 4)), columns=_columns)
df2 = pd.DataFrame(np.ones((3, 4)), columns=_columns)
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=_columns)
print(df1, df2, df3, sep='\n')

_frames = [df1, df2, df3]
# Stack vertically; ignore_index renumbers the rows from 0.
res = pd.concat(_frames, axis='index', ignore_index=True)
# Place side by side; ignore_index renumbers the columns from 0.
res1 = pd.concat(_frames, axis='columns', ignore_index=True)
print(res)
print(res1)

# join: 'inner' keeps only the labels shared by all inputs, 'outer' keeps
# every label and fills the gaps with NaN.
df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
print(df1)
print(df2)

res = pd.concat([df1, df2], join='inner', ignore_index=True)  # shared columns only
print(res)
res = pd.concat([df1, df2], join='outer', ignore_index=True)  # NaN where data is absent
print(res)

# Align the side-by-side result on a chosen index.
# The join_axes argument of pd.concat was removed in pandas 1.0; reindexing
# the outer-joined result on df2.index gives the same rows.
res1 = pd.concat([df1, df2], axis=1)
res = res1.reindex(df2.index)
print(res)
print(res1)
print("==============")
# Appending rows. DataFrame.append was removed in pandas 2.0; pd.concat is
# the replacement. A Series is first turned into a one-row frame
# (to_frame().T) so that its labels line up with the column labels.
df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
s1 = pd.Series([2, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(s1)
print(res)

df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
# Append a whole frame. NOTE: this result is overwritten two lines below
# before it is printed, exactly as in the original snippet.
res = pd.concat([df1, df2], ignore_index=True)
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res)

7. pandas 合并 merge

# Merge two frames on a key column (comparable to a database join).
# Simple example with a single key.
left = pd.DataFrame(dict(
    key=['K0', 'K1', 'K2', 'K3'],
    A=['A0', 'A1', 'A2', 'A3'],
    B=['B0', 'B1', 'B2', 'B3'],
))
right = pd.DataFrame(dict(
    key=['K1', 'K1', 'K2', 'K3'],
    C=['C0', 'C1', 'C2', 'C3'],
    D=['D0', 'D1', 'D2', 'D3'],
))
print(left, right, sep='\n')

# Rows are paired wherever the 'key' values match; a key that occurs twice
# on one side produces two output rows.
res = left.merge(right, on='key')
print(res)

# Merging on two key columns at once.
left = pd.DataFrame(dict(
    key1=['K0', 'K0', 'K1', 'K2'],
    key2=['K0', 'K1', 'K0', 'K1'],
    A=['A0', 'A1', 'A2', 'A3'],
    B=['B0', 'B1', 'B2', 'B3'],
))
right = pd.DataFrame(dict(
    key1=['K0', 'K1', 'K1', 'K2'],
    key2=['K0', 'K0', 'K0', 'K0'],
    C=['C0', 'C1', 'C2', 'C3'],
    D=['D0', 'D1', 'D2', 'D3'],
))
print(left, right, sep='\n')

# how may be 'left', 'right', 'outer' or 'inner'.
# 'right' keeps every row of `right`; columns coming from `left` are NaN
# where no (key1, key2) pair matches.
res = left.merge(right, how='right', on=['key1', 'key2'])
print(res)

# indicator: add a column recording which input each output row came from.
df1 = pd.DataFrame(dict(col1=[0, 1], col_left=['a', 'b']))
df2 = pd.DataFrame(dict(col1=[1, 2, 2], col_right=[2, 2, 2]))
print(df1, df2, sep='\n')

# indicator=True uses the default indicator column name.
res = df1.merge(df2, how='outer', on='col1', indicator=True)
print(res)
# Passing a string instead gives the indicator column a custom name.
res = df1.merge(df2, how='outer', on='col1', indicator='indicator_column')
print(res)

# Merging on the row index instead of on a key column.
left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)
print(left, right, sep='\n')

# left_index / right_index make both row indexes act as the join key.
# 'outer' keeps every index label found on either side.
res = left.merge(right, how='outer', left_index=True, right_index=True)
print(res)
# 'inner' keeps only the index labels present on both sides.
res = left.merge(right, how='inner', left_index=True, right_index=True)
print(res)

# Overlapping column names: both frames have an 'age' column, so suffixes
# are used to tell the two apart in the merged result.
boys = pd.DataFrame(dict(k=['K0', 'K1', 'K2'], age=[1, 2, 3]))
girls = pd.DataFrame(dict(k=['K0', 'K0', 'K3'], age=[4, 5, 6]))
res = boys.merge(girls, how='outer', on='k', suffixes=('_boy', '_girl'))
print(boys, girls, res, sep='\n')

# DataFrame.join works much like merge.

8. pandas plot 画图

# Plotting data.
# Series: running total of 1000 random samples, drawn as a line.
data = pd.Series(np.random.randn(1000), index=np.arange(1000)).cumsum()
data.plot()
plt.show()

# DataFrame: four columns of random samples; show the first three rows,
# then plot the running total of every column.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=['A', 'B', 'C', 'D'],
)
print(data.head(3))
data = data.cumsum()
data.plot()
plt.show()

# Available plot methods:
# 'bar' 'hist' 'box' 'kde' 'area' 'scatter' 'hexbin' 'pie'
# Two scatter plots share one set of axes: the first call creates the axes,
# the second draws onto them through ax=.
ax = data.plot(kind='scatter', x='A', y='B', color='DarkBlue', label='Class 1')
data.plot(kind='scatter', x='A', y='C', color='DarkGreen', label='Class2', ax=ax)
plt.show()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值