Python-Pandas

akulululu

已于 2022-03-19 11:12:50 修改

阅读量901

点赞数 1

文章标签： python 数据挖掘 数据分析

于 2022-03-17 21:59:44 首次发布

本文链接：https://blog.csdn.net/weixin_52260178/article/details/123562039

版权

Python Pandas

1.Pandas基本介绍

import pandas as pd
import numpy as np


# np.nan marks a missing (empty) value
s = pd.Series([1, 3, 5, np.nan, 44, 1])

print(s)
# 0     1.0
# 1     3.0
# 2     5.0
# 3     NaN
# 4    44.0
# 5     1.0
# dtype: float64

# Six consecutive daily timestamps starting at 2022-03-01
dates = pd.date_range("20220301", periods=6)

print(dates)
# DatetimeIndex(['2022-03-01', '2022-03-02', '2022-03-03', '2022-03-04',
#                '2022-03-05', '2022-03-06'],
#               dtype='datetime64[ns]', freq='D')


# index: the row labels
# columns: the column labels
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=["a", "b", "c", "d"])

# The values are random, so the numbers below are only one sample run
print(df)
#                    a         b         c         d
# 2022-03-01  0.194540 -0.553569 -0.966694 -0.759754
# 2022-03-02  0.381299 -0.916151 -1.332240  0.424302
# 2022-03-03 -2.277082  0.756127 -1.934086  0.772821
# 2022-03-04 -2.624071 -0.925296 -0.457871 -0.163357
# 2022-03-05  0.780169  0.883999  0.773308  2.210360
# 2022-03-06  0.451705 -2.097732 -0.286711  0.216228


# When no row/column labels are given they default to 0, 1, 2, 3, ...
df_1 = pd.DataFrame(np.arange(12).reshape(3, 4))

print(df_1)
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

# Build a frame from a dict: each key becomes a column; the scalar values
# ('A', 'B', 'F') are repeated on every row
df_2 = pd.DataFrame({'A': 1., 'B': pd.Timestamp('20220311'), 'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                     'D': np.array([3] * 4, dtype='int32'), 'E': pd.Categorical(["test", "train", "test", "train"]), 'F': 'foo'})

print(df_2)
#      A          B    C  D      E    F
# 0  1.0 2022-03-11  1.0  3   test  foo
# 1  1.0 2022-03-11  1.0  3  train  foo
# 2  1.0 2022-03-11  1.0  3   test  foo
# 3  1.0 2022-03-11  1.0  3  train  foo


# describe() summarises the numeric columns; the date and string columns are left out
print(df_2.describe())
#          A    C    D
# count  4.0  4.0  4.0
# mean   1.0  1.0  3.0
# std    0.0  0.0  0.0
# min    1.0  1.0  3.0
# 25%    1.0  1.0  3.0
# 50%    1.0  1.0  3.0
# 75%    1.0  1.0  3.0
# max    1.0  1.0  3.0

# .T is the transpose: rows become columns and columns become rows
print(df_2.T)
#                      0  ...                    3
# A                  1.0  ...                  1.0
# B  2022-03-11 00:00:00  ...  2022-03-11 00:00:00
# C                  1.0  ...                  1.0
# D                    3  ...                    3
# E                 test  ...                train
# F                  foo  ...                  foo
#
# [6 rows x 4 columns]

# sort_index(axis, ascending): sort by the index labels
# axis=1 sorts the column labels, axis=0 sorts the row labels
# ascending=True sorts in ascending order, False in descending order
print(df_2.sort_index(axis=1, ascending=False))
#      F      E  D    C          B    A
# 0  foo   test  3  1.0 2022-03-11  1.0
# 1  foo  train  3  1.0 2022-03-11  1.0
# 2  foo   test  3  1.0 2022-03-11  1.0
# 3  foo  train  3  1.0 2022-03-11  1.0

print(df_2.sort_index(axis=0, ascending=False))
#      A          B    C  D      E    F
# 3  1.0 2022-03-11  1.0  3  train  foo
# 2  1.0 2022-03-11  1.0  3   test  foo
# 1  1.0 2022-03-11  1.0  3  train  foo
# 0  1.0 2022-03-11  1.0  3   test  foo

# sort_values (here by column E): sort by the values
print(df_2.sort_values(by='E'))
#      A          B    C  D      E    F
# 0  1.0 2022-03-11  1.0  3   test  foo
# 2  1.0 2022-03-11  1.0  3   test  foo
# 1  1.0 2022-03-11  1.0  3  train  foo
# 3  1.0 2022-03-11  1.0  3  train  foo

2.Pandas选择数据

import pandas as pd
import numpy as np

# A 6x4 frame holding 0..23, indexed by six consecutive dates
dates = pd.date_range('20220301', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

print(df)
#              A   B   C   D
# 2022-03-01   0   1   2   3
# 2022-03-02   4   5   6   7
# 2022-03-03   8   9  10  11
# 2022-03-04  12  13  14  15
# 2022-03-05  16  17  18  19
# 2022-03-06  20  21  22  23

# Select one column by name with [] ...
print(df['A'])
# 2022-03-01     0
# 2022-03-02     4
# 2022-03-03     8
# 2022-03-04    12
# 2022-03-05    16
# 2022-03-06    20
# Freq: D, Name: A, dtype: int32

# ... or by attribute access; both give the same Series
print(df.A)
# 2022-03-01     0
# 2022-03-02     4
# 2022-03-03     8
# 2022-03-04    12
# 2022-03-05    16
# 2022-03-06    20
# Freq: D, Name: A, dtype: int32

# Slicing, form 1: row slice by position (the end position is excluded)
print(df[0: 3])
#             A  B   C   D
# 2022-03-01  0  1   2   3
# 2022-03-02  4  5   6   7
# 2022-03-03  8  9  10  11

# Slicing, form 2: row slice by date label (both endpoints are included)
print(df['20220302': '20220304'])
#              A   B   C   D
# 2022-03-02   4   5   6   7
# 2022-03-03   8   9  10  11
# 2022-03-04  12  13  14  15


# Select by label with .loc
# Both row labels and column labels can be selected
print(df.loc['20220301'])
# A    0
# B    1
# C    2
# D    3
# Name: 2022-03-01 00:00:00, dtype: int32

# Select all rows, and the two columns A and B
print(df.loc[:, ['A', 'B']])
#              A   B
# 2022-03-01   0   1
# 2022-03-02   4   5
# 2022-03-03   8   9
# 2022-03-04  12  13
# 2022-03-05  16  17
# 2022-03-06  20  21

# Select the rows with a label slice
print(df.loc['20220302':, ['A', 'B']])
#              A   B
# 2022-03-02   4   5
# 2022-03-03   8   9
# 2022-03-04  12  13
# 2022-03-05  16  17
# 2022-03-06  20  21


# Select one specific row and a subset of the column labels
print(df.loc['20220302', ['A', 'B']])
# A    4
# B    5
# Name: 2022-03-02 00:00:00, dtype: int32

# Select by integer position (row position, column position) with .iloc
# Select the row at position 3, i.e. the fourth row (positions start at 0)
print(df.iloc[3])
# A    12
# B    13
# C    14
# D    15
# Name: 2022-03-04 00:00:00, dtype: int32

# Select the single value at row position 3, column position 1
print(df.iloc[3, 1])
# 13

# Slice row positions 3-4 and column positions 1-2 (slice ends are excluded)
print(df.iloc[3: 5, 1: 3])
#              B   C
# 2022-03-04  13  14
# 2022-03-05  17  18

# Rows can also be picked non-contiguously with a list of positions
print(df.iloc[[1, 3, 5], 1: 3])
#              B   C
# 2022-03-02   5   6
# 2022-03-04  13  14
# 2022-03-06  21  22

# Boolean mask: keep only the rows whose A value is greater than 8
print(df[df.A > 8])
#              A   B   C   D
# 2022-03-04  12  13  14  15
# 2022-03-05  16  17  18  19
# 2022-03-06  20  21  22  23

3.pandas设置值

import pandas as pd
import numpy as np

# A 6x4 frame holding 0..23, indexed by six consecutive dates
dates = pd.date_range('20220301', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

print(df)
#              A   B   C   D
# 2022-03-01   0   1   2   3
# 2022-03-02   4   5   6   7
# 2022-03-03   8   9  10  11
# 2022-03-04  12  13  14  15
# 2022-03-05  16  17  18  19
# 2022-03-06  20  21  22  23

# Change one value by integer position: row position 2, column position 2
# (the third row and third column, since positions start at 0)
df.iloc[2, 2] = 1111
print(df)
#              A   B     C   D
# 2022-03-01   0   1     2   3
# 2022-03-02   4   5     6   7
# 2022-03-03   8   9  1111  11
# 2022-03-04  12  13    14  15
# 2022-03-05  16  17    18  19
# 2022-03-06  20  21    22  23


# Change one value by label: row '20220301', column 'B'
df.loc['20220301', 'B'] = 2222
print(df)
#              A     B     C   D
# 2022-03-01   0  2222     2   3
# 2022-03-02   4     5     6   7
# 2022-03-03   8     9  1111  11
# 2022-03-04  12    13    14  15
# 2022-03-05  16    17    18  19
# 2022-03-06  20    21    22  23

# Set every value in column A that is greater than 4 to 0.
# One .loc call (row mask, column label) writes straight into df. The chained
# form `df.A[df.A > 4] = 0` assigns through a temporary Series instead: it
# raises SettingWithCopyWarning and leaves df untouched under copy-on-write.
df.loc[df.A > 4, 'A'] = 0
print(df)
#             A     B     C   D
# 2022-03-01  0  2222     2   3
# 2022-03-02  4     5     6   7
# 2022-03-03  0     9  1111  11
# 2022-03-04  0    13    14  15
# 2022-03-05  0    17    18  19
# 2022-03-06  0    21    22  23


# A boolean mask on the whole frame zeroes every column of the matching rows.
# Column A holds no value above 4 any more, so nothing changes here.
df[df.A > 4] = 0
print(df)
#             A     B     C   D
# 2022-03-01  0  2222     2   3
# 2022-03-02  4     5     6   7
# 2022-03-03  0     9  1111  11
# 2022-03-04  0    13    14  15
# 2022-03-05  0    17    18  19
# 2022-03-06  0    21    22  23

# Mask on column A, write into column B. Again no row has A > 4 at this
# point, so the frame is unchanged.
df.loc[df.A > 4, 'B'] = 0
print(df)
#             A     B     C   D
# 2022-03-01  0  2222     2   3
# 2022-03-02  4     5     6   7
# 2022-03-03  0     9  1111  11
# 2022-03-04  0    13    14  15
# 2022-03-05  0    17    18  19
# 2022-03-06  0    21    22  23


# Add a new column F filled with NaN
df['F'] = np.nan
print(df)
#             A     B     C   D   F
# 2022-03-01  0  2222     2   3 NaN
# 2022-03-02  4     5     6   7 NaN
# 2022-03-03  0     9  1111  11 NaN
# 2022-03-04  0    13    14  15 NaN
# 2022-03-05  0    17    18  19 NaN
# 2022-03-06  0    21    22  23 NaN

# Add a column from a Series; the values are aligned on the index labels,
# so the Series is given the same dates as the frame
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=dates)
print(df)
#             A     B     C   D   F  E
# 2022-03-01  0  2222     2   3 NaN  1
# 2022-03-02  4     5     6   7 NaN  2
# 2022-03-03  0     9  1111  11 NaN  3
# 2022-03-04  0    13    14  15 NaN  4
# 2022-03-05  0    17    18  19 NaN  5
# 2022-03-06  0    21    22  23 NaN  6

4.pandas处理丢失数据

import pandas as pd
import numpy as np

# A 6x4 frame holding 0..23, indexed by six consecutive dates
dates = pd.date_range('20220301', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
# NaN is a float, so an integer column cannot hold it. Cast B and C to float
# up front instead of relying on an implicit int -> float upcast during the
# assignment, which recent pandas versions deprecate.
df = df.astype({'B': 'float64', 'C': 'float64'})
# Knock out two values to simulate missing data
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan

print(df)
#              A     B     C   D
# 2022-03-01   0   NaN   2.0   3
# 2022-03-02   4   5.0   NaN   7
# 2022-03-03   8   9.0  10.0  11
# 2022-03-04  12  13.0  14.0  15
# 2022-03-05  16  17.0  18.0  19
# 2022-03-06  20  21.0  22.0  23

# Element-wise test for NaN
print(df.isnull())
#                 A      B      C      D
# 2022-03-01  False   True  False  False
# 2022-03-02  False  False   True  False
# 2022-03-03  False  False  False  False
# 2022-03-04  False  False  False  False
# 2022-03-05  False  False  False  False
# 2022-03-06  False  False  False  False

# Is there at least one NaN anywhere in the frame?
# .values.any() reduces the boolean mask to a single flag; comparing the
# result with `== True` would be redundant.
print(df.isnull().values.any())
# True

# how={'any', 'all'}
# any: drop the row as soon as it contains a single NaN;
# all: drop the row only when every value in it is NaN
# axis: 0 drops rows, 1 drops columns
print(df.dropna(axis=0, how='any'))
#              A     B     C   D
# 2022-03-03   8   9.0  10.0  11
# 2022-03-04  12  13.0  14.0  15
# 2022-03-05  16  17.0  18.0  19
# 2022-03-06  20  21.0  22.0  23

# Replace every NaN with 0
print(df.fillna(value=0))
#              A     B     C   D
# 2022-03-01   0   0.0   2.0   3
# 2022-03-02   4   5.0   0.0   7
# 2022-03-03   8   9.0  10.0  11
# 2022-03-04  12  13.0  14.0  15
# 2022-03-05  16  17.0  18.0  19
# 2022-03-06  20  21.0  22.0  23

5.pandas导入导出数据

import pandas as pd

# Read a CSV file into a DataFrame
# ("xxx.csv" looks like a placeholder path - point it at a real file)
data = pd.read_csv("xxx.csv")

# Save the data to a pickle file
data.to_pickle("xxx.pickle")

6.pandas合并

import pandas as pd
import numpy as np

# concatenating
# Three 3x4 frames with the same columns, filled with 0, 1 and 2 respectively
df_1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df_2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df_3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])

print(df_1)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0

print(df_2)
#      a    b    c    d
# 0  1.0  1.0  1.0  1.0
# 1  1.0  1.0  1.0  1.0
# 2  1.0  1.0  1.0  1.0

print(df_3)
#      a    b    c    d
# 0  2.0  2.0  2.0  2.0
# 1  2.0  2.0  2.0  2.0
# 2  2.0  2.0  2.0  2.0

# axis=0: stack vertically (row-wise)    axis=1: join horizontally (column-wise)
res = pd.concat([df_1, df_2, df_3], axis=0)

# Each frame keeps its own row labels, so 0, 1, 2 repeat three times
print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 0  1.0  1.0  1.0  1.0
# 1  1.0  1.0  1.0  1.0
# 2  1.0  1.0  1.0  1.0
# 0  2.0  2.0  2.0  2.0
# 1  2.0  2.0  2.0  2.0
# 2  2.0  2.0  2.0  2.0

# ignore_index=True: discard the original row labels and renumber from 0
res = pd.concat([df_1, df_2, df_3], axis=0, ignore_index=True)

print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
# 6  2.0  2.0  2.0  2.0
# 7  2.0  2.0  2.0  2.0
# 8  2.0  2.0  2.0  2.0

import pandas as pd
import numpy as np

# join, ['inner', 'outer']
# Two frames that only partly share their columns (b, c, d) and row labels (2, 3)
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])

print(df1)
#      a    b    c    d
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  0.0  0.0  0.0  0.0

print(df2)
#      b    c    d    e
# 2  1.0  1.0  1.0  1.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0

# No join argument: the result has the union of the columns, with NaN
# wherever a frame does not have that column
res = pd.concat([df1, df2])

print(res)
#      a    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN
# 2  0.0  0.0  0.0  0.0  NaN
# 3  0.0  0.0  0.0  0.0  NaN
# 2  NaN  1.0  1.0  1.0  1.0
# 3  NaN  1.0  1.0  1.0  1.0
# 4  NaN  1.0  1.0  1.0  1.0


# join='inner': keep only the columns that both frames have
res = pd.concat([df1, df2], join='inner', ignore_index=True)

print(res)
#      b    c    d
# 0  0.0  0.0  0.0
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0
# 5  1.0  1.0  1.0

import numpy as np
import pandas as pd

# append
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so rows are appended with pd.concat, which produces the same results.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])

# Append one frame below another and renumber the rows from 0
res = pd.concat([df1, df2], ignore_index=True)

print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0

# Append several frames at once
res = pd.concat([df1, df2, df3], ignore_index=True)

print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
# 6  2.0  2.0  2.0  2.0
# 7  2.0  2.0  2.0  2.0
# 8  2.0  2.0  2.0  2.0

# Append a Series as a single new row: to_frame().T turns the Series into a
# one-row frame whose columns are the Series labels (a, b, c, d)
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)

print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  2.0  3.0  4.0

import pandas as pd

# merge two df by key/keys. (may be used in database)
# simple example

# Both frames carry the same four values in their 'key' column
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})

right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})

print(left)
#   key   A   B
# 0  K0  A0  B0
# 1  K1  A1  B1
# 2  K2  A2  B2
# 3  K3  A3  B3

print(right)
#   key   C   D
# 0  K0  C0  D0
# 1  K1  C1  D1
# 2  K2  C2  D2
# 3  K3  C3  D3

# Match rows on the shared 'key' column and place the columns of both
# frames side by side
res = pd.merge(left, right, on='key')

print(res)
#   key   A   B   C   D
# 0  K0  A0  B0  C0  D0
# 1  K1  A1  B1  C1  D1
# 2  K2  A2  B2  C2  D2
# 3  K3  A3  B3  C3  D3

import pandas as pd

# merge two df by key/keys. (may be used in database)
# consider two keys

left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})

right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                      'key2': ['K0', 'K0', 'K0', 'K0'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})

print(left)
#   key1 key2   A   B
# 0   K0   K0  A0  B0
# 1   K0   K1  A1  B1
# 2   K1   K0  A2  B2
# 3   K2   K1  A3  B3

print(right)
#   key1 key2   C   D
# 0   K0   K0  C0  D0
# 1   K1   K0  C1  D1
# 2   K1   K0  C2  D2
# 3   K2   K0  C3  D3

# Rows match only when both key1 and key2 are equal. The pair (K1, K0)
# occurs twice in `right`, so the matching left row appears twice.
res = pd.merge(left, right, on=['key1', 'key2'])

print(res)
#   key1 key2   A   B   C   D
# 0   K0   K0  A0  B0  C0  D0
# 1   K1   K0  A2  B2  C1  D1
# 2   K1   K0  A2  B2  C2  D2


# how='inner': the default merge method, keeps only the key pairs found in both frames
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')

print(res)
#   key1 key2   A   B   C   D
# 0   K0   K0  A0  B0  C0  D0
# 1   K1   K0  A2  B2  C1  D1
# 2   K1   K0  A2  B2  C2  D2

# how="outer": keeps every key pair from either frame; the missing side is filled with NaN
res = pd.merge(left, right, on=['key1', 'key2'], how='outer')

print(res)
#   key1 key2    A    B    C    D
# 0   K0   K0   A0   B0   C0   D0
# 1   K0   K1   A1   B1  NaN  NaN
# 2   K1   K0   A2   B2   C1   D1
# 3   K1   K0   A2   B2   C2   D2
# 4   K2   K1   A3   B3  NaN  NaN
# 5   K2   K0  NaN  NaN   C3   D3

import pandas as pd

# merge two df by key/keys. (may be used in database)
# indicator

df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})

print(df1)
#    col1 col_left
# 0     0        a
# 1     1        b

print(df2)
#    col1  col_right
# 0     1          2
# 1     2          2
# 2     2          2


# indicator=True: add a column telling where each row came from
# (left_only, right_only or both)
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)

print(res)
#    col1 col_left  col_right      _merge
# 0     0        a        NaN   left_only
# 1     1        b        2.0        both
# 2     2      NaN        2.0  right_only
# 3     2      NaN        2.0  right_only

# Pass a string to name the indicator column; the name defaults to _merge
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')

print(res)
#    col1 col_left  col_right indicator_column
# 0     0        a        NaN        left_only
# 1     1        b        2.0             both
# 2     2      NaN        2.0       right_only
# 3     2      NaN        2.0       right_only

import pandas as pd

# merge two df by key/keys. (may be used in database)
# merged by index

# The row labels (K0, K1, ...) act as the merge keys here; there is no key column
left = pd.DataFrame({'A': ["A0", "A1", "A2"],
                     'B': ['B0', 'B1', 'B2']}, index=['K0', 'K1', 'K2'])

right = pd.DataFrame({'C': ["C0", "C2", "C3"],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K3'])

print(left)
#      A   B
# K0  A0  B0
# K1  A1  B1
# K2  A2  B2

print(right)
#      C   D
# K0  C0  D0
# K2  C2  D2
# K3  C3  D3

# left_index and right_index

# Outer merge on the index: every row label from either frame is kept
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')

print(res)
#       A    B    C    D
# K0   A0   B0   C0   D0
# K1   A1   B1  NaN  NaN
# K2   A2   B2   C2   D2
# K3  NaN  NaN   C3   D3

# Inner merge on the index: only the row labels present in both frames are kept
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')

print(res)
#      A   B   C   D
# K0  A0  B0  C0  D0
# K2  A2  B2  C2  D2

import pandas as pd

# merge two df by key/keys. (may be used in database)
# handle overlapping

# Both frames have an 'age' column besides the key column 'k'
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})

print(boys)

girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})

print(girls)

# suffixes: appended to the overlapping column name so that the two
# 'age' columns stay distinguishable in the result
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')

print(res)
#     k  age_boy  age_girl
# 0  K0        1         4
# 1  K0        1         5

# Outer merge: unmatched keys are kept and their missing side becomes NaN
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='outer')

print(res)
#     k  age_boy  age_girl
# 0  K0      1.0       4.0
# 1  K0      1.0       5.0
# 2  K1      2.0       NaN
# 3  K2      3.0       NaN
# 4  K3      NaN       6.0

7.pandas plot图表

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# plot data

# Series
# NOTE: this Series is replaced by the DataFrame assigned to `data` below
# before anything is plotted, so it never appears in the figure
data = pd.Series(np.random.randn(1000), index=np.arange(1000))

# Running total of the random steps
data = data.cumsum()

# DataFrame
# Four columns of 1000 random values each, accumulated column by column
data = pd.DataFrame(np.random.randn(1000, 4),
                    index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()
# The values are random, so the numbers below are only one sample run
print(data.head())
#           A         B         C         D
# 0  0.076246 -0.071643 -1.384471  1.114611
# 1  2.754119  1.074283 -2.139756  0.259078
# 2  2.364150  0.617473 -1.771385  1.417175
# 3  2.171843  0.738065 -1.409256  1.942316
# 4  1.688500  2.546109 -0.901585  1.028159

# Plot the frame with the default plot kind
data.plot()

# Display the figure
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# plot data

# Series
# NOTE: this Series is replaced by the DataFrame assigned to `data` below
# before anything is plotted, so it never appears in the figure
data = pd.Series(np.random.randn(1000), index=np.arange(1000))

# Running total of the random steps
data = data.cumsum()

# DataFrame
# Four columns of 1000 random values each, accumulated column by column
data = pd.DataFrame(np.random.randn(1000, 4),
                    index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()
# The values are random, so the numbers below are only one sample run
print(data.head())
#           A         B         C         D
# 0  0.076246 -0.071643 -1.384471  1.114611
# 1  2.754119  1.074283 -2.139756  0.259078
# 2  2.364150  0.617473 -1.771385  1.417175
# 3  2.171843  0.738065 -1.409256  1.942316
# 4  1.688500  2.546109 -0.901585  1.028159


# plot methods:
# 'bar','hist','box','kde','area','scatter','hexbin','pie'
# Scatter A against B first, then pass the returned axes back in (ax=ax)
# so that the A-against-C points are drawn on the same axes
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class 1')
data.plot.scatter(x='A', y='C', color='DarkGreen', label='Class 2', ax=ax)

# Display the figure
plt.show()

akulululu

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python-Pandas

Python Pandas
复制链接

扫一扫

Python-Pandas

Python Pandas

1.Pandas基本介绍

2.Pandas选择数据

3.pandas设置值

4.pandas处理丢失数据

5.pandas导入导出数据

6.pandas合并

7.pandas plot图表

“相关推荐”对你有帮助么？