python_pandas

theeb

于 2019-07-25 17:52:38 发布

阅读量118

点赞数

本文链接：https://blog.csdn.net/xuyuehengyi/article/details/97273675

版权

>>> s=pd.Series([1,2,np.nan,4])  # Series: one-dimensional labeled array (dtype becomes float64 because of the NaN)
>>> s
0    1.0
1    2.0
2    NaN
3    4.0
dtype: float64

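>>> dates = pd.date_range('20190725',periods=4)   # DatetimeIndex of 4 consecutive days, used below as the row index
>>> dates
DatetimeIndex(['2019-07-25', '2019-07-26', '2019-07-27', '2019-07-28'], dtype='datetime64[ns]', freq='D')
>>> df = pd.DataFrame(np.random.randn(4,4),index=dates,columns=['a','b','c','d'])   # DataFrame: tabular data structure with a two-dimensional (row/column) index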
>>> df
                   a         b         c         d
2019-07-25  0.837788  0.012600 -1.815884  0.276740
2019-07-26 -1.586108  0.737898 -0.541256  0.449048
2019-07-27 -1.522343  0.264496 -0.285446  1.933708
2019-07-28  1.063021 -0.566470  0.071764  0.243278

>>> df["b"]                                # 索引数据
2019-07-25    0.012600
2019-07-26    0.737898
2019-07-27    0.264496
2019-07-28   -0.566470
Freq: D, Name: b, dtype: float64
>>> df.dtypes                            # 数据类型
a    float64
b    float64
c    float64
d    float64
dtype: object
>>> df.index                            # row labels
DatetimeIndex(['2019-07-25', '2019-07-26', '2019-07-27', '2019-07-28'], dtype='datetime64[ns]', freq='D')
>>> df.columns                          # column labels
Index(['a', 'b', 'c', 'd'], dtype='object')
>>> df.values                          # 值
array([[ 0.83778807,  0.01259989, -1.81588355,  0.27673983],
       [-1.58610777,  0.73789787, -0.54125606,  0.44904759],
       [-1.52234345,  0.26449592, -0.28544604,  1.93370844],
       [ 1.06302092, -0.56646994,  0.07176394,  0.24327814]])
>>> df.describe()              
              a         b         c         d
count  4.000000  4.000000  4.000000  4.000000
mean  -0.301911  0.112131 -0.642705  0.725694
std    1.449203  0.543202  0.821531  0.810374
min   -1.586108 -0.566470 -1.815884  0.243278
25%   -1.538285 -0.132168 -0.859913  0.268374
50%   -0.342278  0.138548 -0.413351  0.362894
75%    0.894096  0.382846 -0.196144  0.820213
max    1.063021  0.737898  0.071764  1.933708
>>> df.T
   2019-07-25  2019-07-26  2019-07-27  2019-07-28
a    0.837788   -1.586108   -1.522343    1.063021
b    0.012600    0.737898    0.264496   -0.566470
c   -1.815884   -0.541256   -0.285446    0.071764
d    0.276740    0.449048    1.933708    0.243278
>>> df.sort_index(axis=1,ascending=False)          # 横向 降序
                   d         c         b         a
2019-07-25  0.276740 -1.815884  0.012600  0.837788
2019-07-26  0.449048 -0.541256  0.737898 -1.586108
2019-07-27  1.933708 -0.285446  0.264496 -1.522343
2019-07-28  0.243278  0.071764 -0.566470  1.063021
  
>>> df.sort_values(by='a')                          # a列 升序
                   a         b         c         d
2019-07-26 -1.586108  0.737898 -0.541256  0.449048
2019-07-27 -1.522343  0.264496 -0.285446  1.933708
2019-07-25  0.837788  0.012600 -1.815884  0.276740
2019-07-28  1.063021 -0.566470  0.071764  0.243278

>>> df
             A     B     C   D
2019-01-01   0   NaN   2.0   3
2019-01-02   4   5.0   NaN   7
2019-01-03   8   9.0  10.0  11
2019-01-04  12  13.0  14.0  15
2019-01-05  16  17.0  18.0  19
2019-01-06  20  21.0  22.0  23
>>> df.dropna(axis=0,how="any")             # axis=0 drops rows; how="any" drops a row containing at least one NaN, how="all" only when every value in the row is NaN
             A     B     C   D
2019-01-03   8   9.0  10.0  11
2019-01-04  12  13.0  14.0  15
2019-01-05  16  17.0  18.0  19
2019-01-06  20  21.0  22.0  23
>>> df.fillna(value=99)                    # 代替
             A     B     C   D
2019-01-01   0  99.0   2.0   3
2019-01-02   4   5.0  99.0   7
2019-01-03   8   9.0  10.0  11
2019-01-04  12  13.0  14.0  15
2019-01-05  16  17.0  18.0  19
2019-01-06  20  21.0  22.0  23
>>> df.isnull()
                A      B      C      D
2019-01-01  False   True  False  False
2019-01-02  False  False   True  False
2019-01-03  False  False  False  False
2019-01-04  False  False  False  False
2019-01-05  False  False  False  False
2019-01-06  False  False  False  False
>>> np.any(df.isnull())
True

>>> df1
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
>>> df2
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
>>> df3
     a    b    c    d
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0

>>> res = pd.concat([df1, df2, df3], axis=0, ignore_index=True) # 纵向加 忽略原来索引
>>> res
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0

>>> df1
     b    c    d    e
1  1.0  1.0  1.0  1.0
>>> df2
     a    b    c    d
1  0.0  0.0  0.0  0.0

>>> res = pd.concat([df1, df2], axis=0, join='inner')  # 纵向合并，只合并有相同索引的列
>>> res
     b    c    d
1  1.0  1.0  1.0
1  0.0  0.0  0.0

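>>> res = pd.concat([df1, df2], axis=0, join_axes=[df1.columns])  # row-wise concat keeping only df1's columns; NOTE: join_axes was removed in pandas 1.0 -- use pd.concat([df1, df2], axis=0).reindex(columns=df1.columns)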

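>>> df1.append(df2)               # row-wise concatenation only; NOTE: DataFrame.append was removed in pandas 2.0 -- use pd.concat([df1, df2])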
     a    b    c    d    e
1  NaN  1.0  1.0  1.0  1.0
1  0.0  0.0  0.0  0.0  NaN

>>> df1
     b    c    d    e
1  1.0  1.0  1.0  1.0
>>> df2
     a    b    c    d
1  0.0  0.0  0.0  0.0

# 找到b,c，d列中，两者一样的行，这里没有一样的，为空
>>> res = pd.merge(df1, df2, on=['b', 'c','d'], how='inner')   
>>> res
Empty DataFrame
Columns: [b, c, d, e, a]
Index: []
# 不一样的用nan填充
>>> res = pd.merge(df1, df2, on=['b', 'c','d'], how='outer')
>>> res
     b    c    d    e    a
0  1.0  1.0  1.0  1.0  NaN
1  0.0  0.0  0.0  NaN  0.0
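# how='left': keep every key row from the left frame (df1); columns that only exist in df2 are filled with NaN when there is no match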
>>> res = pd.merge(df1, df2, on=['b', 'c','d'], how='left')
>>> res
     b    c    d    e   a
0  1.0  1.0  1.0  1.0 NaN
>>> res = pd.merge(df1, df2, on=['b', 'c','d'], how='left', indicator='indicator_column')
>>> res
     b    c    d    e   a indicator_column
0  1.0  1.0  1.0  1.0 NaN        left_only

>>> res = pd.merge(df1, df2, on=['b', 'c','d'], how='left', indicator=True)
>>> res
     b    c    d    e   a     _merge
0  1.0  1.0  1.0  1.0 NaN  left_only

>>> boys
    k  age
0  K0    1
1  K1    2
2  K2    3
>>> girls
    k  age
0  K0    4
1  K0    5
2  K3    6
>>> res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
>>> res
    k  age_boy  age_girl
0  K0        1         4
1  K0        1         5

# Line plot: four random series accumulated into random walks.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list("ABCD"),
)
# cumsum() returns a new DataFrame rather than modifying in place; rebind the
# result so the plot shows the accumulated values instead of the raw noise.
data = data.cumsum()
data.plot()
plt.show()


# Scatter plot: overlay two scatter series on one shared Axes.
# The first call creates the Axes and returns it; passing that object as ax=
# to the second call draws both series on the same figure. Without this first
# call the name `ax` is undefined and the second call raises NameError.
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class1')
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class2', ax=ax)
plt.show()

theeb

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python_pandas

>>> s=pd.Series([1,2,np.nan,4]) # 字符串一维索引>>> s0 1.01 2.02 NaN3 4.0dtype: float64>>> dates = pd.date_range('20190725',periods=4) # 表格型数据结构，二维索引>&...
复制链接

扫一扫