数据挖掘3.Pandas基础数据处理

Pandas 数据处理工具(Pandas基础数据处理)

1.便捷的数据处理能力

2.读取文件方便

3.封装了Matplotlib、Numpy的画图和计算功能

Pandas的核心数据结构

1.DataFrame

2.Panel

3.Series

# DataFrame:
#     既有行索引,又有列索引的二维数组
import numpy as np
stock_change = np.random.normal(0,1,(10,5))
stock_change
array([[-8.95568808e-01,  4.06084744e-01,  2.39582146e+00,
         5.11976280e-01,  2.77457517e-01],
       [-4.51453894e-01,  6.31000994e-01,  1.24744949e+00,
        -2.00202912e+00, -2.92207867e-01],
       [ 6.55667369e-02, -7.84859167e-01, -4.12101825e-01,
        -3.47667810e-01,  1.45793706e-03],
       [-1.67876579e-01, -1.40968116e+00,  1.81237520e+00,
        -1.92033529e+00,  9.38875605e-01],
       [ 6.00839305e-02, -1.05816829e-01, -4.45666222e-01,
        -1.81006228e-01,  2.05418850e-01],
       [-6.77267207e-01, -7.53382789e-02,  3.97187494e-01,
         1.07846908e-01, -1.32381297e+00],
       [-1.68876926e+00, -1.70388042e-01, -3.94244637e-01,
         2.25074192e+00, -6.01979955e-01],
       [-1.91533157e+00, -1.22823000e-01,  3.91800427e-01,
        -8.66979997e-01,  2.93894267e-01],
       [ 1.36784986e+00, -7.75015875e-01, -1.71867214e+00,
         4.12866314e-01, -1.37527880e+00],
       [-1.04679747e-01,  4.59861072e-01, -7.55025583e-01,
        -6.71892862e-01, -1.52619930e+00]])
import pandas as pd
pd.DataFrame(stock_change)
01234
0-0.8955690.4060852.3958210.5119760.277458
1-0.4514540.6310011.247449-2.002029-0.292208
20.065567-0.784859-0.412102-0.3476680.001458
3-0.167877-1.4096811.812375-1.9203350.938876
40.060084-0.105817-0.445666-0.1810060.205419
5-0.677267-0.0753380.3971870.107847-1.323813
6-1.688769-0.170388-0.3942452.250742-0.601980
7-1.915332-0.1228230.391800-0.8669800.293894
81.367850-0.775016-1.7186720.412866-1.375279
9-0.1046800.459861-0.755026-0.671893-1.526199
#添加行索引
stock = ["股票{}".format(i) for i in range(10)]
pd.DataFrame(stock_change, index=stock)
01234
股票0-0.8955690.4060852.3958210.5119760.277458
股票1-0.4514540.6310011.247449-2.002029-0.292208
股票20.065567-0.784859-0.412102-0.3476680.001458
股票3-0.167877-1.4096811.812375-1.9203350.938876
股票40.060084-0.105817-0.445666-0.1810060.205419
股票5-0.677267-0.0753380.3971870.107847-1.323813
股票6-1.688769-0.170388-0.3942452.250742-0.601980
股票7-1.915332-0.1228230.391800-0.8669800.293894
股票81.367850-0.775016-1.7186720.412866-1.375279
股票9-0.1046800.459861-0.755026-0.671893-1.526199
#添加列索引
date = pd.date_range(start="20180101", periods=5, freq="B")
date
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05'],
              dtype='datetime64[ns]', freq='B')
data = pd.DataFrame(stock_change, index=stock, columns=date)
# DataFrame常见属性演示
data
2018-01-012018-01-022018-01-032018-01-042018-01-05
股票0-0.8955690.4060852.3958210.5119760.277458
股票1-0.4514540.6310011.247449-2.002029-0.292208
股票20.065567-0.784859-0.412102-0.3476680.001458
股票3-0.167877-1.4096811.812375-1.9203350.938876
股票40.060084-0.105817-0.445666-0.1810060.205419
股票5-0.677267-0.0753380.3971870.107847-1.323813
股票6-1.688769-0.170388-0.3942452.250742-0.601980
股票7-1.915332-0.1228230.391800-0.8669800.293894
股票81.367850-0.775016-1.7186720.412866-1.375279
股票9-0.1046800.459861-0.755026-0.671893-1.526199
data.shape
(10, 5)
data.index
Index(['股票0', '股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9'], dtype='object')
data.columns
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05'],
              dtype='datetime64[ns]', freq='B')
data.values
array([[-8.95568808e-01,  4.06084744e-01,  2.39582146e+00,
         5.11976280e-01,  2.77457517e-01],
       [-4.51453894e-01,  6.31000994e-01,  1.24744949e+00,
        -2.00202912e+00, -2.92207867e-01],
       [ 6.55667369e-02, -7.84859167e-01, -4.12101825e-01,
        -3.47667810e-01,  1.45793706e-03],
       [-1.67876579e-01, -1.40968116e+00,  1.81237520e+00,
        -1.92033529e+00,  9.38875605e-01],
       [ 6.00839305e-02, -1.05816829e-01, -4.45666222e-01,
        -1.81006228e-01,  2.05418850e-01],
       [-6.77267207e-01, -7.53382789e-02,  3.97187494e-01,
         1.07846908e-01, -1.32381297e+00],
       [-1.68876926e+00, -1.70388042e-01, -3.94244637e-01,
         2.25074192e+00, -6.01979955e-01],
       [-1.91533157e+00, -1.22823000e-01,  3.91800427e-01,
        -8.66979997e-01,  2.93894267e-01],
       [ 1.36784986e+00, -7.75015875e-01, -1.71867214e+00,
         4.12866314e-01, -1.37527880e+00],
       [-1.04679747e-01,  4.59861072e-01, -7.55025583e-01,
        -6.71892862e-01, -1.52619930e+00]])
data.T
股票0股票1股票2股票3股票4股票5股票6股票7股票8股票9
2018-01-01-0.895569-0.4514540.065567-0.1678770.060084-0.677267-1.688769-1.9153321.367850-0.104680
2018-01-020.4060850.631001-0.784859-1.409681-0.105817-0.075338-0.170388-0.122823-0.7750160.459861
2018-01-032.3958211.247449-0.4121021.812375-0.4456660.397187-0.3942450.391800-1.718672-0.755026
2018-01-040.511976-2.002029-0.347668-1.920335-0.1810060.1078472.250742-0.8669800.412866-0.671893
2018-01-050.277458-0.2922080.0014580.9388760.205419-1.323813-0.6019800.293894-1.375279-1.526199
#DataFrame常用方法
data.head()
2018-01-012018-01-022018-01-032018-01-042018-01-05
股票0-0.8955690.4060852.3958210.5119760.277458
股票1-0.4514540.6310011.247449-2.002029-0.292208
股票20.065567-0.784859-0.412102-0.3476680.001458
股票3-0.167877-1.4096811.812375-1.9203350.938876
股票40.060084-0.105817-0.445666-0.1810060.205419
data.head(3)
2018-01-012018-01-022018-01-032018-01-042018-01-05
股票0-0.8955690.4060852.3958210.5119760.277458
股票1-0.4514540.6310011.247449-2.002029-0.292208
股票20.065567-0.784859-0.412102-0.3476680.001458
data.tail(2)
2018-01-012018-01-022018-01-032018-01-042018-01-05
股票81.36785-0.775016-1.7186720.412866-1.375279
股票9-0.104680.459861-0.755026-0.671893-1.526199
#DataFrame索引的设置
#1.修改行列索引值
data.head()
2018-01-012018-01-022018-01-032018-01-042018-01-05
股票0-0.8955690.4060852.3958210.5119760.277458
股票1-0.4514540.6310011.247449-2.002029-0.292208
股票20.065567-0.784859-0.412102-0.3476680.001458
股票3-0.167877-1.4096811.812375-1.9203350.938876
股票40.060084-0.105817-0.445666-0.1810060.205419
#data.index[2] = "股票88" 会报错不能单独修改索引
stock_ =["股票_{}".format(i) for i in range(10)]#只能整体修改
data.index = stock_
data
2018-01-012018-01-022018-01-032018-01-042018-01-05
股票_0-0.8955690.4060852.3958210.5119760.277458
股票_1-0.4514540.6310011.247449-2.002029-0.292208
股票_20.065567-0.784859-0.412102-0.3476680.001458
股票_3-0.167877-1.4096811.812375-1.9203350.938876
股票_40.060084-0.105817-0.445666-0.1810060.205419
股票_5-0.677267-0.0753380.3971870.107847-1.323813
股票_6-1.688769-0.170388-0.3942452.250742-0.601980
股票_7-1.915332-0.1228230.391800-0.8669800.293894
股票_81.367850-0.775016-1.7186720.412866-1.375279
股票_9-0.1046800.459861-0.755026-0.671893-1.526199
#2.重设索引
data.reset_index(drop = False)
index2018-01-01 00:00:002018-01-02 00:00:002018-01-03 00:00:002018-01-04 00:00:002018-01-05 00:00:00
0股票_0-0.8955690.4060852.3958210.5119760.277458
1股票_1-0.4514540.6310011.247449-2.002029-0.292208
2股票_20.065567-0.784859-0.412102-0.3476680.001458
3股票_3-0.167877-1.4096811.812375-1.9203350.938876
4股票_40.060084-0.105817-0.445666-0.1810060.205419
5股票_5-0.677267-0.0753380.3971870.107847-1.323813
6股票_6-1.688769-0.170388-0.3942452.250742-0.601980
7股票_7-1.915332-0.1228230.391800-0.8669800.293894
8股票_81.367850-0.775016-1.7186720.412866-1.375279
9股票_9-0.1046800.459861-0.755026-0.671893-1.526199
data.shape
(10, 5)
#设置新索引
df = pd.DataFrame({'month':[1,4,7,10],
                    'year':[2012,2014,2013,2014],
                    'sale':[55,40,84,31]})
df
monthyearsale
01201255
14201440
27201384
310201431
#以月份为新的索引
df.set_index("month",drop=True)
yearsale
month
1201255
4201440
7201384
10201431
df.set_index("month",drop=False)
monthyearsale
month
11201255
44201440
77201384
1010201431
#设置多个索引
new_df = df.set_index(["year","month"])
new_df
sale
yearmonth
2012155
2014440
2013784
20141031
new_df.index##变成MultiIndex类型
MultiIndex([(2012,  1),
            (2014,  4),
            (2013,  7),
            (2014, 10)],
           names=['year', 'month'])
new_df.index.names
FrozenList(['year', 'month'])
new_df.index.levels
FrozenList([[2012, 2013, 2014], [1, 4, 7, 10]])

Panel存储三维结构的数据

p = pd.Panel(np.arange(24).reshape(4,3,2),
             items=list('ABCD'),
             major_axis=pd.date_range('20130101',periods=3),
             minor_axis=['first','second'])#Panel是存储三维数据的容器,已弃用并在pandas 0.25.0中被移除(推荐改用带MultiIndex的DataFrame),因此这里会报AttributeError
---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

Cell In[71], line 1
----> 1 p = pd.Panel(np.arange(24).reshape(4,3,2),
      2              items=list('ABCD'),
      3              major_axis=pd.date_range('20130101',periods=3),
      4              minor_axis=['first','second'])


AttributeError: module 'pandas' has no attribute 'Panel'
data
2018-01-012018-01-022018-01-032018-01-042018-01-05
股票_0-0.8955690.4060852.3958210.5119760.277458
股票_1-0.4514540.6310011.247449-2.002029-0.292208
股票_20.065567-0.784859-0.412102-0.3476680.001458
股票_3-0.167877-1.4096811.812375-1.9203350.938876
股票_40.060084-0.105817-0.445666-0.1810060.205419
股票_5-0.677267-0.0753380.3971870.107847-1.323813
股票_6-1.688769-0.170388-0.3942452.250742-0.601980
股票_7-1.915332-0.1228230.391800-0.8669800.293894
股票_81.367850-0.775016-1.7186720.412866-1.375279
股票_9-0.1046800.459861-0.755026-0.671893-1.526199
data.iloc[1,:]  #带索引的一维数组,即Series
2018-01-01   -0.451454
2018-01-02    0.631001
2018-01-03    1.247449
2018-01-04   -2.002029
2018-01-05   -0.292208
Freq: B, Name: 股票_1, dtype: float64
data.iloc[1,:].index
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05'],
              dtype='datetime64[ns]', freq='B')
#Series
sr = data.iloc[1,:]
sr.index
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05'],
              dtype='datetime64[ns]', freq='B')
sr.values
array([-0.45145389,  0.63100099,  1.24744949, -2.00202912, -0.29220787])
type(sr.values)
numpy.ndarray
pd.Series(np.arange(3,9,2),index=["a","b","c"])
a    3
b    5
c    7
dtype: int32
pd.Series({'red':100,'blue':200,'green':500,'yellow':1000})
red        100
blue       200
green      500
yellow    1000
dtype: int64
#可以将DataFrame理解为Series的容器
#Panel是DataFrame的容器

基本数据操作

import pandas as pd
data = pd.read_csv("./stock_day/stock_day.csv")
data
openhighcloselowvolumeprice_changep_changema5ma10ma20v_ma5v_ma10v_ma20turnover
2018-02-2723.5325.8824.1623.5395578.030.632.6822.94222.14222.87553782.6446738.6555576.112.39
2018-02-2622.8023.7823.5322.8060985.110.693.0222.40621.95522.94240827.5242736.3456007.501.53
2018-02-2322.8823.3722.8222.7152914.010.542.4221.93821.92923.02235119.5841871.9756372.851.32
2018-02-2222.2522.7622.2822.0236105.010.361.6421.44621.90923.13735397.5839904.7860149.600.90
2018-02-1421.4921.9921.9221.4823331.040.442.0521.36621.92323.25333590.2142935.7461716.110.58
.............................................
2015-03-0613.1714.4814.2813.13179831.721.128.5113.11213.11213.112115090.18115090.18115090.186.16
2015-03-0512.8813.4513.1612.8793180.390.262.0212.82012.82012.82098904.7998904.7998904.793.19
2015-03-0412.8012.9212.9012.6167075.440.201.5712.70712.70712.707100812.93100812.93100812.932.30
2015-03-0312.5213.0612.7012.52139071.610.181.4412.61012.61012.610117681.67117681.67117681.674.76
2015-03-0212.2512.6712.5212.2096291.730.322.6212.52012.52012.52096291.7396291.7396291.733.30

643 rows × 14 columns

data = data.drop(["ma5","ma10","ma20","v_ma5","v_ma10","v_ma20"], axis=1)
data
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2723.5325.8824.1623.5395578.030.632.682.39
2018-02-2622.8023.7823.5322.8060985.110.693.021.53
2018-02-2322.8823.3722.8222.7152914.010.542.421.32
2018-02-2222.2522.7622.2822.0236105.010.361.640.90
2018-02-1421.4921.9921.9221.4823331.040.442.050.58
...........................
2015-03-0613.1714.4814.2813.13179831.721.128.516.16
2015-03-0512.8813.4513.1612.8793180.390.262.023.19
2015-03-0412.8012.9212.9012.6167075.440.201.572.30
2015-03-0312.5213.0612.7012.52139071.610.181.444.76
2015-03-0212.2512.6712.5212.2096291.730.322.623.30

643 rows × 8 columns

#data[1,0]   不能直接进行数字索引
#1.直接索引(先列后行)
data["open"]["2018-02-26"]
22.8
#2.按名字索引
data.loc["2018-02-26"]["open"]
22.8
data.loc["2018-02-26","open"]
22.8
#3.按数字索引
data.iloc[1,0]
22.8
#4.既用数字又用名字(ix组合索引已被移除,改用下面的loc/iloc写法)
data.loc[data.index[0:4],['open','close','high','low']]

openclosehighlow
2018-02-2723.5324.1625.8823.53
2018-02-2622.8023.5323.7822.80
2018-02-2322.8822.8223.3722.71
2018-02-2222.2522.2822.7622.02
data.iloc[0:4,data.columns.get_indexer(['open','close','high','low'])]
openclosehighlow
2018-02-2723.5324.1625.8823.53
2018-02-2622.8023.5323.7822.80
2018-02-2322.8822.8223.3722.71
2018-02-2222.2522.2822.7622.02
#赋值操作
data
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2723.5325.8824.1623.5395578.030.632.682.39
2018-02-2622.8023.7823.5322.8060985.110.693.021.53
2018-02-2322.8823.3722.8222.7152914.010.542.421.32
2018-02-2222.2522.7622.2822.0236105.010.361.640.90
2018-02-1421.4921.9921.9221.4823331.040.442.050.58
...........................
2015-03-0613.1714.4814.2813.13179831.721.128.516.16
2015-03-0512.8813.4513.1612.8793180.390.262.023.19
2015-03-0412.8012.9212.9012.6167075.440.201.572.30
2015-03-0312.5213.0612.7012.52139071.610.181.444.76
2015-03-0212.2512.6712.5212.2096291.730.322.623.30

643 rows × 8 columns

data["open"]
2018-02-27    23.53
2018-02-26    22.80
2018-02-23    22.88
2018-02-22    22.25
2018-02-14    21.49
              ...  
2015-03-06    13.17
2015-03-05    12.88
2015-03-04    12.80
2015-03-03    12.52
2015-03-02    12.25
Name: open, Length: 643, dtype: float64
data.open = 100
data
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2710025.8824.1623.5395578.030.632.682.39
2018-02-2610023.7823.5322.8060985.110.693.021.53
2018-02-2310023.3722.8222.7152914.010.542.421.32
2018-02-2210022.7622.2822.0236105.010.361.640.90
2018-02-1410021.9921.9221.4823331.040.442.050.58
...........................
2015-03-0610014.4814.2813.13179831.721.128.516.16
2015-03-0510013.4513.1612.8793180.390.262.023.19
2015-03-0410012.9212.9012.6167075.440.201.572.30
2015-03-0310013.0612.7012.52139071.610.181.444.76
2015-03-0210012.6712.5212.2096291.730.322.623.30

643 rows × 8 columns

data.iloc[1,0]=222
data
openhighcloselowvolumeprice_changep_changeturnover
2015-06-1010036.3533.8532.23269033.120.511.539.21
2015-06-1222235.9835.2134.01159825.880.822.385.47
2017-10-3110035.2234.4432.20361660.882.387.429.05
2015-06-1510034.9931.6931.69199369.53-3.52-10.006.82
2015-06-1110034.9834.3932.51173075.730.541.595.92
...........................
2015-03-0510013.4513.1612.8793180.390.262.023.19
2015-09-0710013.3812.7712.6352490.040.372.981.80
2015-03-0310013.0612.7012.52139071.610.181.444.76
2015-03-0410012.9212.9012.6167075.440.201.572.30
2015-03-0210012.6712.5212.2096291.730.322.623.30

643 rows × 8 columns

data.head()
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2710025.8824.1623.5395578.030.632.682.39
2018-02-2622223.7823.5322.8060985.110.693.021.53
2018-02-2310023.3722.8222.7152914.010.542.421.32
2018-02-2210022.7622.2822.0236105.010.361.640.90
2018-02-1410021.9921.9221.4823331.040.442.050.58
#排序操作
    #对内容排序
    #对索引进行排序
data
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2710025.8824.1623.5395578.030.632.682.39
2018-02-2622223.7823.5322.8060985.110.693.021.53
2018-02-2310023.3722.8222.7152914.010.542.421.32
2018-02-2210022.7622.2822.0236105.010.361.640.90
2018-02-1410021.9921.9221.4823331.040.442.050.58
...........................
2015-03-0610014.4814.2813.13179831.721.128.516.16
2015-03-0510013.4513.1612.8793180.390.262.023.19
2015-03-0410012.9212.9012.6167075.440.201.572.30
2015-03-0310013.0612.7012.52139071.610.181.444.76
2015-03-0210012.6712.5212.2096291.730.322.623.30

643 rows × 8 columns

data.sort_values(by='high',ascending=False)#默认为True从小到大进行排序
openhighcloselowvolumeprice_changep_changeturnover
2015-06-1010036.3533.8532.23269033.120.511.539.21
2015-06-1222235.9835.2134.01159825.880.822.385.47
2017-10-3110035.2234.4432.20361660.882.387.429.05
2015-06-1510034.9931.6931.69199369.53-3.52-10.006.82
2015-06-1110034.9834.3932.51173075.730.541.595.92
...........................
2015-03-0510013.4513.1612.8793180.390.262.023.19
2015-09-0710013.3812.7712.6352490.040.372.981.80
2015-03-0310013.0612.7012.52139071.610.181.444.76
2015-03-0410012.9212.9012.6167075.440.201.572.30
2015-03-0210012.6712.5212.2096291.730.322.623.30

643 rows × 8 columns

#按多个字段进行排序
data.sort_values(by=['high','p_change'],ascending=False)#默认为True从小到大进行排序
openhighcloselowvolumeprice_changep_changeturnover
2015-06-1010036.3533.8532.23269033.120.511.539.21
2015-06-1222235.9835.2134.01159825.880.822.385.47
2017-10-3110035.2234.4432.20361660.882.387.429.05
2015-06-1510034.9931.6931.69199369.53-3.52-10.006.82
2015-06-1110034.9834.3932.51173075.730.541.595.92
...........................
2015-03-0510013.4513.1612.8793180.390.262.023.19
2015-09-0710013.3812.7712.6352490.040.372.981.80
2015-03-0310013.0612.7012.52139071.610.181.444.76
2015-03-0410012.9212.9012.6167075.440.201.572.30
2015-03-0210012.6712.5212.2096291.730.322.623.30

643 rows × 8 columns

#对索引进行排序
data
openhighcloselowvolumeprice_changep_changeturnover
2015-06-1010036.3533.8532.23269033.120.511.539.21
2015-06-1222235.9835.2134.01159825.880.822.385.47
2017-10-3110035.2234.4432.20361660.882.387.429.05
2015-06-1510034.9931.6931.69199369.53-3.52-10.006.82
2015-06-1110034.9834.3932.51173075.730.541.595.92
...........................
2015-03-0510013.4513.1612.8793180.390.262.023.19
2015-09-0710013.3812.7712.6352490.040.372.981.80
2015-03-0310013.0612.7012.52139071.610.181.444.76
2015-03-0410012.9212.9012.6167075.440.201.572.30
2015-03-0210012.6712.5212.2096291.730.322.623.30

643 rows × 8 columns

data.sort_index()#从小到大排
openhighcloselowvolumeprice_changep_changeturnover
2015-03-0210012.6712.5212.2096291.730.322.623.30
2015-03-0310013.0612.7012.52139071.610.181.444.76
2015-03-0410012.9212.9012.6167075.440.201.572.30
2015-03-0510013.4513.1612.8793180.390.262.023.19
2015-03-0610014.4814.2813.13179831.721.128.516.16
...........................
2018-02-1410021.9921.9221.4823331.040.442.050.58
2018-02-2210022.7622.2822.0236105.010.361.640.90
2018-02-2310023.3722.8222.7152914.010.542.421.32
2018-02-2622223.7823.5322.8060985.110.693.021.53
2018-02-2710025.8824.1623.5395578.030.632.682.39

643 rows × 8 columns

#Series的排序
sr = data["price_change"]
sr
2015-06-10    0.51
2015-06-12    0.82
2017-10-31    2.38
2015-06-15   -3.52
2015-06-11    0.54
              ... 
2015-03-05    0.26
2015-09-07    0.37
2015-03-03    0.18
2015-03-04    0.20
2015-03-02    0.32
Name: price_change, Length: 643, dtype: float64
sr.sort_values(ascending=False).head()
2015-06-09    3.03
2017-10-26    2.68
2015-05-21    2.57
2017-10-31    2.38
2017-06-22    2.36
Name: price_change, dtype: float64
sr.sort_index()
2015-03-02    0.32
2015-03-03    0.18
2015-03-04    0.20
2015-03-05    0.26
2015-03-06    1.12
              ... 
2018-02-14    0.44
2018-02-22    0.36
2018-02-23    0.54
2018-02-26    0.69
2018-02-27    0.63
Name: price_change, Length: 643, dtype: float64

DataFrame运算

#算术运算
data
openhighcloselowvolumeprice_changep_changeturnover
2015-06-1010036.3533.8532.23269033.120.511.539.21
2015-06-1222235.9835.2134.01159825.880.822.385.47
2017-10-3110035.2234.4432.20361660.882.387.429.05
2015-06-1510034.9931.6931.69199369.53-3.52-10.006.82
2015-06-1110034.9834.3932.51173075.730.541.595.92
...........................
2015-03-0510013.4513.1612.8793180.390.262.023.19
2015-09-0710013.3812.7712.6352490.040.372.981.80
2015-03-0310013.0612.7012.52139071.610.181.444.76
2015-03-0410012.9212.9012.6167075.440.201.572.30
2015-03-0210012.6712.5212.2096291.730.322.623.30

643 rows × 8 columns

data["open"].add(3).head()
2015-06-10    103
2015-06-12    225
2017-10-31    103
2015-06-15    103
2015-06-11    103
Name: open, dtype: int64
data+10
openhighcloselowvolumeprice_changep_changeturnover
2015-06-1011046.3543.8542.23269043.1210.5111.5319.21
2015-06-1223245.9845.2144.01159835.8810.8212.3815.47
2017-10-3111045.2244.4442.20361670.8812.3817.4219.05
2015-06-1511044.9941.6941.69199379.536.480.0016.82
2015-06-1111044.9844.3942.51173085.7310.5411.5915.92
...........................
2015-03-0511023.4523.1622.8793190.3910.2612.0213.19
2015-09-0711023.3822.7722.6352500.0410.3712.9811.80
2015-03-0311023.0622.7022.52139081.6110.1811.4414.76
2015-03-0411022.9222.9022.6167085.4410.2011.5712.30
2015-03-0211022.6722.5222.2096301.7310.3212.6213.30

643 rows × 8 columns

data.sub(100).head()
openhighcloselowvolumeprice_changep_changeturnover
2015-06-100-63.65-66.15-67.77268933.12-99.49-98.47-90.79
2015-06-12122-64.02-64.79-65.99159725.88-99.18-97.62-94.53
2017-10-310-64.78-65.56-67.80361560.88-97.62-92.58-90.95
2015-06-150-65.01-68.31-68.31199269.53-103.52-110.00-93.18
2015-06-110-65.02-65.61-67.49172975.73-99.46-98.41-94.08
data["close"].sub(data["open"]).head()
2015-06-10    -66.15
2015-06-12   -186.79
2017-10-31    -65.56
2015-06-15    -68.31
2015-06-11    -65.61
dtype: float64
#逻辑运算(可进行布尔索引)
data["p_change"]>2
2015-06-10    False
2015-06-12     True
2017-10-31     True
2015-06-15    False
2015-06-11    False
              ...  
2015-03-05     True
2015-09-07     True
2015-03-03    False
2015-03-04    False
2015-03-02     True
Name: p_change, Length: 643, dtype: bool
data[data["p_change"]>2].head()
openhighcloselowvolumeprice_changep_changeturnover
2015-06-1222235.9835.2134.01159825.880.822.385.47
2017-10-3110035.2234.4432.20361660.882.387.429.05
2015-06-1610033.4832.3529.61153130.610.662.085.24
2015-06-0910033.3433.3430.46204438.473.0310.007.00
2017-10-2710033.2033.1131.45333824.310.702.168.35
data[(data["p_change"]>2) &(data["low"]<15)]
openhighcloselowvolumeprice_changep_changeturnover
2015-09-1110016.0716.0714.61114900.251.469.993.93
2015-09-2910016.0015.4714.8178247.200.382.522.68
2016-01-1410016.0015.9414.3968228.870.704.592.34
2015-03-2710015.8615.7714.90120352.130.845.634.12
2015-03-1710015.4415.1814.63158770.770.312.085.43
2016-02-0410015.2515.1514.5240090.400.674.631.37
2016-03-0210015.2015.1814.3649128.620.775.341.68
2015-09-2210015.1914.9714.4881399.430.443.032.79
2015-03-1610015.0514.8714.5194468.300.402.763.23
2015-09-0910014.8814.6313.9595082.470.775.563.25
2015-03-1010014.8014.6514.01101213.510.342.383.46
2016-02-0210014.7114.5514.0542175.520.463.271.44
2015-09-1610014.6514.6413.4079799.991.329.912.73
2016-03-0110014.6014.4014.0229640.230.382.711.01
2015-09-2110014.5714.5513.7360466.030.543.852.07
2015-03-1310014.5014.4714.0861342.220.362.552.10
2015-03-0610014.4814.2813.13179831.721.128.516.16
2016-01-2910014.2113.9713.3038279.970.402.951.31
2015-09-0810013.9813.8512.6760627.791.098.542.08
2015-03-0510013.4513.1612.8793180.390.262.023.19
2015-09-0710013.3812.7712.6352490.040.372.981.80
2015-03-0210012.6712.5212.2096291.730.322.623.30
#逻辑运算函数
data.query("p_change>2 & low<15").head()
openhighcloselowvolumeprice_changep_changeturnover
2015-09-1110016.0716.0714.61114900.251.469.993.93
2015-09-2910016.0015.4714.8178247.200.382.522.68
2016-01-1410016.0015.9414.3968228.870.704.592.34
2015-03-2710015.8615.7714.90120352.130.845.634.12
2015-03-1710015.4415.1814.63158770.770.312.085.43
data[data["turnover"].isin([4.19,2.39])]#判断是否存在
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2710025.8824.1623.5395578.030.632.682.39
2017-07-2510024.2023.7022.64167489.480.672.914.19
2016-09-2810020.9820.8619.7195580.750.984.932.39
2015-04-0710017.9817.5416.50122471.850.885.284.19
#统计运算
data
openhighcloselowvolumeprice_changep_changeturnover
2015-06-1010036.3533.8532.23269033.120.511.539.21
2015-06-1222235.9835.2134.01159825.880.822.385.47
2017-10-3110035.2234.4432.20361660.882.387.429.05
2015-06-1510034.9931.6931.69199369.53-3.52-10.006.82
2015-06-1110034.9834.3932.51173075.730.541.595.92
...........................
2015-03-0510013.4513.1612.8793180.390.262.023.19
2015-09-0710013.3812.7712.6352490.040.372.981.80
2015-03-0310013.0612.7012.52139071.610.181.444.76
2015-03-0410012.9212.9012.6167075.440.201.572.30
2015-03-0210012.6712.5212.2096291.730.322.623.30

643 rows × 8 columns

data.describe()
openhighcloselowvolumeprice_changep_changeturnover
count643.000000643.000000643.000000643.000000643.000000643.000000643.000000643.000000
mean100.37947121.90051321.33626720.77183599905.5191140.0188020.1902802.936190
std6.7987784.0775783.9428063.79196873879.1193540.8984764.0796982.079375
min100.00000012.67000012.36000012.2000001158.120000-3.520000-10.0300000.040000
25%100.00000019.50000019.04500018.52500048533.210000-0.390000-1.8500001.360000
50%100.00000021.97000021.45000020.98000083175.9300000.0500000.2600002.500000
75%100.00000024.06500023.41500022.850000127580.0550000.4550002.3050003.915000
max222.00000036.35000035.21000034.010000501915.4100003.03000010.03000012.560000
data.max()
open               222.00
high                36.35
close               35.21
low                 34.01
volume          501915.41
price_change         3.03
p_change            10.03
turnover            12.56
dtype: float64
data.idxmax()
open            2015-06-12
high            2015-06-10
close           2015-06-12
low             2015-06-12
volume          2017-10-26
price_change    2015-06-09
p_change        2015-08-28
turnover        2017-10-26
dtype: object
#累计统计函数
data
openhighcloselowvolumeprice_changep_changeturnover
2015-06-1010036.3533.8532.23269033.120.511.539.21
2015-06-1222235.9835.2134.01159825.880.822.385.47
2017-10-3110035.2234.4432.20361660.882.387.429.05
2015-06-1510034.9931.6931.69199369.53-3.52-10.006.82
2015-06-1110034.9834.3932.51173075.730.541.595.92
...........................
2015-03-0510013.4513.1612.8793180.390.262.023.19
2015-09-0710013.3812.7712.6352490.040.372.981.80
2015-03-0310013.0612.7012.52139071.610.181.444.76
2015-03-0410012.9212.9012.6167075.440.201.572.30
2015-03-0210012.6712.5212.2096291.730.322.623.30

643 rows × 8 columns

data["p_change"].sort_index().cumsum()#累计和
2015-03-02      2.62
2015-03-03      4.06
2015-03-04      5.63
2015-03-05      7.65
2015-03-06     16.16
               ...  
2018-02-14    112.59
2018-02-22    114.23
2018-02-23    116.65
2018-02-26    119.67
2018-02-27    122.35
Name: p_change, Length: 643, dtype: float64
data["p_change"].sort_index().cumsum().plot()
<Axes: >

在这里插入图片描述

#自定义运算
data.apply(lambda x:x.max() - x.min())
open               122.00
high                23.68
close               22.85
low                 21.81
volume          500757.29
price_change         6.55
p_change            20.06
turnover            12.52
dtype: float64
data["open"].max()-data["open"].min()
122
#Pandas画图
data
openhighcloselowvolumeprice_changep_changeturnover
2015-06-1010036.3533.8532.23269033.120.511.539.21
2015-06-1222235.9835.2134.01159825.880.822.385.47
2017-10-3110035.2234.4432.20361660.882.387.429.05
2015-06-1510034.9931.6931.69199369.53-3.52-10.006.82
2015-06-1110034.9834.3932.51173075.730.541.595.92
...........................
2015-03-0510013.4513.1612.8793180.390.262.023.19
2015-09-0710013.3812.7712.6352490.040.372.981.80
2015-03-0310013.0612.7012.52139071.610.181.444.76
2015-03-0410012.9212.9012.6167075.440.201.572.30
2015-03-0210012.6712.5212.2096291.730.322.623.30

643 rows × 8 columns

data.plot(x="volume",y="turnover",kind="scatter")#kind默认为"line"折线图
<Axes: xlabel='volume', ylabel='turnover'>

在这里插入图片描述

Pandas文件操作

#读取和存储csv文件
pd.read_csv("./stock_day/stock_day.csv",usecols=["high","low","open","close"]).head()
openhighcloselow
2018-02-2723.5325.8824.1623.53
2018-02-2622.8023.7823.5322.80
2018-02-2322.8823.3722.8222.71
2018-02-2222.2522.7622.2822.02
2018-02-1421.4921.9921.9221.48
pd.read_csv("stock_day2.csv")#如果文件中为纯数据,则需要加入字段names参数
2018-02-2723.5325.8824.1623.53.195578.030.632.6822.94222.14222.87553782.6446738.6555576.112.39
02018-02-2622.8023.7823.5322.8060985.110.693.0222.40621.95522.94240827.5242736.3456007.501.53
12018-02-2322.8823.3722.8222.7152914.010.542.4221.93821.92923.02235119.5841871.9756372.851.32
22018-02-2222.2522.7622.2822.0236105.010.361.6421.44621.90923.13735397.5839904.7860149.600.90
32018-02-1421.4921.9921.9221.4823331.040.442.0521.36621.92323.25333590.2142935.7461716.110.58
42018-02-1321.4021.9021.4821.3130802.450.281.3221.34222.10323.38739694.6545518.1465161.680.77
................................................
6372015-03-0613.1714.4814.2813.13179831.721.128.5113.11213.11213.112115090.18115090.18115090.186.16
6382015-03-0512.8813.4513.1612.8793180.390.262.0212.82012.82012.82098904.7998904.7998904.793.19
6392015-03-0412.8012.9212.9012.6167075.440.201.5712.70712.70712.707100812.93100812.93100812.932.30
6402015-03-0312.5213.0612.7012.52139071.610.181.4412.61012.61012.610117681.67117681.67117681.674.76
6412015-03-0212.2512.6712.5212.2096291.730.322.6212.52012.52012.52096291.7396291.7396291.733.30

642 rows × 15 columns

pd.read_csv("stock_day2.csv",names=["open", "high", "close", "low", "volume", "price_change", "p_change", "ma5", "ma10", "ma20", "v_ma5", "v_ma10", "v_ma20", "turnover"])
openhighcloselowvolumeprice_changep_changema5ma10ma20v_ma5v_ma10v_ma20turnover
2018-02-2723.5325.8824.1623.5395578.030.632.6822.94222.14222.87553782.6446738.6555576.112.39
2018-02-2622.8023.7823.5322.8060985.110.693.0222.40621.95522.94240827.5242736.3456007.501.53
2018-02-2322.8823.3722.8222.7152914.010.542.4221.93821.92923.02235119.5841871.9756372.851.32
2018-02-2222.2522.7622.2822.0236105.010.361.6421.44621.90923.13735397.5839904.7860149.600.90
2018-02-1421.4921.9921.9221.4823331.040.442.0521.36621.92323.25333590.2142935.7461716.110.58
.............................................
2015-03-0613.1714.4814.2813.13179831.721.128.5113.11213.11213.112115090.18115090.18115090.186.16
2015-03-0512.8813.4513.1612.8793180.390.262.0212.82012.82012.82098904.7998904.7998904.793.19
2015-03-0412.8012.9212.9012.6167075.440.201.5712.70712.70712.707100812.93100812.93100812.932.30
2015-03-0312.5213.0612.7012.52139071.610.181.4412.61012.61012.610117681.67117681.67117681.674.76
2015-03-0212.2512.6712.5212.2096291.730.322.6212.52012.52012.52096291.7396291.7396291.733.30

643 rows × 14 columns

#存储csv文件 保存open列数据
data[:10].to_csv("test.csv",columns=["open"],index=False,mode='a',header=False)#不保存行索引,header不保存列索引

pd.read_csv("test.csv")

#hdf5文件

import pandas as pd
#HDF5文件的读取和存储 二进制文件
#读取和存储一般需要指定一个键(key),值即为要存储的DataFrame;一个HDF5文件可以存放多个键,相当于存入一个三维数组,每一个键对应一个二维的DataFrame
day_close=pd.read_hdf("./stock_data/day/day_close.h5")
day_close.to_hdf("test.h5",key="close")
pd.read_hdf("test.h5",key="close")
day_open = pd.read_hdf("./stock_data/day/day_open.h5")
day_open.to_hdf("test.h5",key="open")

#json文件的读取和存储(前后端交互时)
sa =pd.read_json("Sarcasm_Headlines_Dataset.json", orient="records", lines=True)#orient指定json数据的组织格式;lines指定是否按行读取(每行一个json对象),默认为False
sa
article_linkheadlineis_sarcastic
0https://www.huffingtonpost.com/entry/versace-b...former versace store clerk sues over secret 'b...0
1https://www.huffingtonpost.com/entry/roseanne-...the 'roseanne' revival catches up to our thorn...0
2https://local.theonion.com/mom-starting-to-fea...mom starting to fear son's web series closest ...1
3https://politics.theonion.com/boehner-just-wan...boehner just wants wife to listen, not come up...1
4https://www.huffingtonpost.com/entry/jk-rowlin...j.k. rowling wishes snape happy birthday in th...0
............
26704https://www.huffingtonpost.com/entry/american-...american politics in moral free-fall0
26705https://www.huffingtonpost.com/entry/americas-...america's best 20 hikes0
26706https://www.huffingtonpost.com/entry/reparatio...reparations and obama0
26707https://www.huffingtonpost.com/entry/israeli-b...israeli ban targeting boycott supporters raise...0
26708https://www.huffingtonpost.com/entry/gourmet-g...gourmet gifts for the foodie 20140

26709 rows × 3 columns

sa.to_json("test.json",orient="records",lines=True)

  • 41
    点赞
  • 49
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值