Pandas学习记录:DataFrame、索引操作、MultiIndex和Panel、Series、loc和iloc切片、排序、df算术逻辑统计运算、画图、文件操作CSV、HDF5、JSON读取保存

2 篇文章 0 订阅

例子

import pandas as pd
import numpy as np
stock_change = np.random.normal(0, 1, (10, 5))
stock_change
array([[ 0.46649014, -0.07051788, -0.32120409, -0.46387559,  0.1085417 ],
       [ 0.04736554, -0.024752  , -0.32766649, -0.85826713, -2.33166558],
       [ 2.03097432,  0.35601761,  0.01116652, -0.18235223,  0.05282132],
       [-0.18245019,  0.75408371,  1.05843986,  0.92652009, -0.19937657],
       [-0.9097629 ,  0.23308364, -0.37100984,  0.20365066, -1.21841146],
       [-1.1896191 ,  0.21687117,  0.9942734 ,  2.14533057, -0.33534244],
       [-1.61708713, -0.31115317, -1.30691698, -0.05595457,  1.97507016],
       [-1.18554297,  0.13411785, -0.19375951, -0.73538635, -1.00568663],
       [-1.2157007 ,  1.37445982, -0.82222654, -0.68234626, -1.19710046],
       [ 0.14670458, -0.25369802,  1.54054094, -0.56285278,  0.61586738]])

raw

stock_rise = pd.DataFrame(stock_change)# 这里传入ndarray;DataFrame也可接受dict、list等其他数据

stock_rise
01234
00.466490-0.070518-0.321204-0.4638760.108542
10.047366-0.024752-0.327666-0.858267-2.331666
22.0309740.3560180.011167-0.1823520.052821
3-0.1824500.7540841.0584400.926520-0.199377
4-0.9097630.233084-0.3710100.203651-1.218411
5-1.1896190.2168710.9942732.145331-0.335342
6-1.617087-0.311153-1.306917-0.0559551.975070
7-1.1855430.134118-0.193760-0.735386-1.005687
8-1.2157011.374460-0.822227-0.682346-1.197100
90.146705-0.2536981.540541-0.5628530.615867

增加行索引

stock_rise.shape
(10, 5)
stock_code = ["股票{}".format(i + 1) for i in range(stock_rise.shape[0])]
stock_code
['股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9', '股票10']
pd.DataFrame(stock_change, index=stock_code)
01234
股票10.466490-0.070518-0.321204-0.4638760.108542
股票20.047366-0.024752-0.327666-0.858267-2.331666
股票32.0309740.3560180.011167-0.1823520.052821
股票4-0.1824500.7540841.0584400.926520-0.199377
股票5-0.9097630.233084-0.3710100.203651-1.218411
股票6-1.1896190.2168710.9942732.145331-0.335342
股票7-1.617087-0.311153-1.306917-0.0559551.975070
股票8-1.1855430.134118-0.193760-0.735386-1.005687
股票9-1.2157011.374460-0.822227-0.682346-1.197100
股票100.146705-0.2536981.540541-0.5628530.615867

增加列索引

date = pd.date_range(start="20220328", periods=stock_rise.shape[1], freq="B")# freq="B"时略过周末
date
DatetimeIndex(['2022-03-28', '2022-03-29', '2022-03-30', '2022-03-31',
               '2022-04-01'],
              dtype='datetime64[ns]', freq='B')
pd.DataFrame(stock_change, index=stock_code, columns=date)
2022-03-282022-03-292022-03-302022-03-312022-04-01
股票10.466490-0.070518-0.321204-0.4638760.108542
股票20.047366-0.024752-0.327666-0.858267-2.331666
股票32.0309740.3560180.011167-0.1823520.052821
股票4-0.1824500.7540841.0584400.926520-0.199377
股票5-0.9097630.233084-0.3710100.203651-1.218411
股票6-1.1896190.2168710.9942732.145331-0.335342
股票7-1.617087-0.311153-1.306917-0.0559551.975070
股票8-1.1855430.134118-0.193760-0.735386-1.005687
股票9-1.2157011.374460-0.822227-0.682346-1.197100
股票100.146705-0.2536981.540541-0.5628530.615867

DataFrame

stock_change = np.random.normal(0, 1, (10, 5))
stock_rise = pd.DataFrame(stock_change)# 这里传入ndarray;DataFrame也可接受dict、list等其他数据
stock_rise
01234
0-0.123263-0.9197600.2687320.3255450.422783
1-0.784039-1.241364-0.538109-0.5732241.736335
2-0.0023891.036499-0.585971-0.134203-1.111272
3-0.6546060.301471-0.656609-0.861359-0.711315
4-2.427007-1.5224550.629515-0.8647900.763393
50.850623-0.744593-0.1141720.8767930.008272
61.3277303.2348210.2059270.1860990.381091
7-0.477677-0.8670530.6912951.1321980.337238
80.6406250.0698140.568245-0.800257-0.163131
9-0.7613561.0277001.1343070.806318-1.089131

属性

# shape
stock_rise.shape
(10, 5)
# index
stock_rise.index
RangeIndex(start=0, stop=10, step=1)
# columns
stock_rise.columns
RangeIndex(start=0, stop=5, step=1)
# values(获取到其中的ndarray)
stock_rise.values
array([[-1.23262654e-01, -9.19760263e-01,  2.68731979e-01,
         3.25545108e-01,  4.22782972e-01],
       [-7.84038634e-01, -1.24136373e+00, -5.38109410e-01,
        -5.73224475e-01,  1.73633452e+00],
       [-2.38937989e-03,  1.03649857e+00, -5.85971165e-01,
        -1.34203156e-01, -1.11127187e+00],
       [-6.54606493e-01,  3.01470724e-01, -6.56608685e-01,
        -8.61358988e-01, -7.11314778e-01],
       [-2.42700694e+00, -1.52245541e+00,  6.29515371e-01,
        -8.64789913e-01,  7.63393103e-01],
       [ 8.50623001e-01, -7.44592685e-01, -1.14172355e-01,
         8.76793130e-01,  8.27195387e-03],
       [ 1.32773018e+00,  3.23482054e+00,  2.05926733e-01,
         1.86098804e-01,  3.81091182e-01],
       [-4.77676933e-01, -8.67053008e-01,  6.91295288e-01,
         1.13219805e+00,  3.37238380e-01],
       [ 6.40625259e-01,  6.98143091e-02,  5.68244924e-01,
        -8.00257378e-01, -1.63130614e-01],
       [-7.61355864e-01,  1.02770007e+00,  1.13430702e+00,
         8.06318437e-01, -1.08913089e+00]])
# T进行转置

stock_rise.T
0123456789
0-0.123263-0.784039-0.002389-0.654606-2.4270070.8506231.327730-0.4776770.640625-0.761356
1-0.919760-1.2413641.0364990.301471-1.522455-0.7445933.234821-0.8670530.0698141.027700
20.268732-0.538109-0.585971-0.6566090.629515-0.1141720.2059270.6912950.5682451.134307
30.325545-0.573224-0.134203-0.861359-0.8647900.8767930.1860991.132198-0.8002570.806318
40.4227831.736335-1.111272-0.7113150.7633930.0082720.3810910.337238-0.163131-1.089131

方法

# 显示前几行
stock_rise.head(3)# 默认为5
01234
0-0.123263-0.9197600.2687320.3255450.422783
1-0.784039-1.241364-0.538109-0.5732241.736335
2-0.0023891.036499-0.585971-0.134203-1.111272
# 显示后几行
stock_rise.tail(4)
01234
61.3277303.2348210.2059270.1860990.381091
7-0.477677-0.8670530.6912951.1321980.337238
80.6406250.0698140.568245-0.800257-0.163131
9-0.7613561.0277001.1343070.806318-1.089131

索引的修改

stock_rise
01234
0-0.123263-0.9197600.2687320.3255450.422783
1-0.784039-1.241364-0.538109-0.5732241.736335
2-0.0023891.036499-0.585971-0.134203-1.111272
3-0.6546060.301471-0.656609-0.861359-0.711315
4-2.427007-1.5224550.629515-0.8647900.763393
50.850623-0.744593-0.1141720.8767930.008272
61.3277303.2348210.2059270.1860990.381091
7-0.477677-0.8670530.6912951.1321980.337238
80.6406250.0698140.568245-0.800257-0.163131
9-0.7613561.0277001.1343070.806318-1.089131

直接修改

# 直接修改就行,注意要改就要全部替换不能只改索引中的一个值
stock_rise.index = ["股票{}".format(i + 1) for i in range(stock_change.shape[0])]

stock_rise
01234
股票1-0.123263-0.9197600.2687320.3255450.422783
股票2-0.784039-1.241364-0.538109-0.5732241.736335
股票3-0.0023891.036499-0.585971-0.134203-1.111272
股票4-0.6546060.301471-0.656609-0.861359-0.711315
股票5-2.427007-1.5224550.629515-0.8647900.763393
股票60.850623-0.744593-0.1141720.8767930.008272
股票71.3277303.2348210.2059270.1860990.381091
股票8-0.477677-0.8670530.6912951.1321980.337238
股票90.6406250.0698140.568245-0.800257-0.163131
股票10-0.7613561.0277001.1343070.806318-1.089131

重设索引

# 重设索引,不传参数时默认不删除原来的索引
stock_rise.reset_index()
index01234
0股票1-0.123263-0.9197600.2687320.3255450.422783
1股票2-0.784039-1.241364-0.538109-0.5732241.736335
2股票3-0.0023891.036499-0.585971-0.134203-1.111272
3股票4-0.6546060.301471-0.656609-0.861359-0.711315
4股票5-2.427007-1.5224550.629515-0.8647900.763393
5股票60.850623-0.744593-0.1141720.8767930.008272
6股票71.3277303.2348210.2059270.1860990.381091
7股票8-0.477677-0.8670530.6912951.1321980.337238
8股票90.6406250.0698140.568245-0.800257-0.163131
9股票10-0.7613561.0277001.1343070.806318-1.089131
# 删除原来的
stock_rise.reset_index(drop=True)
01234
0-0.123263-0.9197600.2687320.3255450.422783
1-0.784039-1.241364-0.538109-0.5732241.736335
2-0.0023891.036499-0.585971-0.134203-1.111272
3-0.6546060.301471-0.656609-0.861359-0.711315
4-2.427007-1.5224550.629515-0.8647900.763393
50.850623-0.744593-0.1141720.8767930.008272
61.3277303.2348210.2059270.1860990.381091
7-0.477677-0.8670530.6912951.1321980.337238
80.6406250.0698140.568245-0.800257-0.163131
9-0.7613561.0277001.1343070.806318-1.089131

以某列值作为索引

# 创建数据
df = pd.DataFrame({"month" : [1, 4, 7, 10],
                  "year" : [2012, 2014, 2018, 2022],
                  "sale" : [55, 40, 80, 31]})

df
monthyearsale
01201255
14201440
27201880
310202231
# 创建一个索引
df.set_index("month")
yearsale
month
1201255
4201440
7201880
10202231
# 创建多个索引

df.set_index(keys=["year", "month"])
sale
yearmonth
2012155
2014440
2018780
20221031

MultiIndex与Panel

MultiIndex

# 创建数据
df = pd.DataFrame({"month" : [1, 4, 7, 10],
                  "year" : [2012, 2014, 2018, 2022],
                  "sale" : [55, 40, 80, 31]})

# 创建多个索引

multdf = df.set_index(keys=["year", "month"])

multdf
sale
yearmonth
2012155
2014440
2018780
20221031
# 创建多个索引之后这个DataFrame就拥有了multindex
multdf.index
MultiIndex([(2012,  1),
            (2014,  4),
            (2018,  7),
            (2022, 10)],
           names=['year', 'month'])

index的属性

# index.names,索引名

multdf.index.names
FrozenList(['year', 'month'])
# index.levels 索引的值

multdf.index.levels
FrozenList([[2012, 2014, 2018, 2022], [1, 4, 7, 10]])

MultiIndex的创建

arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]

pd.MultiIndex.from_arrays(arrays, names=("number", "colors"))
MultiIndex([(1,  'red'),
            (1, 'blue'),
            (2,  'red'),
            (2, 'blue')],
           names=['number', 'colors'])

Panel(已被弃用)

  • 用于储存三维数组(被MultiIndex取代)
  • class pandas.Panel(data=None,items=None,major_axis=None,minor_axis=None)
  • 作用:存储3维数组的Panel结构
  • 参数:
  • data:ndarray或者dataframe
  • items:索引或类似数组的对象,axis=0
  • major_axis:索引或类似数组的对象,axis=1
  • minor_axis:索引或类似数组的对象,axis=2

Series

  • 带有标签的一维数组
import numpy as np
import pandas as pd

创建

# 通过字典

dic = {'a':1,'b':2,'c':3,'1':'hello','2':'python','3':[1,2]}

s = pd.Series(dic)

s
a         1
b         2
c         3
1     hello
2    python
3    [1, 2]
dtype: object
# 通过ndarray

s = pd.Series(np.random.rand(5), index=list("abcde"), name="test")

print(s, type(s))

# 重命名(不改变原来的Series)

s1 = s.rename("excel")

print(s1, type(s1))
a    0.750265
b    0.108221
c    0.656231
d    0.647292
e    0.063232
Name: test, dtype: float64 <class 'pandas.core.series.Series'>
a    0.750265
b    0.108221
c    0.656231
d    0.647292
e    0.063232
Name: excel, dtype: float64 <class 'pandas.core.series.Series'>
# 通过标量创建,必须指定index

s = pd.Series(6, index=range(6))

print(s, type(s))
0    6
1    6
2    6
3    6
4    6
5    6
dtype: int64 <class 'pandas.core.series.Series'>

索引

# 通过下标索引

s = pd.Series(np.random.rand(5))
print(s,'\n')
print(s[2],type(s[2]),s[2].dtype)
0    0.468480
1    0.899646
2    0.359836
3    0.289194
4    0.651470
dtype: float64 

0.3598363215742296 <class 'numpy.float64'> float64
# 通过标签索引

s = pd.Series(np.random.rand(5),index = list('abcde'))
print(s,'\n')
print(s['a'],'\n')

# 当有多个标签要索引的话,要多加个[ ]
print(s[['a','b']])
a    0.122132
b    0.742566
c    0.834115
d    0.602795
e    0.016637
dtype: float64 

0.12213187860080199 

a    0.122132
b    0.742566
dtype: float64
s1 = pd.Series(np.random.rand(5),list('abcde'))
print(s1,'\n')
print(s1['a':'b'],'\n')      #用index做索引的话是末端包含的
print(s1[1:2],'\n')          #用下标做切片索引的话和list切片是一样的,不包含末端 
a    0.156733
b    0.545197
c    0.291637
d    0.561090
e    0.645997
dtype: float64 

a    0.156733
b    0.545197
dtype: float64 

b    0.545197
dtype: float64 

其他操作

获取属性

s = pd.Series(np.random.rand(5),index = list('abcde'))

print(s.index)

print(s.values)
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
[0.86954724 0.55087078 0.66362467 0.29143158 0.56050401]

增加(第一,直接下标索引或index添加;第二,通过append()添加,生成新的Series)

s = pd.Series(np.random.rand(2))

print(s, type(s))

s[3]= 100           #用index增添
s['a'] = 200
print(s)
0    0.475706
1    0.998265
dtype: float64 <class 'pandas.core.series.Series'>
0      0.475706
1      0.998265
3    100.000000
a    200.000000
dtype: float64
s2 = pd.Series(np.random.rand(2),index = ['value1','value2'])
s3 = s.append(s2)        #用append()增添
print(s3)
0           0.475706
1           0.998265
3         100.000000
a         200.000000
value1      0.769521
value2      0.548249
dtype: float64


C:\Users\LR\AppData\Local\Temp\ipykernel_9552\2823273785.py:2: FutureWarning: The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  s3 = s.append(s2)        #用append()增添

删除(第一,用del删除;第二,用.drop()删除,会生成新的Series)

s = pd.Series(np.random.rand(5),index = list('abcde'))
del s['a']           #用del删除
print(s,'\n')
s1 = s.drop(['c','d'])           #用.drop()删除,删除多个要加[]
print(s1)

修改(通过索引直接修改)

s = pd.Series(np.random.rand(5),index = list('abcde'))
print(s,'\n')
s[1] = 100
print(s,'\n')
s[['c','d']] = 200
print(s)

索引操作

读取数据

data = pd.read_csv("data/stock_day.csv")

data
openhighcloselowvolumeprice_changep_changema5ma10ma20v_ma5v_ma10v_ma20turnover
2018-02-2723.5325.8824.1623.5395578.030.632.6822.94222.14222.87553782.6446738.6555576.112.39
2018-02-2622.8023.7823.5322.8060985.110.693.0222.40621.95522.94240827.5242736.3456007.501.53
2018-02-2322.8823.3722.8222.7152914.010.542.4221.93821.92923.02235119.5841871.9756372.851.32
2018-02-2222.2522.7622.2822.0236105.010.361.6421.44621.90923.13735397.5839904.7860149.600.90
2018-02-1421.4921.9921.9221.4823331.040.442.0521.36621.92323.25333590.2142935.7461716.110.58
.............................................
2015-03-0613.1714.4814.2813.13179831.721.128.5113.11213.11213.112115090.18115090.18115090.186.16
2015-03-0512.8813.4513.1612.8793180.390.262.0212.82012.82012.82098904.7998904.7998904.793.19
2015-03-0412.8012.9212.9012.6167075.440.201.5712.70712.70712.707100812.93100812.93100812.932.30
2015-03-0312.5213.0612.7012.52139071.610.181.4412.61012.61012.610117681.67117681.67117681.674.76
2015-03-0212.2512.6712.5212.2096291.730.322.6212.52012.52012.52096291.7396291.7396291.733.30

643 rows × 14 columns

# 删除一些数据列方便后续的操作

data = data.drop(["ma5", "ma10", "ma20", "v_ma5", "v_ma10", "v_ma20"], axis=1)

data
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2723.5325.8824.1623.5395578.030.632.682.39
2018-02-2622.8023.7823.5322.8060985.110.693.021.53
2018-02-2322.8823.3722.8222.7152914.010.542.421.32
2018-02-2222.2522.7622.2822.0236105.010.361.640.90
2018-02-1421.4921.9921.9221.4823331.040.442.050.58
...........................
2015-03-0613.1714.4814.2813.13179831.721.128.516.16
2015-03-0512.8813.4513.1612.8793180.390.262.023.19
2015-03-0412.8012.9212.9012.6167075.440.201.572.30
2015-03-0312.5213.0612.7012.52139071.610.181.444.76
2015-03-0212.2512.6712.5212.2096291.730.322.623.30

643 rows × 8 columns

直接使用行列索引

先列后行

# 必须先列后行

data["open"]["2018-02-27"]
23.53
# 切片只能切行(只能切一次)切两次是不对的:

data[:1, :2]
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

File F:\software\Python\Python39\lib\site-packages\pandas\core\indexes\base.py:3621, in Index.get_loc(self, key, method, tolerance)
   3620 try:
-> 3621     return self._engine.get_loc(casted_key)
   3622 except KeyError as err:


File F:\software\Python\Python39\lib\site-packages\pandas\_libs\index.pyx:136, in pandas._libs.index.IndexEngine.get_loc()


File F:\software\Python\Python39\lib\site-packages\pandas\_libs\index.pyx:142, in pandas._libs.index.IndexEngine.get_loc()


TypeError: '(slice(None, 1, None), slice(None, 2, None))' is an invalid key


During handling of the above exception, another exception occurred:


InvalidIndexError                         Traceback (most recent call last)

Input In [43], in <cell line: 3>()
      1 # 切片是不对的:
----> 3 data[:1, :2]


File F:\software\Python\Python39\lib\site-packages\pandas\core\frame.py:3505, in DataFrame.__getitem__(self, key)
   3503 if self.columns.nlevels > 1:
   3504     return self._getitem_multilevel(key)
-> 3505 indexer = self.columns.get_loc(key)
   3506 if is_integer(indexer):
   3507     indexer = [indexer]


File F:\software\Python\Python39\lib\site-packages\pandas\core\indexes\base.py:3628, in Index.get_loc(self, key, method, tolerance)
   3623         raise KeyError(key) from err
   3624     except TypeError:
   3625         # If we have a listlike key, _check_indexing_error will raise
   3626         #  InvalidIndexError. Otherwise we fall through and re-raise
   3627         #  the TypeError.
-> 3628         self._check_indexing_error(key)
   3629         raise
   3631 # GH#42269


File F:\software\Python\Python39\lib\site-packages\pandas\core\indexes\base.py:5637, in Index._check_indexing_error(self, key)
   5633 def _check_indexing_error(self, key):
   5634     if not is_scalar(key):
   5635         # if key is not a scalar, directly raise an error (the code below
   5636         # would convert to numpy arrays and raise later any way) - GH29926
-> 5637         raise InvalidIndexError(key)


InvalidIndexError: (slice(None, 1, None), slice(None, 2, None))

通过loc和iloc来进行切片

  • loc通过列名和行名来切片
  • iloc通过数字来进行切片
  • 都是先行后列
data
data.loc["2018-02-27" : "2018-02-23", "open":"high"] #使用index做索引末端包含
data.iloc[:3, :5]# 使用数字末端不包含

混合索引

获取第1到第4天[“open”, “close”, “high”, “low”]这四个指标的值

通过ix(已被弃用)

# 使用ix

#data.ix[0:4, ["open", "close", "high", "low"]]
# 通过loc

data.loc[data.index[0:4], ["open", "close", "high", "low"]]
# 通过iloc

data.iloc[0:4, data.columns.get_indexer(["open", "close", "high", "low"])]

赋值操作

# close列赋值为1

data["close"] = 1# 也可写成data.close = 1

data
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2723.5325.88123.5395578.030.632.682.39
2018-02-2622.8023.78122.8060985.110.693.021.53
2018-02-2322.8823.37122.7152914.010.542.421.32
2018-02-2222.2522.76122.0236105.010.361.640.90
2018-02-1421.4921.99121.4823331.040.442.050.58
...........................
2015-03-0613.1714.48113.13179831.721.128.516.16
2015-03-0512.8813.45112.8793180.390.262.023.19
2015-03-0412.8012.92112.6167075.440.201.572.30
2015-03-0312.5213.06112.52139071.610.181.444.76
2015-03-0212.2512.67112.2096291.730.322.623.30

643 rows × 8 columns

排序

对内容进行排序

# 根据一列
data = data.sort_values(by="p_change", ascending=True)# 默认升序,传False降序

data
openhighcloselowvolumeprice_changep_changeturnover
2015-09-0114.7814.78113.4678985.85-1.50-10.032.70
2015-09-1416.6216.64114.46139701.77-1.61-10.024.78
2016-01-1116.5917.24115.9058036.37-1.77-10.021.99
2015-07-1522.7823.04122.1085966.42-2.46-10.022.94
2015-08-2617.0018.35115.56130318.31-1.73-10.014.46
...........................
2016-12-2218.5020.42118.45150470.831.8610.023.77
2016-07-0718.6618.66118.4148756.551.7010.021.67
2015-05-2127.5028.22126.50121190.112.5710.024.15
2015-08-0416.2017.35115.8094292.631.5810.023.23
2015-08-2815.4016.46115.00117827.601.5010.034.03

643 rows × 8 columns

# 根据多列排序,先传谁谁的地位就高

data = data.sort_values(by=["open", "high"])

data
openhighcloselowvolumeprice_changep_changeturnover
2015-03-0212.2512.67112.2096291.730.322.623.30
2015-09-0212.3014.11112.3070201.74-1.10-8.172.40
2015-03-0312.5213.06112.52139071.610.181.444.76
2015-03-0412.8012.92112.6167075.440.201.572.30
2015-03-0512.8813.45112.8793180.390.262.023.19
...........................
2015-06-1133.1734.98132.51173075.730.541.595.92
2017-11-0133.8534.34133.10232325.30-0.61-1.775.81
2015-06-1034.1036.35132.23269033.120.511.539.21
2015-06-1234.6935.98134.01159825.880.822.385.47
2015-06-1534.9934.99131.69199369.53-3.52-10.006.82

643 rows × 8 columns

# 这里high的地位比open要高
data = data.sort_values(by=["high", "open"])

data
openhighcloselowvolumeprice_changep_changeturnover
2015-03-0212.2512.67112.2096291.730.322.623.30
2015-03-0412.8012.92112.6167075.440.201.572.30
2015-03-0312.5213.06112.52139071.610.181.444.76
2015-09-0712.9213.38112.6352490.040.372.981.80
2015-03-0512.8813.45112.8793180.390.262.023.19
...........................
2015-06-1133.1734.98132.51173075.730.541.595.92
2015-06-1534.9934.99131.69199369.53-3.52-10.006.82
2017-10-3132.6235.22132.20361660.882.387.429.05
2015-06-1234.6935.98134.01159825.880.822.385.47
2015-06-1034.1036.35132.23269033.120.511.539.21

643 rows × 8 columns

对索引进行排序

data
openhighcloselowvolumeprice_changep_changeturnover
2015-03-0212.2512.67112.2096291.730.322.623.30
2015-03-0412.8012.92112.6167075.440.201.572.30
2015-03-0312.5213.06112.52139071.610.181.444.76
2015-09-0712.9213.38112.6352490.040.372.981.80
2015-03-0512.8813.45112.8793180.390.262.023.19
...........................
2015-06-1133.1734.98132.51173075.730.541.595.92
2015-06-1534.9934.99131.69199369.53-3.52-10.006.82
2017-10-3132.6235.22132.20361660.882.387.429.05
2015-06-1234.6935.98134.01159825.880.822.385.47
2015-06-1034.1036.35132.23269033.120.511.539.21

643 rows × 8 columns

data.sort_index()# 按行索引的升序排列,也可将列排序,还可以传入一些复杂的参数
openhighcloselowvolumeprice_changep_changeturnover
2015-03-0212.2512.67112.2096291.730.322.623.30
2015-03-0312.5213.06112.52139071.610.181.444.76
2015-03-0412.8012.92112.6167075.440.201.572.30
2015-03-0512.8813.45112.8793180.390.262.023.19
2015-03-0613.1714.48113.13179831.721.128.516.16
...........................
2018-02-1421.4921.99121.4823331.040.442.050.58
2018-02-2222.2522.76122.0236105.010.361.640.90
2018-02-2322.8823.37122.7152914.010.542.421.32
2018-02-2622.8023.78122.8060985.110.693.021.53
2018-02-2723.5325.88123.5395578.030.632.682.39

643 rows × 8 columns

Series排序

Series排序时只有一列,不需要参数
# 取出一列就是Series
data.p_change.sort_values(ascending=True)
2015-09-01   -10.03
2016-01-11   -10.02
2015-07-15   -10.02
2015-09-14   -10.02
2015-08-26   -10.01
              ...  
2015-05-21    10.02
2015-08-04    10.02
2016-07-07    10.02
2016-12-22    10.02
2015-08-28    10.03
Name: p_change, Length: 643, dtype: float64
data.p_change.sort_index(ascending=False)
2018-02-27    2.68
2018-02-26    3.02
2018-02-23    2.42
2018-02-22    1.64
2018-02-14    2.05
              ... 
2015-03-06    8.51
2015-03-05    2.02
2015-03-04    1.57
2015-03-03    1.44
2015-03-02    2.62
Name: p_change, Length: 643, dtype: float64

DataFrame运算

算术运算

可以使用方法也可使用运算符号

add

import pandas as pd
data = pd.read_csv("data/stock_day.csv")
data = data.drop(["ma5", "ma10", "ma20", "v_ma5", "v_ma10", "v_ma20"], axis=1)
data["open"]
2018-02-27    23.53
2018-02-26    22.80
2018-02-23    22.88
2018-02-22    22.25
2018-02-14    21.49
              ...  
2015-03-06    13.17
2015-03-05    12.88
2015-03-04    12.80
2015-03-03    12.52
2015-03-02    12.25
Name: open, Length: 643, dtype: float64
 # open列都加一个2,也可以加另一个DataFrame
data["open"].add(2)
2018-02-27    25.53
2018-02-26    24.80
2018-02-23    24.88
2018-02-22    24.25
2018-02-14    23.49
              ...  
2015-03-06    15.17
2015-03-05    14.88
2015-03-04    14.80
2015-03-03    14.52
2015-03-02    14.25
Name: open, Length: 643, dtype: float64
#使用+也可以
data["open"] + 2
2018-02-27    25.53
2018-02-26    24.80
2018-02-23    24.88
2018-02-22    24.25
2018-02-14    23.49
              ...  
2015-03-06    15.17
2015-03-05    14.88
2015-03-04    14.80
2015-03-03    14.52
2015-03-02    14.25
Name: open, Length: 643, dtype: float64

sub

data
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2723.5325.8824.1623.5395578.030.632.682.39
2018-02-2622.8023.7823.5322.8060985.110.693.021.53
2018-02-2322.8823.3722.8222.7152914.010.542.421.32
2018-02-2222.2522.7622.2822.0236105.010.361.640.90
2018-02-1421.4921.9921.9221.4823331.040.442.050.58
...........................
2015-03-0613.1714.4814.2813.13179831.721.128.516.16
2015-03-0512.8813.4513.1612.8793180.390.262.023.19
2015-03-0412.8012.9212.9012.6167075.440.201.572.30
2015-03-0312.5213.0612.7012.52139071.610.181.444.76
2015-03-0212.2512.6712.5212.2096291.730.322.623.30

643 rows × 8 columns

close = data["close"]
open1 = data["open"]

data["m_price_change"] = close.sub(open1)

data
openhighcloselowvolumeprice_changep_changeturnoverm_price_change
2018-02-2723.5325.8824.1623.5395578.030.632.682.390.63
2018-02-2622.8023.7823.5322.8060985.110.693.021.530.73
2018-02-2322.8823.3722.8222.7152914.010.542.421.32-0.06
2018-02-2222.2522.7622.2822.0236105.010.361.640.900.03
2018-02-1421.4921.9921.9221.4823331.040.442.050.580.43
..............................
2015-03-0613.1714.4814.2813.13179831.721.128.516.161.11
2015-03-0512.8813.4513.1612.8793180.390.262.023.190.28
2015-03-0412.8012.9212.9012.6167075.440.201.572.300.10
2015-03-0312.5213.0612.7012.52139071.610.181.444.760.18
2015-03-0212.2512.6712.5212.2096291.730.322.623.300.27

643 rows × 9 columns

逻辑运算

符号<、>、|、&

例如筛选p_change > 2的日期数据

  • data[“p_change”] > 2 返回逻辑结果
data
openhighcloselowvolumeprice_changep_changeturnoverm_price_change
2018-02-2723.5325.8824.1623.5395578.030.632.682.390.63
2018-02-2622.8023.7823.5322.8060985.110.693.021.530.73
2018-02-2322.8823.3722.8222.7152914.010.542.421.32-0.06
2018-02-2222.2522.7622.2822.0236105.010.361.640.900.03
2018-02-1421.4921.9921.9221.4823331.040.442.050.580.43
..............................
2015-03-0613.1714.4814.2813.13179831.721.128.516.161.11
2015-03-0512.8813.4513.1612.8793180.390.262.023.190.28
2015-03-0412.8012.9212.9012.6167075.440.201.572.300.10
2015-03-0312.5213.0612.7012.52139071.610.181.444.760.18
2015-03-0212.2512.6712.5212.2096291.730.322.623.300.27

643 rows × 9 columns

data["p_change"] > 2
2018-02-27     True
2018-02-26     True
2018-02-23     True
2018-02-22    False
2018-02-14     True
              ...  
2015-03-06     True
2015-03-05     True
2015-03-04    False
2015-03-03    False
2015-03-02     True
Name: p_change, Length: 643, dtype: bool

逻辑判断的结果可以作为筛选的依据

data[data["p_change"] > 2]
openhighcloselowvolumeprice_changep_changeturnoverm_price_change
2018-02-2723.5325.8824.1623.5395578.030.632.682.390.63
2018-02-2622.8023.7823.5322.8060985.110.693.021.530.73
2018-02-2322.8823.3722.8222.7152914.010.542.421.32-0.06
2018-02-1421.4921.9921.9221.4823331.040.442.050.580.43
2018-02-1220.7021.4021.1920.6332445.390.824.030.810.49
..............................
2015-03-1314.1314.5014.4714.0861342.220.362.552.100.34
2015-03-1014.2014.8014.6514.01101213.510.342.383.460.45
2015-03-0613.1714.4814.2813.13179831.721.128.516.161.11
2015-03-0512.8813.4513.1612.8793180.390.262.023.190.28
2015-03-0212.2512.6712.5212.2096291.730.322.623.300.27

183 rows × 9 columns

也可以复合逻辑判断

data[(data["p_change"] > 2) & (data["p_change"] < 4)]
openhighcloselowvolumeprice_changep_changeturnoverm_price_change
2018-02-2723.5325.8824.1623.5395578.030.632.682.390.63
2018-02-2622.8023.7823.5322.8060985.110.693.021.530.73
2018-02-2322.8823.3722.8222.7152914.010.542.421.32-0.06
2018-02-1421.4921.9921.9221.4823331.040.442.050.580.43
2018-02-0522.4523.3923.2722.2552341.390.652.871.310.82
..............................
2015-03-1614.5215.0514.8714.5194468.300.402.763.230.35
2015-03-1314.1314.5014.4714.0861342.220.362.552.100.34
2015-03-1014.2014.8014.6514.01101213.510.342.383.460.45
2015-03-0512.8813.4513.1612.8793180.390.262.023.190.28
2015-03-0212.2512.6712.5212.2096291.730.322.623.300.27

88 rows × 9 columns

逻辑运算函数

query(表达式)
# 通过query使得刚才的过程更加方便
data.query("p_change > 2 & p_change < 4")
openhighcloselowvolumeprice_changep_changeturnoverm_price_change
2018-02-2723.5325.8824.1623.5395578.030.632.682.390.63
2018-02-2622.8023.7823.5322.8060985.110.693.021.530.73
2018-02-2322.8823.3722.8222.7152914.010.542.421.32-0.06
2018-02-1421.4921.9921.9221.4823331.040.442.050.580.43
2018-02-0522.4523.3923.2722.2552341.390.652.871.310.82
..............................
2015-03-1614.5215.0514.8714.5194468.300.402.763.230.35
2015-03-1314.1314.5014.4714.0861342.220.362.552.100.34
2015-03-1014.2014.8014.6514.01101213.510.342.383.460.45
2015-03-0512.8813.4513.1612.8793180.390.262.023.190.28
2015-03-0212.2512.6712.5212.2096291.730.322.623.300.27

88 rows × 9 columns

isin(values)
# 判断turnover是否为2.39或1.53
data[data["turnover"].isin([2.39, 1.53])]
openhighcloselowvolumeprice_changep_changeturnoverm_price_change
2018-02-2723.5325.8824.1623.5395578.030.632.682.390.63
2018-02-2622.8023.7823.5322.8060985.110.693.021.530.73
2016-09-2819.8820.9820.8619.7195580.750.984.932.390.98
2015-12-0221.3021.3720.8120.3044773.31-0.40-1.891.53-0.49

统计运算

describe

综合分析:能够直接得出很多统计结果count、mean、std、min、max

data.describe()
openhighcloselowvolumeprice_changep_changeturnoverm_price_change
count643.000000643.000000643.000000643.000000643.000000643.000000643.000000643.000000643.000000
mean21.27270621.90051321.33626720.77183599905.5191140.0188020.1902802.9361900.063561
std3.9309734.0775783.9428063.79196873879.1193540.8984764.0796982.0793750.800565
min12.25000012.67000012.36000012.2000001158.120000-3.520000-10.0300000.040000-3.300000
25%19.00000019.50000019.04500018.52500048533.210000-0.390000-1.8500001.360000-0.300000
50%21.44000021.97000021.45000020.98000083175.9300000.0500000.2600002.5000000.080000
75%23.40000024.06500023.41500022.850000127580.0550000.4550002.3050003.9150000.450000
max34.99000036.35000035.21000034.010000501915.4100003.03000010.03000012.5600003.410000

统计函数

统计函数

解释:
mode:众数、prod:乘积、var:方差

统计函数numpy中已经介绍了很多这里不做过多介绍只展示几个

# 默认按列
data.sum()
open                 13678.35
high                 14082.03
close                13719.22
low                  13356.29
volume            64239248.79
price_change            12.09
p_change               122.35
turnover              1887.97
m_price_change          40.87
dtype: float64
# 按行的情况
data.sum(axis=1)
2018-02-27     95681.46
2018-02-26     61083.99
2018-02-23     53010.01
2018-02-22     36197.25
2018-02-14     23421.42
                ...    
2015-03-06    179903.68
2015-03-05     93238.50
2015-03-04     67130.84
2015-03-03    139128.97
2015-03-02     96347.88
Length: 643, dtype: float64
data.idxmax()
open              2015-06-15
high              2015-06-10
close             2015-06-12
low               2015-06-12
volume            2017-10-26
price_change      2015-06-09
p_change          2015-08-28
turnover          2017-10-26
m_price_change    2015-07-10
dtype: object
data.idxmin()
open              2015-03-02
high              2015-03-02
close             2015-09-02
low               2015-03-02
volume            2016-07-06
price_change      2015-06-15
p_change          2015-09-01
turnover          2016-07-06
m_price_change    2015-06-15
dtype: object

累计统计函数(cumsum、cummax、cummin、cumprod)

累计统计函数

这里按照时间的先后顺序来进行统计

# 先排序
data = data.sort_index()
data
openhighcloselowvolumeprice_changep_changeturnoverm_price_change
2015-03-0212.2512.6712.5212.2096291.730.322.623.300.27
2015-03-0312.5213.0612.7012.52139071.610.181.444.760.18
2015-03-0412.8012.9212.9012.6167075.440.201.572.300.10
2015-03-0512.8813.4513.1612.8793180.390.262.023.190.28
2015-03-0613.1714.4814.2813.13179831.721.128.516.161.11
..............................
2018-02-1421.4921.9921.9221.4823331.040.442.050.580.43
2018-02-2222.2522.7622.2822.0236105.010.361.640.900.03
2018-02-2322.8823.3722.8222.7152914.010.542.421.32-0.06
2018-02-2622.8023.7823.5322.8060985.110.693.021.530.73
2018-02-2723.5325.8824.1623.5395578.030.632.682.390.63

643 rows × 9 columns

# 对p_change进行求和
stock_rise = data["p_change"]

stock_rise.cumsum()
2018-02-27      2.68
2018-02-26      5.70
2018-02-23      8.12
2018-02-22      9.76
2018-02-14     11.81
               ...  
2015-03-06    114.70
2015-03-05    116.72
2015-03-04    118.29
2015-03-03    119.73
2015-03-02    122.35
Name: p_change, Length: 643, dtype: float64

引入matplotlib可以绘图

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(20, 6), dpi=100)
# plot绘图,不要忘了ax=plt.gca()不然有可能改变不了图形的大小和像素点密集度
stock_rise.cumsum().plot(ax=plt.gca())
# 显示图形
plt.show()

在这里插入图片描述

自定义运算

data[["open", "close"]].apply(lambda x: x.max() - x.min())
open     22.74
close    22.85
dtype: float64

Pandas画图

pandas.DataFrame.plot

help(pd.DataFrame.plot)
Help on class PlotAccessor in module pandas.plotting._core:

class PlotAccessor(pandas.core.base.PandasObject)
 |  PlotAccessor(data)
 |  
 |  Make plots of Series or DataFrame.
 |  
 |  Uses the backend specified by the
 |  option ``plotting.backend``. By default, matplotlib is used.
 |  
 |  Parameters
 |  ----------
 |  data : Series or DataFrame
 |      The object for which the method is called.
 |  x : label or position, default None
 |      Only used if data is a DataFrame.
 |  y : label, position or list of label, positions, default None
 |      Allows plotting of one column versus another. Only used if data is a
 |      DataFrame.
 |  kind : str
 |      The kind of plot to produce:
 |  
 |      - 'line' : line plot (default)
 |      - 'bar' : vertical bar plot
 |      - 'barh' : horizontal bar plot
 |      - 'hist' : histogram
 |      - 'box' : boxplot
 |      - 'kde' : Kernel Density Estimation plot
 |      - 'density' : same as 'kde'
 |      - 'area' : area plot
 |      - 'pie' : pie plot
 |      - 'scatter' : scatter plot (DataFrame only)
 |      - 'hexbin' : hexbin plot (DataFrame only)
 |  ax : matplotlib axes object, default None
 |      An axes of the current figure.
 |  subplots : bool, default False
 |      Make separate subplots for each column.
 |  sharex : bool, default True if ax is None else False
 |      In case ``subplots=True``, share x axis and set some x axis labels
 |      to invisible; defaults to True if ax is None otherwise False if
 |      an ax is passed in; Be aware, that passing in both an ax and
 |      ``sharex=True`` will alter all x axis labels for all axis in a figure.
 |  sharey : bool, default False
 |      In case ``subplots=True``, share y axis and set some y axis labels to invisible.
 |  layout : tuple, optional
 |      (rows, columns) for the layout of subplots.
 |  figsize : a tuple (width, height) in inches
 |      Size of a figure object.
 |  use_index : bool, default True
 |      Use index as ticks for x axis.
 |  title : str or list
 |      Title to use for the plot. If a string is passed, print the string
 |      at the top of the figure. If a list is passed and `subplots` is
 |      True, print each item in the list above the corresponding subplot.
 |  grid : bool, default None (matlab style default)
 |      Axis grid lines.
 |  legend : bool or {'reverse'}
 |      Place legend on axis subplots.
 |  style : list or dict
 |      The matplotlib line style per column.
 |  logx : bool or 'sym', default False
 |      Use log scaling or symlog scaling on x axis.
 |      .. versionchanged:: 0.25.0
 |  
 |  logy : bool or 'sym' default False
 |      Use log scaling or symlog scaling on y axis.
 |      .. versionchanged:: 0.25.0
 |  
 |  loglog : bool or 'sym', default False
 |      Use log scaling or symlog scaling on both x and y axes.
 |      .. versionchanged:: 0.25.0
 |  
 |  xticks : sequence
 |      Values to use for the xticks.
 |  yticks : sequence
 |      Values to use for the yticks.
 |  xlim : 2-tuple/list
 |      Set the x limits of the current axes.
 |  ylim : 2-tuple/list
 |      Set the y limits of the current axes.
 |  xlabel : label, optional
 |      Name to use for the xlabel on x-axis. Default uses index name as xlabel, or the
 |      x-column name for planar plots.
 |  
 |      .. versionadded:: 1.1.0
 |  
 |      .. versionchanged:: 1.2.0
 |  
 |         Now applicable to planar plots (`scatter`, `hexbin`).
 |  
 |  ylabel : label, optional
 |      Name to use for the ylabel on y-axis. Default will show no ylabel, or the
 |      y-column name for planar plots.
 |  
 |      .. versionadded:: 1.1.0
 |  
 |      .. versionchanged:: 1.2.0
 |  
 |         Now applicable to planar plots (`scatter`, `hexbin`).
 |  
 |  rot : int, default None
 |      Rotation for ticks (xticks for vertical, yticks for horizontal
 |      plots).
 |  fontsize : int, default None
 |      Font size for xticks and yticks.
 |  colormap : str or matplotlib colormap object, default None
 |      Colormap to select colors from. If string, load colormap with that
 |      name from matplotlib.
 |  colorbar : bool, optional
 |      If True, plot colorbar (only relevant for 'scatter' and 'hexbin'
 |      plots).
 |  position : float
 |      Specify relative alignments for bar plot layout.
 |      From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5
 |      (center).
 |  table : bool, Series or DataFrame, default False
 |      If True, draw a table using the data in the DataFrame and the data
 |      will be transposed to meet matplotlib's default layout.
 |      If a Series or DataFrame is passed, use passed data to draw a
 |      table.
 |  yerr : DataFrame, Series, array-like, dict and str
 |      See :ref:`Plotting with Error Bars <visualization.errorbars>` for
 |      detail.
 |  xerr : DataFrame, Series, array-like, dict and str
 |      Equivalent to yerr.
 |  stacked : bool, default False in line and bar plots, and True in area plot
 |      If True, create stacked plot.
 |  sort_columns : bool, default False
 |      Sort column names to determine plot ordering.
 |  secondary_y : bool or sequence, default False
 |      Whether to plot on the secondary y-axis if a list/tuple, which
 |      columns to plot on secondary y-axis.
 |  mark_right : bool, default True
 |      When using a secondary_y axis, automatically mark the column
 |      labels with "(right)" in the legend.
 |  include_bool : bool, default is False
 |      If True, boolean values can be plotted.
 |  backend : str, default None
 |      Backend to use instead of the backend specified in the option
 |      ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
 |      specify the ``plotting.backend`` for the whole session, set
 |      ``pd.options.plotting.backend``.
 |  
 |      .. versionadded:: 1.0.0
 |  
 |  **kwargs
 |      Options to pass to matplotlib plotting method.
 |  
 |  Returns
 |  -------
 |  :class:`matplotlib.axes.Axes` or numpy.ndarray of them
 |      If the backend is not the default matplotlib one, the return value
 |      will be the object returned by the backend.
 |  
 |  Notes
 |  -----
 |  - See matplotlib documentation online for more on this subject
 |  - If `kind` = 'bar' or 'barh', you can specify relative alignments
 |    for bar plot layout by `position` keyword.
 |    From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5
 |    (center)
 |  
 |  Method resolution order:
 |      PlotAccessor
 |      pandas.core.base.PandasObject
 |      pandas.core.accessor.DirNamesMixin
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __call__(self, *args, **kwargs)
 |      Make plots of Series or DataFrame.
 |      
 |      Uses the backend specified by the
 |      option ``plotting.backend``. By default, matplotlib is used.
 |      
 |      Parameters
 |      ----------
 |      data : Series or DataFrame
 |          The object for which the method is called.
 |      x : label or position, default None
 |          Only used if data is a DataFrame.
 |      y : label, position or list of label, positions, default None
 |          Allows plotting of one column versus another. Only used if data is a
 |          DataFrame.
 |      kind : str
 |          The kind of plot to produce:
 |      
 |          - 'line' : line plot (default)
 |          - 'bar' : vertical bar plot
 |          - 'barh' : horizontal bar plot
 |          - 'hist' : histogram
 |          - 'box' : boxplot
 |          - 'kde' : Kernel Density Estimation plot
 |          - 'density' : same as 'kde'
 |          - 'area' : area plot
 |          - 'pie' : pie plot
 |          - 'scatter' : scatter plot (DataFrame only)
 |          - 'hexbin' : hexbin plot (DataFrame only)
 |      ax : matplotlib axes object, default None
 |          An axes of the current figure.
 |      subplots : bool, default False
 |          Make separate subplots for each column.
 |      sharex : bool, default True if ax is None else False
 |          In case ``subplots=True``, share x axis and set some x axis labels
 |          to invisible; defaults to True if ax is None otherwise False if
 |          an ax is passed in; Be aware, that passing in both an ax and
 |          ``sharex=True`` will alter all x axis labels for all axis in a figure.
 |      sharey : bool, default False
 |          In case ``subplots=True``, share y axis and set some y axis labels to invisible.
 |      layout : tuple, optional
 |          (rows, columns) for the layout of subplots.
 |      figsize : a tuple (width, height) in inches
 |          Size of a figure object.
 |      use_index : bool, default True
 |          Use index as ticks for x axis.
 |      title : str or list
 |          Title to use for the plot. If a string is passed, print the string
 |          at the top of the figure. If a list is passed and `subplots` is
 |          True, print each item in the list above the corresponding subplot.
 |      grid : bool, default None (matlab style default)
 |          Axis grid lines.
 |      legend : bool or {'reverse'}
 |          Place legend on axis subplots.
 |      style : list or dict
 |          The matplotlib line style per column.
 |      logx : bool or 'sym', default False
 |          Use log scaling or symlog scaling on x axis.
 |          .. versionchanged:: 0.25.0
 |      
 |      logy : bool or 'sym' default False
 |          Use log scaling or symlog scaling on y axis.
 |          .. versionchanged:: 0.25.0
 |      
 |      loglog : bool or 'sym', default False
 |          Use log scaling or symlog scaling on both x and y axes.
 |          .. versionchanged:: 0.25.0
 |      
 |      xticks : sequence
 |          Values to use for the xticks.
 |      yticks : sequence
 |          Values to use for the yticks.
 |      xlim : 2-tuple/list
 |          Set the x limits of the current axes.
 |      ylim : 2-tuple/list
 |          Set the y limits of the current axes.
 |      xlabel : label, optional
 |          Name to use for the xlabel on x-axis. Default uses index name as xlabel, or the
 |          x-column name for planar plots.
 |      
 |          .. versionadded:: 1.1.0
 |      
 |          .. versionchanged:: 1.2.0
 |      
 |             Now applicable to planar plots (`scatter`, `hexbin`).
 |      
 |      ylabel : label, optional
 |          Name to use for the ylabel on y-axis. Default will show no ylabel, or the
 |          y-column name for planar plots.
 |      
 |          .. versionadded:: 1.1.0
 |      
 |          .. versionchanged:: 1.2.0
 |      
 |             Now applicable to planar plots (`scatter`, `hexbin`).
 |      
 |      rot : int, default None
 |          Rotation for ticks (xticks for vertical, yticks for horizontal
 |          plots).
 |      fontsize : int, default None
 |          Font size for xticks and yticks.
 |      colormap : str or matplotlib colormap object, default None
 |          Colormap to select colors from. If string, load colormap with that
 |          name from matplotlib.
 |      colorbar : bool, optional
 |          If True, plot colorbar (only relevant for 'scatter' and 'hexbin'
 |          plots).
 |      position : float
 |          Specify relative alignments for bar plot layout.
 |          From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5
 |          (center).
 |      table : bool, Series or DataFrame, default False
 |          If True, draw a table using the data in the DataFrame and the data
 |          will be transposed to meet matplotlib's default layout.
 |          If a Series or DataFrame is passed, use passed data to draw a
 |          table.
 |      yerr : DataFrame, Series, array-like, dict and str
 |          See :ref:`Plotting with Error Bars <visualization.errorbars>` for
 |          detail.
 |      xerr : DataFrame, Series, array-like, dict and str
 |          Equivalent to yerr.
 |      stacked : bool, default False in line and bar plots, and True in area plot
 |          If True, create stacked plot.
 |      sort_columns : bool, default False
 |          Sort column names to determine plot ordering.
 |      secondary_y : bool or sequence, default False
 |          Whether to plot on the secondary y-axis if a list/tuple, which
 |          columns to plot on secondary y-axis.
 |      mark_right : bool, default True
 |          When using a secondary_y axis, automatically mark the column
 |          labels with "(right)" in the legend.
 |      include_bool : bool, default is False
 |          If True, boolean values can be plotted.
 |      backend : str, default None
 |          Backend to use instead of the backend specified in the option
 |          ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
 |          specify the ``plotting.backend`` for the whole session, set
 |          ``pd.options.plotting.backend``.
 |      
 |          .. versionadded:: 1.0.0
 |      
 |      **kwargs
 |          Options to pass to matplotlib plotting method.
 |      
 |      Returns
 |      -------
 |      :class:`matplotlib.axes.Axes` or numpy.ndarray of them
 |          If the backend is not the default matplotlib one, the return value
 |          will be the object returned by the backend.
 |      
 |      Notes
 |      -----
 |      - See matplotlib documentation online for more on this subject
 |      - If `kind` = 'bar' or 'barh', you can specify relative alignments
 |        for bar plot layout by `position` keyword.
 |        From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5
 |        (center)
 |  
 |  __init__(self, data)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  area(self, x=None, y=None, **kwargs)
 |      Draw a stacked area plot.
 |      
 |      An area plot displays quantitative data visually.
 |      This function wraps the matplotlib area function.
 |      
 |      Parameters
 |      ----------
 |      x : label or position, optional
 |          Coordinates for the X axis. By default uses the index.
 |      y : label or position, optional
 |          Column to plot. By default uses all columns.
 |      stacked : bool, default True
 |          Area plots are stacked by default. Set to False to create a
 |          unstacked plot.
 |      **kwargs
 |          Additional keyword arguments are documented in
 |          :meth:`DataFrame.plot`.
 |      
 |      Returns
 |      -------
 |      matplotlib.axes.Axes or numpy.ndarray
 |          Area plot, or array of area plots if subplots is True.
 |      
 |      See Also
 |      --------
 |      DataFrame.plot : Make plots of DataFrame using matplotlib / pylab.
 |      
 |      Examples
 |      --------
 |      Draw an area plot based on basic business metrics:
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> df = pd.DataFrame({
 |          ...     'sales': [3, 2, 3, 9, 10, 6],
 |          ...     'signups': [5, 5, 6, 12, 14, 13],
 |          ...     'visits': [20, 42, 28, 62, 81, 50],
 |          ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01',
 |          ...                        freq='M'))
 |          >>> ax = df.plot.area()
 |      
 |      Area plots are stacked by default. To produce an unstacked plot,
 |      pass ``stacked=False``:
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> ax = df.plot.area(stacked=False)
 |      
 |      Draw an area plot for a single column:
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> ax = df.plot.area(y='sales')
 |      
 |      Draw with a different `x`:
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> df = pd.DataFrame({
 |          ...     'sales': [3, 2, 3],
 |          ...     'visits': [20, 42, 28],
 |          ...     'day': [1, 2, 3],
 |          ... })
 |          >>> ax = df.plot.area(x='day')
 |  
 |  bar(self, x=None, y=None, **kwargs)
 |      Vertical bar plot.
 |      
 |      A bar plot is a plot that presents categorical data with
 |      rectangular bars with lengths proportional to the values that they
 |      represent. A bar plot shows comparisons among discrete categories. One
 |      axis of the plot shows the specific categories being compared, and the
 |      other axis represents a measured value.
 |      
 |      Parameters
 |      ----------
 |      x : label or position, optional
 |          Allows plotting of one column versus another. If not specified,
 |          the index of the DataFrame is used.
 |      y : label or position, optional
 |          Allows plotting of one column versus another. If not specified,
 |          all numerical columns are used.
 |      color : str, array-like, or dict, optional
 |          The color for each of the DataFrame's columns. Possible values are:
 |      
 |          - A single color string referred to by name, RGB or RGBA code,
 |              for instance 'red' or '#a98d19'.
 |      
 |          - A sequence of color strings referred to by name, RGB or RGBA
 |              code, which will be used for each column recursively. For
 |              instance ['green','yellow'] each column's bar will be filled in
 |              green or yellow, alternatively. If there is only a single column to
 |              be plotted, then only the first color from the color list will be
 |              used.
 |      
 |          - A dict of the form {column name : color}, so that each column will be
 |              colored accordingly. For example, if your columns are called `a` and
 |              `b`, then passing {'a': 'green', 'b': 'red'} will color bars for
 |              column `a` in green and bars for column `b` in red.
 |      
 |          .. versionadded:: 1.1.0
 |      
 |      **kwargs
 |          Additional keyword arguments are documented in
 |          :meth:`DataFrame.plot`.
 |      
 |      Returns
 |      -------
 |      matplotlib.axes.Axes or np.ndarray of them
 |          An ndarray is returned with one :class:`matplotlib.axes.Axes`
 |          per column when ``subplots=True``.
 |      
 |              See Also
 |              --------
 |              DataFrame.plot.barh : Horizontal bar plot.
 |              DataFrame.plot : Make plots of a DataFrame.
 |              matplotlib.pyplot.bar : Make a bar plot with matplotlib.
 |      
 |              Examples
 |              --------
 |              Basic plot.
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  >>> df = pd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]})
 |                  >>> ax = df.plot.bar(x='lab', y='val', rot=0)
 |      
 |              Plot a whole dataframe to a bar plot. Each column is assigned a
 |              distinct color, and each row is nested in a group along the
 |              horizontal axis.
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
 |                  >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
 |                  >>> index = ['snail', 'pig', 'elephant',
 |                  ...          'rabbit', 'giraffe', 'coyote', 'horse']
 |                  >>> df = pd.DataFrame({'speed': speed,
 |                  ...                    'lifespan': lifespan}, index=index)
 |                  >>> ax = df.plot.bar(rot=0)
 |      
 |              Plot stacked bar charts for the DataFrame
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  >>> ax = df.plot.bar(stacked=True)
 |      
 |              Instead of nesting, the figure can be split by column with
 |              ``subplots=True``. In this case, a :class:`numpy.ndarray` of
 |              :class:`matplotlib.axes.Axes` are returned.
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  >>> axes = df.plot.bar(rot=0, subplots=True)
 |                  >>> axes[1].legend(loc=2)  # doctest: +SKIP
 |      
 |              If you don't like the default colours, you can specify how you'd
 |              like each column to be colored.
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  >>> axes = df.plot.bar(
 |                  ...     rot=0, subplots=True, color={"speed": "red", "lifespan": "green"}
 |                  ... )
 |                  >>> axes[1].legend(loc=2)  # doctest: +SKIP
 |      
 |              Plot a single column.
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  >>> ax = df.plot.bar(y='speed', rot=0)
 |      
 |              Plot only selected categories for the DataFrame.
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  >>> ax = df.plot.bar(x='lifespan', rot=0)
 |  
 |  barh(self, x=None, y=None, **kwargs)
 |      Make a horizontal bar plot.
 |      
 |      A horizontal bar plot is a plot that presents quantitative data with
 |      rectangular bars with lengths proportional to the values that they
 |      represent. A bar plot shows comparisons among discrete categories. One
 |      axis of the plot shows the specific categories being compared, and the
 |      other axis represents a measured value.
 |      
 |      Parameters
 |      ----------
 |      x : label or position, optional
 |          Allows plotting of one column versus another. If not specified,
 |          the index of the DataFrame is used.
 |      y : label or position, optional
 |          Allows plotting of one column versus another. If not specified,
 |          all numerical columns are used.
 |      color : str, array-like, or dict, optional
 |          The color for each of the DataFrame's columns. Possible values are:
 |      
 |          - A single color string referred to by name, RGB or RGBA code,
 |              for instance 'red' or '#a98d19'.
 |      
 |          - A sequence of color strings referred to by name, RGB or RGBA
 |              code, which will be used for each column recursively. For
 |              instance ['green','yellow'] each column's bar will be filled in
 |              green or yellow, alternatively. If there is only a single column to
 |              be plotted, then only the first color from the color list will be
 |              used.
 |      
 |          - A dict of the form {column name : color}, so that each column will be
 |              colored accordingly. For example, if your columns are called `a` and
 |              `b`, then passing {'a': 'green', 'b': 'red'} will color bars for
 |              column `a` in green and bars for column `b` in red.
 |      
 |          .. versionadded:: 1.1.0
 |      
 |      **kwargs
 |          Additional keyword arguments are documented in
 |          :meth:`DataFrame.plot`.
 |      
 |      Returns
 |      -------
 |      matplotlib.axes.Axes or np.ndarray of them
 |          An ndarray is returned with one :class:`matplotlib.axes.Axes`
 |          per column when ``subplots=True``.
 |      
 |              See Also
 |              --------
 |              DataFrame.plot.bar: Vertical bar plot.
 |              DataFrame.plot : Make plots of DataFrame using matplotlib.
 |              matplotlib.axes.Axes.bar : Plot a vertical bar plot using matplotlib.
 |      
 |              Examples
 |              --------
 |              Basic example
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  >>> df = pd.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]})
 |                  >>> ax = df.plot.barh(x='lab', y='val')
 |      
 |              Plot a whole DataFrame to a horizontal bar plot
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
 |                  >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
 |                  >>> index = ['snail', 'pig', 'elephant',
 |                  ...          'rabbit', 'giraffe', 'coyote', 'horse']
 |                  >>> df = pd.DataFrame({'speed': speed,
 |                  ...                    'lifespan': lifespan}, index=index)
 |                  >>> ax = df.plot.barh()
 |      
 |              Plot stacked barh charts for the DataFrame
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  >>> ax = df.plot.barh(stacked=True)
 |      
 |              We can specify colors for each column
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  >>> ax = df.plot.barh(color={"speed": "red", "lifespan": "green"})
 |      
 |              Plot a column of the DataFrame to a horizontal bar plot
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
 |                  >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
 |                  >>> index = ['snail', 'pig', 'elephant',
 |                  ...          'rabbit', 'giraffe', 'coyote', 'horse']
 |                  >>> df = pd.DataFrame({'speed': speed,
 |                  ...                    'lifespan': lifespan}, index=index)
 |                  >>> ax = df.plot.barh(y='speed')
 |      
 |              Plot DataFrame versus the desired column
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
 |                  >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
 |                  >>> index = ['snail', 'pig', 'elephant',
 |                  ...          'rabbit', 'giraffe', 'coyote', 'horse']
 |                  >>> df = pd.DataFrame({'speed': speed,
 |                  ...                    'lifespan': lifespan}, index=index)
 |                  >>> ax = df.plot.barh(x='lifespan')
 |  
 |  box(self, by=None, **kwargs)
 |      Make a box plot of the DataFrame columns.
 |      
 |      A box plot is a method for graphically depicting groups of numerical
 |      data through their quartiles.
 |      The box extends from the Q1 to Q3 quartile values of the data,
 |      with a line at the median (Q2). The whiskers extend from the edges
 |      of box to show the range of the data. The position of the whiskers
 |      is set by default to 1.5*IQR (IQR = Q3 - Q1) from the edges of the
 |      box. Outlier points are those past the end of the whiskers.
 |      
 |      For further details see Wikipedia's
 |      entry for `boxplot <https://en.wikipedia.org/wiki/Box_plot>`__.
 |      
 |      A consideration when using this chart is that the box and the whiskers
 |      can overlap, which is very common when plotting small sets of data.
 |      
 |      Parameters
 |      ----------
 |      by : str or sequence
 |          Column in the DataFrame to group by.
 |      
 |          .. versionchanged:: 1.4.0
 |      
 |             Previously, `by` is silently ignore and makes no groupings
 |      
 |      **kwargs
 |          Additional keywords are documented in
 |          :meth:`DataFrame.plot`.
 |      
 |      Returns
 |      -------
 |      :class:`matplotlib.axes.Axes` or numpy.ndarray of them
 |      
 |      See Also
 |      --------
 |      DataFrame.boxplot: Another method to draw a box plot.
 |      Series.plot.box: Draw a box plot from a Series object.
 |      matplotlib.pyplot.boxplot: Draw a box plot in matplotlib.
 |      
 |      Examples
 |      --------
 |      Draw a box plot from a DataFrame with four columns of randomly
 |      generated data.
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> data = np.random.randn(25, 4)
 |          >>> df = pd.DataFrame(data, columns=list('ABCD'))
 |          >>> ax = df.plot.box()
 |      
 |      You can also generate groupings if you specify the `by` parameter (which
 |      can take a column name, or a list or tuple of column names):
 |      
 |      .. versionchanged:: 1.4.0
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85]
 |          >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list})
 |          >>> ax = df.plot.box(column="age", by="gender", figsize=(10, 8))
 |  
 |  density = kde(self, bw_method=None, ind=None, **kwargs)
 |  
 |  hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None, **kwargs)
 |      Generate a hexagonal binning plot.
 |      
 |      Generate a hexagonal binning plot of `x` versus `y`. If `C` is `None`
 |      (the default), this is a histogram of the number of occurrences
 |      of the observations at ``(x[i], y[i])``.
 |      
 |      If `C` is specified, specifies values at given coordinates
 |      ``(x[i], y[i])``. These values are accumulated for each hexagonal
 |      bin and then reduced according to `reduce_C_function`,
 |      having as default the NumPy's mean function (:meth:`numpy.mean`).
 |      (If `C` is specified, it must also be a 1-D sequence
 |      of the same length as `x` and `y`, or a column label.)
 |      
 |      Parameters
 |      ----------
 |      x : int or str
 |          The column label or position for x points.
 |      y : int or str
 |          The column label or position for y points.
 |      C : int or str, optional
 |          The column label or position for the value of `(x, y)` point.
 |      reduce_C_function : callable, default `np.mean`
 |          Function of one argument that reduces all the values in a bin to
 |          a single number (e.g. `np.mean`, `np.max`, `np.sum`, `np.std`).
 |      gridsize : int or tuple of (int, int), default 100
 |          The number of hexagons in the x-direction.
 |          The corresponding number of hexagons in the y-direction is
 |          chosen in a way that the hexagons are approximately regular.
 |          Alternatively, gridsize can be a tuple with two elements
 |          specifying the number of hexagons in the x-direction and the
 |          y-direction.
 |      **kwargs
 |          Additional keyword arguments are documented in
 |          :meth:`DataFrame.plot`.
 |      
 |      Returns
 |      -------
 |      matplotlib.AxesSubplot
 |          The matplotlib ``Axes`` on which the hexbin is plotted.
 |      
 |      See Also
 |      --------
 |      DataFrame.plot : Make plots of a DataFrame.
 |      matplotlib.pyplot.hexbin : Hexagonal binning plot using matplotlib,
 |          the matplotlib function that is used under the hood.
 |      
 |      Examples
 |      --------
 |      The following examples are generated with random data from
 |      a normal distribution.
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> n = 10000
 |          >>> df = pd.DataFrame({'x': np.random.randn(n),
 |          ...                    'y': np.random.randn(n)})
 |          >>> ax = df.plot.hexbin(x='x', y='y', gridsize=20)
 |      
 |      The next example uses `C` and `np.sum` as `reduce_C_function`.
 |      Note that `'observations'` values ranges from 1 to 5 but the result
 |      plot shows values up to more than 25. This is because of the
 |      `reduce_C_function`.
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> n = 500
 |          >>> df = pd.DataFrame({
 |          ...     'coord_x': np.random.uniform(-3, 3, size=n),
 |          ...     'coord_y': np.random.uniform(30, 50, size=n),
 |          ...     'observations': np.random.randint(1,5, size=n)
 |          ...     })
 |          >>> ax = df.plot.hexbin(x='coord_x',
 |          ...                     y='coord_y',
 |          ...                     C='observations',
 |          ...                     reduce_C_function=np.sum,
 |          ...                     gridsize=10,
 |          ...                     cmap="viridis")
 |  
 |  hist(self, by=None, bins=10, **kwargs)
 |      Draw one histogram of the DataFrame's columns.
 |      
 |      A histogram is a representation of the distribution of data.
 |      This function groups the values of all given Series in the DataFrame
 |      into bins and draws all bins in one :class:`matplotlib.axes.Axes`.
 |      This is useful when the DataFrame's Series are in a similar scale.
 |      
 |      Parameters
 |      ----------
 |      by : str or sequence, optional
 |          Column in the DataFrame to group by.
 |      
 |          .. versionchanged:: 1.4.0
 |      
 |             Previously, `by` is silently ignore and makes no groupings
 |      
 |      bins : int, default 10
 |          Number of histogram bins to be used.
 |      **kwargs
 |          Additional keyword arguments are documented in
 |          :meth:`DataFrame.plot`.
 |      
 |      Returns
 |      -------
 |      class:`matplotlib.AxesSubplot`
 |          Return a histogram plot.
 |      
 |      See Also
 |      --------
 |      DataFrame.hist : Draw histograms per DataFrame's Series.
 |      Series.hist : Draw a histogram with Series' data.
 |      
 |      Examples
 |      --------
 |      When we roll a die 6000 times, we expect to get each value around 1000
 |      times. But when we roll two dice and sum the result, the distribution
 |      is going to be quite different. A histogram illustrates those
 |      distributions.
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> df = pd.DataFrame(
 |          ...     np.random.randint(1, 7, 6000),
 |          ...     columns = ['one'])
 |          >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000)
 |          >>> ax = df.plot.hist(bins=12, alpha=0.5)
 |      
 |      A grouped histogram can be generated by providing the parameter `by` (which
 |      can be a column name, or a list of column names):
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85]
 |          >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list})
 |          >>> ax = df.plot.hist(column=["age"], by="gender", figsize=(10, 8))
 |  
 |  kde(self, bw_method=None, ind=None, **kwargs)
 |      Generate Kernel Density Estimate plot using Gaussian kernels.
 |      
 |      In statistics, `kernel density estimation`_ (KDE) is a non-parametric
 |      way to estimate the probability density function (PDF) of a random
 |      variable. This function uses Gaussian kernels and includes automatic
 |      bandwidth determination.
 |      
 |      .. _kernel density estimation:
 |          https://en.wikipedia.org/wiki/Kernel_density_estimation
 |      
 |      Parameters
 |      ----------
 |      bw_method : str, scalar or callable, optional
 |          The method used to calculate the estimator bandwidth. This can be
 |          'scott', 'silverman', a scalar constant or a callable.
 |          If None (default), 'scott' is used.
 |          See :class:`scipy.stats.gaussian_kde` for more information.
 |      ind : NumPy array or int, optional
 |          Evaluation points for the estimated PDF. If None (default),
 |          1000 equally spaced points are used. If `ind` is a NumPy array, the
 |          KDE is evaluated at the points passed. If `ind` is an integer,
 |          `ind` number of equally spaced points are used.
 |      **kwargs
 |          Additional keyword arguments are documented in
 |          :meth:`DataFrame.plot`.
 |      
 |      Returns
 |      -------
 |      matplotlib.axes.Axes or numpy.ndarray of them
 |      
 |      See Also
 |      --------
 |      scipy.stats.gaussian_kde : Representation of a kernel-density
 |          estimate using Gaussian kernels. This is the function used
 |          internally to estimate the PDF.
 |      
 |      Examples
 |      --------
 |      Given a Series of points randomly sampled from an unknown
 |      distribution, estimate its PDF using KDE with automatic
 |      bandwidth determination and plot the results, evaluating them at
 |      1000 equally spaced points (default):
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> s = pd.Series([1, 2, 2.5, 3, 3.5, 4, 5])
 |          >>> ax = s.plot.kde()
 |      
 |      A scalar bandwidth can be specified. Using a small bandwidth value can
 |      lead to over-fitting, while using a large bandwidth value may result
 |      in under-fitting:
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> ax = s.plot.kde(bw_method=0.3)
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> ax = s.plot.kde(bw_method=3)
 |      
 |      Finally, the `ind` parameter determines the evaluation points for the
 |      plot of the estimated PDF:
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> ax = s.plot.kde(ind=[1, 2, 3, 4, 5])
 |      
 |      For DataFrame, it works in the same way:
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> df = pd.DataFrame({
 |          ...     'x': [1, 2, 2.5, 3, 3.5, 4, 5],
 |          ...     'y': [4, 4, 4.5, 5, 5.5, 6, 6],
 |          ... })
 |          >>> ax = df.plot.kde()
 |      
 |      A scalar bandwidth can be specified. Using a small bandwidth value can
 |      lead to over-fitting, while using a large bandwidth value may result
 |      in under-fitting:
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> ax = df.plot.kde(bw_method=0.3)
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> ax = df.plot.kde(bw_method=3)
 |      
 |      Finally, the `ind` parameter determines the evaluation points for the
 |      plot of the estimated PDF:
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6])
 |  
 |  line(self, x=None, y=None, **kwargs)
 |      Plot Series or DataFrame as lines.
 |      
 |      This function is useful to plot lines using DataFrame's values
 |      as coordinates.
 |      
 |      Parameters
 |      ----------
 |      x : label or position, optional
 |          Allows plotting of one column versus another. If not specified,
 |          the index of the DataFrame is used.
 |      y : label or position, optional
 |          Allows plotting of one column versus another. If not specified,
 |          all numerical columns are used.
 |      color : str, array-like, or dict, optional
 |          The color for each of the DataFrame's columns. Possible values are:
 |      
 |          - A single color string referred to by name, RGB or RGBA code,
 |              for instance 'red' or '#a98d19'.
 |      
 |          - A sequence of color strings referred to by name, RGB or RGBA
 |              code, which will be used for each column recursively. For
 |              instance ['green','yellow'] each column's line will be filled in
 |              green or yellow, alternatively. If there is only a single column to
 |              be plotted, then only the first color from the color list will be
 |              used.
 |      
 |          - A dict of the form {column name : color}, so that each column will be
 |              colored accordingly. For example, if your columns are called `a` and
 |              `b`, then passing {'a': 'green', 'b': 'red'} will color lines for
 |              column `a` in green and lines for column `b` in red.
 |      
 |          .. versionadded:: 1.1.0
 |      
 |      **kwargs
 |          Additional keyword arguments are documented in
 |          :meth:`DataFrame.plot`.
 |      
 |      Returns
 |      -------
 |      matplotlib.axes.Axes or np.ndarray of them
 |          An ndarray is returned with one :class:`matplotlib.axes.Axes`
 |          per column when ``subplots=True``.
 |      
 |              See Also
 |              --------
 |              matplotlib.pyplot.plot : Plot y versus x as lines and/or markers.
 |      
 |              Examples
 |              --------
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  >>> s = pd.Series([1, 3, 2])
 |                  >>> s.plot.line()
 |                  <AxesSubplot:ylabel='Density'>
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  The following example shows the populations for some animals
 |                  over the years.
 |      
 |                  >>> df = pd.DataFrame({
 |                  ...    'pig': [20, 18, 489, 675, 1776],
 |                  ...    'horse': [4, 25, 281, 600, 1900]
 |                  ...    }, index=[1990, 1997, 2003, 2009, 2014])
 |                  >>> lines = df.plot.line()
 |      
 |              .. plot::
 |                 :context: close-figs
 |      
 |                 An example with subplots, so an array of axes is returned.
 |      
 |                 >>> axes = df.plot.line(subplots=True)
 |                 >>> type(axes)
 |                 <class 'numpy.ndarray'>
 |      
 |              .. plot::
 |                 :context: close-figs
 |      
 |                 Let's repeat the same example, but specifying colors for
 |                 each column (in this case, for each animal).
 |      
 |                 >>> axes = df.plot.line(
 |                 ...     subplots=True, color={"pig": "pink", "horse": "#742802"}
 |                 ... )
 |      
 |              .. plot::
 |                  :context: close-figs
 |      
 |                  The following example shows the relationship between both
 |                  populations.
 |      
 |                  >>> lines = df.plot.line(x='pig', y='horse')
 |  
 |  pie(self, **kwargs)
 |      Generate a pie plot.
 |      
 |      A pie plot is a proportional representation of the numerical data in a
 |      column. This function wraps :meth:`matplotlib.pyplot.pie` for the
 |      specified column. If no column reference is passed and
 |      ``subplots=True`` a pie plot is drawn for each numerical column
 |      independently.
 |      
 |      Parameters
 |      ----------
 |      y : int or label, optional
 |          Label or position of the column to plot.
 |          If not provided, ``subplots=True`` argument must be passed.
 |      **kwargs
 |          Keyword arguments to pass on to :meth:`DataFrame.plot`.
 |      
 |      Returns
 |      -------
 |      matplotlib.axes.Axes or np.ndarray of them
 |          A NumPy array is returned when `subplots` is True.
 |      
 |      See Also
 |      --------
 |      Series.plot.pie : Generate a pie plot for a Series.
 |      DataFrame.plot : Make plots of a DataFrame.
 |      
 |      Examples
 |      --------
 |      In the example below we have a DataFrame with the information about
 |      planet's mass and radius. We pass the 'mass' column to the
 |      pie function to get a pie plot.
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> df = pd.DataFrame({'mass': [0.330, 4.87 , 5.97],
 |          ...                    'radius': [2439.7, 6051.8, 6378.1]},
 |          ...                   index=['Mercury', 'Venus', 'Earth'])
 |          >>> plot = df.plot.pie(y='mass', figsize=(5, 5))
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> plot = df.plot.pie(subplots=True, figsize=(11, 6))
 |  
 |  scatter(self, x, y, s=None, c=None, **kwargs)
 |      Create a scatter plot with varying marker point size and color.
 |      
 |      The coordinates of each point are defined by two dataframe columns and
 |      filled circles are used to represent each point. This kind of plot is
 |      useful to see complex correlations between two variables. Points could
 |      be for instance natural 2D coordinates like longitude and latitude in
 |      a map or, in general, any pair of metrics that can be plotted against
 |      each other.
 |      
 |      Parameters
 |      ----------
 |      x : int or str
 |          The column name or column position to be used as horizontal
 |          coordinates for each point.
 |      y : int or str
 |          The column name or column position to be used as vertical
 |          coordinates for each point.
 |      s : str, scalar or array-like, optional
 |          The size of each point. Possible values are:
 |      
 |          - A string with the name of the column to be used for marker's size.
 |      
 |          - A single scalar so all points have the same size.
 |      
 |          - A sequence of scalars, which will be used for each point's size
 |            recursively. For instance, when passing [2,14] all points size
 |            will be either 2 or 14, alternatively.
 |      
 |            .. versionchanged:: 1.1.0
 |      
 |      c : str, int or array-like, optional
 |          The color of each point. Possible values are:
 |      
 |          - A single color string referred to by name, RGB or RGBA code,
 |            for instance 'red' or '#a98d19'.
 |      
 |          - A sequence of color strings referred to by name, RGB or RGBA
 |            code, which will be used for each point's color recursively. For
 |            instance ['green','yellow'] all points will be filled in green or
 |            yellow, alternatively.
 |      
 |          - A column name or position whose values will be used to color the
 |            marker points according to a colormap.
 |      
 |      **kwargs
 |          Keyword arguments to pass on to :meth:`DataFrame.plot`.
 |      
 |      Returns
 |      -------
 |      :class:`matplotlib.axes.Axes` or numpy.ndarray of them
 |      
 |      See Also
 |      --------
 |      matplotlib.pyplot.scatter : Scatter plot using multiple input data
 |          formats.
 |      
 |      Examples
 |      --------
 |      Let's see how to draw a scatter plot using coordinates from the values
 |      in a DataFrame's columns.
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> df = pd.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1],
 |          ...                    [6.4, 3.2, 1], [5.9, 3.0, 2]],
 |          ...                   columns=['length', 'width', 'species'])
 |          >>> ax1 = df.plot.scatter(x='length',
 |          ...                       y='width',
 |          ...                       c='DarkBlue')
 |      
 |      And now with the color determined by a column as well.
 |      
 |      .. plot::
 |          :context: close-figs
 |      
 |          >>> ax2 = df.plot.scatter(x='length',
 |          ...                       y='width',
 |          ...                       c='species',
 |          ...                       colormap='viridis')
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from pandas.core.base.PandasObject:
 |  
 |  __repr__(self) -> 'str'
 |      Return a string representation for a particular object.
 |  
 |  __sizeof__(self) -> 'int'
 |      Generates the total memory usage for an object that returns
 |      either a value or Series of values
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes inherited from pandas.core.base.PandasObject:
 |  
 |  __annotations__ = {'_cache': 'dict[str, Any]'}
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from pandas.core.accessor.DirNamesMixin:
 |  
 |  __dir__(self) -> 'list[str]'
 |      Provide method name lookup and completion.
 |      
 |      Notes
 |      -----
 |      Only provide 'public' methods.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from pandas.core.accessor.DirNamesMixin:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)

可以画这么多的图形

  • The kind of plot to produce:
    - ‘line’ : line plot (default)
    - ‘bar’ : vertical bar plot
    - ‘barh’ : horizontal bar plot
    - ‘hist’ : histogram
    - ‘box’ : boxplot
    - ‘kde’ : Kernel Density Estimation plot
    - ‘density’ : same as ‘kde’
    - ‘area’ : area plot
    - ‘pie’ : pie plot
    - ‘scatter’ : scatter plot (DataFrame only)
    - ‘hexbin’ : hexbin plot (DataFrame only)

文件操作

import pandas as pd

在这里插入图片描述

CSV

参数:
filepath_or_buffer : str,pathlib。str, pathlib.Path, py._path.local.LocalPath or any object with a read() method (such as a file handle or StringIO)
可以是URL,可用URL类型包括:http, ftp, s3和文件。对于多文件正在准备中
本地文件读取实例:file://localhost/path/to/table.csv

sep : str, default ‘,’
指定分隔符。如果不指定参数,则会尝试使用逗号分隔。分隔符长于一个字符并且不是‘\s+’,将使用python的语法分析器。并且忽略数据中的逗号。正则表达式例子:‘\r\t’

delimiter : str, default None
定界符,备选分隔符(如果指定该参数,则sep参数失效)

delim_whitespace : boolean, default False.
指定空白字符(例如空格‘ ’或制表符‘\t’)是否作为分隔符使用,等效于设定sep=‘\s+’。如果这个参数设定为True,那么delimiter参数失效。
在新版本0.18.1支持

header : int or list of ints, default ‘infer’
指定行数用来作为列名,数据开始行数。如果文件中没有列名,则默认为0,否则设置为None。如果明确设定header=0 就会替换掉原来存在列名。header参数可以是一个list例如:[0,1,3],这个list表示将文件中的这些行作为列标题(意味着每一列有多个标题),介于中间的行将被忽略掉(例如本例中的2;本例中的数据1,2,4行将被作为多级标题出现,第3行数据将被丢弃,dataframe的数据从第5行开始。)。
注意:如果skip_blank_lines=True 那么header参数忽略注释行和空行,所以header=0表示第一行数据而不是文件的第一行。

names : array-like, default None
用于结果的列名列表,如果数据文件中没有列标题行,就需要执行header=None。默认列表中不能出现重复,除非设定参数mangle_dupe_cols=True。

index_col : int or sequence or False, default None
用作行索引的列编号或者列名,如果给定一个序列则有多个行索引。
如果文件不规则,行尾有分隔符,则可以设定index_col=False 来使得pandas不使用第一列作为行索引。

usecols : array-like, default None
返回一个数据子集,该列表中的值必须可以对应到文件中的位置(数字可以对应到指定的列)或者是字符串,对应文件中的列名。例如:usecols有效参数可能是 [0,1,2]或者是 [‘foo’, ‘bar’, ‘baz’]。使用这个参数可以加快加载速度并降低内存消耗。

as_recarray : boolean, default False
不赞成使用:该参数会在未来版本移除。请使用pd.read_csv(…).to_records()替代。
返回一个Numpy的recarray来替代DataFrame。如果该参数设定为True。将会优先squeeze参数使用。并且行索引将不再可用,索引列也将被忽略。

squeeze : boolean, default False
如果文件值包含一列,则返回一个Series

prefix : str, default None
在没有列标题时,给列添加前缀。例如:添加‘X’ 成为 X0, X1, …

mangle_dupe_cols : boolean, default True
重复的列,将‘X’…‘X’表示为‘X’、‘X.1’…‘X.N’。如果设定为False,遇到重名列时数据将被覆盖。

dtype : Type name or dict of column -> type, default None
每列数据的数据类型。例如 {‘a’: np.float64, ‘b’: np.int32}

engine : {‘c’, ‘python’}, optional
Parser engine to use. The C engine is faster while the python engine is currently more feature-complete.
使用的分析引擎。可以选择C或者是python。C引擎快但是Python引擎功能更加完备。

converters : dict, default None
列转换函数的字典。key可以是列名或者列的序号。

true_values : list, default None
Values to consider as True

false_values : list, default None
Values to consider as False

skipinitialspace : boolean, default False
忽略分隔符后的空白(默认为False,即不忽略).

skiprows : list-like or integer, default None
需要忽略的行数(从文件开始处算起),或需要跳过的行号列表(从0开始)。

skipfooter : int, default 0
从文件尾部开始忽略。 (c引擎不支持)

skip_footer : int, default 0
不推荐使用:建议使用skipfooter ,功能一样。

nrows : int, default None
需要读取的行数(从文件头开始算起)。

na_values : scalar, str, list-like, or dict, default None
一组额外识别为NA/NaN的值。如果传入dict,则可以为特定列指定空值。默认情况下 ‘1.#IND’, ‘1.#QNAN’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘nan’ 等会被解释为NaN。

keep_default_na : bool, default True
如果指定na_values参数,并且keep_default_na=False,那么默认的NaN将被覆盖,否则添加。

na_filter : boolean, default True
是否检查丢失值(空字符串或者是空值)。对于大文件来说数据集中没有空值,设定na_filter=False可以提升读取速度。

verbose : boolean, default False
是否打印各种解析器的输出信息,例如:“非数值列中缺失值的数量”等。

skip_blank_lines : boolean, default True
如果为True,则跳过空行;否则记为NaN。

parse_dates : boolean or list of ints or names or list of lists or dict, default False
boolean. True -> 解析索引
list of ints or names. e.g. If [1, 2, 3] -> 解析1,2,3列的值作为独立的日期列;
list of lists. e.g. If [[1, 3]] -> 合并1,3列作为一个日期列使用
dict, e.g. {‘foo’ : [1, 3]} -> 将1,3列合并,并给合并后的列起名为"foo"

infer_datetime_format : boolean, default False
如果设定为True并且parse_dates可用,那么pandas将尝试推断列中日期字符串的格式,如果可以推断,则切换到更快的解析方法。在某些情况下解析速度会快5~10倍。

keep_date_col : boolean, default False
如果连接多列解析日期,则保持参与连接的列。默认为False。

date_parser : function, default None
用于解析日期的函数,默认使用dateutil.parser.parser来做转换。Pandas尝试使用三种不同的方式解析,如果遇到问题则使用下一种方式。
1.使用一个或者多个arrays(由parse_dates指定)作为参数;
2.连接指定多列字符串作为一个列作为参数;
3.每行调用一次date_parser函数来解析一个或者多个字符串(由parse_dates指定)作为参数。

dayfirst : boolean, default False
DD/MM格式的日期类型

iterator : boolean, default False
返回一个TextFileReader 对象,以便逐块处理文件。

chunksize : int, default None
文件块的大小, See IO Tools docs for more informationon iterator and chunksize.

compression : {‘infer’, ‘gzip’, ‘bz2’, ‘zip’, ‘xz’, None}, default ‘infer’
直接使用磁盘上的压缩文件。如果使用infer参数,则按文件名后缀‘.gz’, ‘.bz2’, ‘.zip’或‘.xz’分别选用gzip, bz2, zip或xz解压,否则不解压。如果使用zip,那么ZIP包中必须只包含一个文件。设置为None则不解压。
新版本0.18.1版本支持zip和xz解压

thousands : str, default None
千分位分割符,如“,”或者“."

decimal : str, default ‘.’
字符中的小数点 (例如:欧洲数据使用’,‘).

float_precision : string, default None
Specifies which converter the C engine should use for floating-point values. The options are None for the ordinary converter, high for the high-precision converter, and round_trip for the round-trip converter.
指定C引擎对浮点数使用的转换器:None为普通转换器,high为高精度转换器,round_trip为往返(round-trip)转换器。

lineterminator : str (length 1), default None
行分割符,只在C解析器下使用。

quotechar : str (length 1), optional
引号,用作标识开始和解释的字符,引号内的分割符将被忽略。

quoting : int or csv.QUOTE_* instance, default 0
控制csv中的引号常量。可选 QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3)

doublequote : boolean, default True
双引号,当单引号已经被定义,并且quoting 参数不是QUOTE_NONE的时候,使用双引号表示引号内的元素作为一个元素使用。

escapechar : str (length 1), default None
当quoting 为QUOTE_NONE时,指定一个转义字符,使紧随其后的字符不被当作分隔符解析。

comment : str, default None
标识着多余的行不被解析。如果该字符出现在行首,这一行将被全部忽略。这个参数只能是一个字符,空行(就像skip_blank_lines=True)注释行被header和skiprows忽略一样。例如如果指定comment=‘#’ 解析‘#empty\na,b,c\n1,2,3’ 以header=0 那么返回结果将是以’a,b,c’作为header。

encoding : str, default None
指定字符集类型,通常指定为’utf-8’. List of Python standard encodings

dialect : str or csv.Dialect instance, default None
如果没有指定特定的语言,如果sep大于一个字符则忽略。具体查看csv.Dialect 文档

tupleize_cols : boolean, default False
Leave a list of tuples on columns as is (default is to convert to a Multi Index on the columns)

error_bad_lines : boolean, default True
如果一行包含太多的列,那么默认会报错而不返回DataFrame;如果设置成False,那么会将该行剔除(只能在C解析器下使用)。

warn_bad_lines : boolean, default True
如果error_bad_lines =False,并且warn_bad_lines =True 那么所有的“bad lines”将会被输出(只能在C解析器下使用)。

low_memory : boolean, default True
分块加载到内存,再低内存消耗中解析。但是可能出现类型混淆。确保类型不被混淆需要设置为False。或者使用dtype 参数指定类型。注意使用chunksize 或者iterator 参数分块读入会将整个文件读入到一个Dataframe,而忽略类型(只能在C解析器中有效)

buffer_lines : int, default None
不推荐使用,这个参数将会在未来版本移除,因为他的值在解析器中不推荐使用

compact_ints : boolean, default False
不推荐使用,这个参数将会在未来版本移除
如果设置compact_ints=True ,那么任何有整数类型构成的列将被按照最小的整数类型存储,是否有符号将取决于use_unsigned 参数

use_unsigned : boolean, default False
不推荐使用:这个参数将会在未来版本移除
如果整数列被压缩(i.e. compact_ints=True),指定被压缩的列是有符号还是无符号的。
memory_map : boolean, default False
如果使用的文件在内存内,那么直接map文件使用。使用这种方式可以避免文件再次进行IO操作。

# pd.read_csv(filepath_or_buffer, seq=",")
help(pd.read_csv)
Help on function read_csv in module pandas.io.parsers.readers:

read_csv(filepath_or_buffer: 'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]', sep=<no_default>, delimiter=None, header='infer', names=<no_default>, index_col=None, usecols=None, squeeze=None, prefix=<no_default>, mangle_dupe_cols=True, dtype: 'DtypeArg | None' = None, engine: 'CSVEngine | None' = None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=None, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, cache_dates=True, iterator=False, chunksize=None, compression: 'CompressionOptions' = 'infer', thousands=None, decimal: 'str' = '.', lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None, encoding_errors: 'str | None' = 'strict', dialect=None, error_bad_lines=None, warn_bad_lines=None, on_bad_lines=None, delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None, storage_options: 'StorageOptions' = None)
    Read a comma-separated values (csv) file into DataFrame.
    
    Also supports optionally iterating or breaking of the file
    into chunks.
    
    Additional help can be found in the online docs for
    `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
    
    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
        expected. A local file could be: file://localhost/path/to/table.csv.
    
        If you want to pass in a path object, pandas accepts any ``os.PathLike``.
    
        By file-like object, we refer to objects with a ``read()`` method, such as
        a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
    sep : str, default ','
        Delimiter to use. If sep is None, the C engine cannot automatically detect
        the separator, but the Python parsing engine can, meaning the latter will
        be used and automatically detect the separator by Python's builtin sniffer
        tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
        different from ``'\s+'`` will be interpreted as regular expressions and
        will also force the use of the Python parsing engine. Note that regex
        delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
    delimiter : str, default ``None``
        Alias for sep.
    header : int, list of int, None, default 'infer'
        Row number(s) to use as the column names, and the start of the
        data.  Default behavior is to infer the column names: if no names
        are passed the behavior is identical to ``header=0`` and column
        names are inferred from the first line of the file, if column
        names are passed explicitly then the behavior is identical to
        ``header=None``. Explicitly pass ``header=0`` to be able to
        replace existing names. The header can be a list of integers that
        specify row locations for a multi-index on the columns
        e.g. [0,1,3]. Intervening rows that are not specified will be
        skipped (e.g. 2 in this example is skipped). Note that this
        parameter ignores commented lines and empty lines if
        ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
        data rather than the first line of the file.
    names : array-like, optional
        List of column names to use. If the file contains a header row,
        then you should explicitly pass ``header=0`` to override the column names.
        Duplicates in this list are not allowed.
    index_col : int, str, sequence of int / str, or False, optional, default ``None``
      Column(s) to use as the row labels of the ``DataFrame``, either given as
      string name or column index. If a sequence of int / str is given, a
      MultiIndex is used.
    
      Note: ``index_col=False`` can be used to force pandas to *not* use the first
      column as the index, e.g. when you have a malformed file with delimiters at
      the end of each line.
    usecols : list-like or callable, optional
        Return a subset of the columns. If list-like, all elements must either
        be positional (i.e. integer indices into the document columns) or strings
        that correspond to column names provided either by the user in `names` or
        inferred from the document header row(s). If ``names`` are given, the document
        header row(s) are not taken into account. For example, a valid list-like
        `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
        Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
        To instantiate a DataFrame from ``data`` with element order preserved use
        ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
        in ``['foo', 'bar']`` order or
        ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
        for ``['bar', 'foo']`` order.
    
        If callable, the callable function will be evaluated against the column
        names, returning names where the callable function evaluates to True. An
        example of a valid callable argument would be ``lambda x: x.upper() in
        ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
        parsing time and lower memory usage.
    squeeze : bool, default False
        If the parsed data only contains one column then return a Series.
    
        .. deprecated:: 1.4.0
            Append ``.squeeze("columns")`` to the call to ``read_csv`` to squeeze
            the data.
    prefix : str, optional
        Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
    
        .. deprecated:: 1.4.0
           Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
    mangle_dupe_cols : bool, default True
        Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
        'X'...'X'. Passing in False will cause data to be overwritten if there
        are duplicate names in the columns.
    dtype : Type name or dict of column -> type, optional
        Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32,
        'c': 'Int64'}
        Use `str` or `object` together with suitable `na_values` settings
        to preserve and not interpret dtype.
        If converters are specified, they will be applied INSTEAD
        of dtype conversion.
    engine : {'c', 'python', 'pyarrow'}, optional
        Parser engine to use. The C and pyarrow engines are faster, while the python engine
        is currently more feature-complete. Multithreading is currently only supported by
        the pyarrow engine.
    
        .. versionadded:: 1.4.0
    
            The "pyarrow" engine was added as an *experimental* engine, and some features
            are unsupported, or may not work correctly, with this engine.
    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can either
        be integers or column labels.
    true_values : list, optional
        Values to consider as True.
    false_values : list, optional
        Values to consider as False.
    skipinitialspace : bool, default False
        Skip spaces after delimiter.
    skiprows : list-like, int or callable, optional
        Line numbers to skip (0-indexed) or number of lines to skip (int)
        at the start of the file.
    
        If callable, the callable function will be evaluated against the row
        indices, returning True if the row should be skipped and False otherwise.
        An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
    skipfooter : int, default 0
        Number of lines at bottom of file to skip (Unsupported with engine='c').
    nrows : int, optional
        Number of rows of file to read. Useful for reading pieces of large files.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN. If dict passed, specific
        per-column NA values.  By default the following values are interpreted as
        NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
        '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a',
        'nan', 'null'.
    keep_default_na : bool, default True
        Whether or not to include the default NaN values when parsing the data.
        Depending on whether `na_values` is passed in, the behavior is as follows:
    
        * If `keep_default_na` is True, and `na_values` are specified, `na_values`
          is appended to the default NaN values used for parsing.
        * If `keep_default_na` is True, and `na_values` are not specified, only
          the default NaN values are used for parsing.
        * If `keep_default_na` is False, and `na_values` are specified, only
          the NaN values specified `na_values` are used for parsing.
        * If `keep_default_na` is False, and `na_values` are not specified, no
          strings will be parsed as NaN.
    
        Note that if `na_filter` is passed in as False, the `keep_default_na` and
        `na_values` parameters will be ignored.
    na_filter : bool, default True
        Detect missing value markers (empty strings and the value of na_values). In
        data without any NAs, passing na_filter=False can improve the performance
        of reading a large file.
    verbose : bool, default False
        Indicate number of NA values placed in non-numeric columns.
    skip_blank_lines : bool, default True
        If True, skip over blank lines rather than interpreting as NaN values.
    parse_dates : bool or list of int or names or list of lists or dict, default False
        The behavior is as follows:
    
        * boolean. If True -> try parsing the index.
        * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
          each as a separate date column.
        * list of lists. e.g.  If [[1, 3]] -> combine columns 1 and 3 and parse as
          a single date column.
        * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
          result 'foo'
    
        If a column or index cannot be represented as an array of datetimes,
        say because of an unparsable value or a mixture of timezones, the column
        or index will be returned unaltered as an object data type. For
        non-standard datetime parsing, use ``pd.to_datetime`` after
        ``pd.read_csv``. To parse an index or column with a mixture of timezones,
        specify ``date_parser`` to be a partially-applied
        :func:`pandas.to_datetime` with ``utc=True``. See
        :ref:`io.csv.mixed_timezones` for more.
    
        Note: A fast-path exists for iso8601-formatted dates.
    infer_datetime_format : bool, default False
        If True and `parse_dates` is enabled, pandas will attempt to infer the
        format of the datetime strings in the columns, and if it can be inferred,
        switch to a faster method of parsing them. In some cases this can increase
        the parsing speed by 5-10x.
    keep_date_col : bool, default False
        If True and `parse_dates` specifies combining multiple columns then
        keep the original columns.
    date_parser : function, optional
        Function to use for converting a sequence of string columns to an array of
        datetime instances. The default uses ``dateutil.parser.parser`` to do the
        conversion. Pandas will try to call `date_parser` in three different ways,
        advancing to the next if an exception occurs: 1) Pass one or more arrays
        (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
        string values from the columns defined by `parse_dates` into a single array
        and pass that; and 3) call `date_parser` once for each row using one or
        more strings (corresponding to the columns defined by `parse_dates`) as
        arguments.
    dayfirst : bool, default False
        DD/MM format dates, international and European format.
    cache_dates : bool, default True
        If True, use a cache of unique, converted dates to apply the datetime
        conversion. May produce significant speed-up when parsing duplicate
        date strings, especially ones with timezone offsets.
    
        .. versionadded:: 0.25.0
    iterator : bool, default False
        Return TextFileReader object for iteration or getting chunks with
        ``get_chunk()``.
    
        .. versionchanged:: 1.2
    
           ``TextFileReader`` is a context manager.
    chunksize : int, optional
        Return TextFileReader object for iteration.
        See the `IO Tools docs
        <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
        for more information on ``iterator`` and ``chunksize``.
    
        .. versionchanged:: 1.2
    
           ``TextFileReader`` is a context manager.
    compression : str or dict, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer' and
        ``filepath_or_buffer`` is path-like, then detect compression from the
        following extensions: '.gz', '.bz2', '.zip', '.xz', or '.zst'
        (otherwise no compression). If using
        'zip', the ZIP file must contain only one data file to be read in. Set to
        ``None`` for no decompression. Can also be a dict with key ``'method'`` set
        to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other
        key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``,
        ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an
        example, the following could be passed for Zstandard decompression using a
        custom compression dictionary:
        ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
    
        .. versionchanged:: 1.4.0 Zstandard support.
    
    thousands : str, optional
        Thousands separator.
    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European data).
    lineterminator : str (length 1), optional
        Character to break file into lines. Only valid with C parser.
    quotechar : str (length 1), optional
        The character used to denote the start and end of a quoted item. Quoted
        items can include the delimiter and it will be ignored.
    quoting : int or csv.QUOTE_* instance, default 0
        Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
        QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
    doublequote : bool, default ``True``
       When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
       whether or not to interpret two consecutive quotechar elements INSIDE a
       field as a single ``quotechar`` element.
    escapechar : str (length 1), optional
        One-character string used to escape other characters.
    comment : str, optional
        Indicates remainder of line should not be parsed. If found at the beginning
        of a line, the line will be ignored altogether. This parameter must be a
        single character. Like empty lines (as long as ``skip_blank_lines=True``),
        fully commented lines are ignored by the parameter `header` but not by
        `skiprows`. For example, if ``comment='#'``, parsing
        ``#empty\na,b,c\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
        treated as the header.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
        standard encodings
        <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
    
        .. versionchanged:: 1.2
    
           When ``encoding`` is ``None``, ``errors="replace"`` is passed to
           ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
           This behavior was previously only the case for ``engine="python"``.
    
        .. versionchanged:: 1.3.0
    
           ``encoding_errors`` is a new argument. ``encoding`` has no longer an
           influence on how encoding errors are handled.
    
    encoding_errors : str, optional, default "strict"
        How encoding errors are treated. `List of possible values
        <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
    
        .. versionadded:: 1.3.0
    
    dialect : str or csv.Dialect, optional
        If provided, this parameter will override values (default or not) for the
        following parameters: `delimiter`, `doublequote`, `escapechar`,
        `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
        override values, a ParserWarning will be issued. See csv.Dialect
        documentation for more details.
    error_bad_lines : bool, optional, default ``None``
        Lines with too many fields (e.g. a csv line with too many commas) will by
        default cause an exception to be raised, and no DataFrame will be returned.
        If False, then these "bad lines" will be dropped from the DataFrame that is
        returned.
    
        .. deprecated:: 1.3.0
           The ``on_bad_lines`` parameter should be used instead to specify behavior upon
           encountering a bad line instead.
    warn_bad_lines : bool, optional, default ``None``
        If error_bad_lines is False, and warn_bad_lines is True, a warning for each
        "bad line" will be output.
    
        .. deprecated:: 1.3.0
           The ``on_bad_lines`` parameter should be used instead to specify behavior upon
           encountering a bad line instead.
    on_bad_lines : {'error', 'warn', 'skip'} or callable, default 'error'
        Specifies what to do upon encountering a bad line (a line with too many fields).
        Allowed values are:
    
            - 'error', raise an Exception when a bad line is encountered.
            - 'warn', raise a warning when a bad line is encountered and skip that line.
            - 'skip', skip bad lines without raising or warning when they are encountered.
    
        .. versionadded:: 1.3.0
    
            - callable, function with signature
              ``(bad_line: list[str]) -> list[str] | None`` that will process a single
              bad line. ``bad_line`` is a list of strings split by the ``sep``.
              If the function returns ``None``, the bad line will be ignored.
              If the function returns a new list of strings with more elements than
              expected, a ``ParserWarning`` will be emitted while dropping extra elements.
              Only supported when ``engine="python"``
    
        .. versionadded:: 1.4.0
    
    delim_whitespace : bool, default False
        Specifies whether or not whitespace (e.g. ``' '`` or ``'    '``) will be
        used as the sep. Equivalent to setting ``sep='\s+'``. If this option
        is set to True, nothing should be passed in for the ``delimiter``
        parameter.
    low_memory : bool, default True
        Internally process the file in chunks, resulting in lower memory use
        while parsing, but possibly mixed type inference.  To ensure no mixed
        types either set False, or specify the type with the `dtype` parameter.
        Note that the entire file is read into a single DataFrame regardless,
        use the `chunksize` or `iterator` parameter to return the data in chunks.
        (Only valid with C parser).
    memory_map : bool, default False
        If a filepath is provided for `filepath_or_buffer`, map the file object
        directly onto memory and access the data directly from there. Using this
        option can improve performance because there is no longer any I/O overhead.
    float_precision : str, optional
        Specifies which converter the C engine should use for floating-point
        values. The options are ``None`` or 'high' for the ordinary converter,
        'legacy' for the original lower precision pandas converter, and
        'round_trip' for the round-trip converter.
    
        .. versionchanged:: 1.2
    
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
        are forwarded to ``urllib`` as header options. For other URLs (e.g.
        starting with "s3://", and "gcs://") the key-value pairs are forwarded to
        ``fsspec``. Please see ``fsspec`` and ``urllib`` for more details.
    
        .. versionadded:: 1.2
    
    Returns
    -------
    DataFrame or TextParser
        A comma-separated values (csv) file is returned as a two-dimensional
        data structure with labeled axes.
    
    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.
    read_fwf : Read a table of fixed-width formatted lines into DataFrame.
    
    Examples
    --------
    >>> pd.read_csv('data.csv')  # doctest: +SKIP

读取

data = pd.read_csv("data/stock_day.csv", usecols=["open", "close"])

data
openclose
2018-02-2723.5324.16
2018-02-2622.8023.53
2018-02-2322.8822.82
2018-02-2222.2522.28
2018-02-1421.4921.92
.........
2015-03-0613.1714.28
2015-03-0512.8813.16
2015-03-0412.8012.90
2015-03-0312.5212.70
2015-03-0212.2512.52

643 rows × 2 columns

保存

在这里插入图片描述

# 保存open列前十行的值 若加上index=False则不保留行索引
data[:10].to_csv("save/stock_open.csv",  columns=["open"])

# 读出来查看结果
pd.read_csv("save/stock_open.csv")
Unnamed: 0open
02018-02-2723.53
12018-02-2622.80
22018-02-2322.88
32018-02-2222.25
42018-02-1421.49
52018-02-1321.40
62018-02-1220.70
72018-02-0921.20
82018-02-0821.79
92018-02-0722.69

HDF5

在这里插入图片描述

读取

# 先使用 HDFStore 打开文件,看一看里面有什么内容
store = pd.HDFStore("data/test1.h5")

# 看一看所有的key
store.keys()
['/building6/elec/meter1',
 '/building6/elec/meter10',
 '/building6/elec/meter11',
 '/building6/elec/meter12',
 '/building6/elec/meter13',
 '/building6/elec/meter14',
 '/building6/elec/meter15',
 '/building6/elec/meter16',
 '/building6/elec/meter17',
 '/building6/elec/meter2',
 '/building6/elec/meter3',
 '/building6/elec/meter4',
 '/building6/elec/meter5',
 '/building6/elec/meter6',
 '/building6/elec/meter7',
 '/building6/elec/meter8',
 '/building6/elec/meter9',
 '/building6/elec/cache/meter8/good_sections',
 '/building6/elec/cache/meter8/total_energy',
 '/building5/elec/meter1',
 '/building5/elec/meter10',
 '/building5/elec/meter11',
 '/building5/elec/meter12',
 '/building5/elec/meter13',
 '/building5/elec/meter14',
 '/building5/elec/meter15',
 '/building5/elec/meter16',
 '/building5/elec/meter17',
 '/building5/elec/meter18',
 '/building5/elec/meter19',
 '/building5/elec/meter2',
 '/building5/elec/meter20',
 '/building5/elec/meter21',
 '/building5/elec/meter22',
 '/building5/elec/meter23',
 '/building5/elec/meter24',
 '/building5/elec/meter25',
 '/building5/elec/meter26',
 '/building5/elec/meter3',
 '/building5/elec/meter4',
 '/building5/elec/meter5',
 '/building5/elec/meter6',
 '/building5/elec/meter7',
 '/building5/elec/meter8',
 '/building5/elec/meter9',
 '/building5/elec/cache/meter18/good_sections',
 '/building5/elec/cache/meter18/total_energy',
 '/building4/elec/meter1',
 '/building4/elec/meter10',
 '/building4/elec/meter11',
 '/building4/elec/meter12',
 '/building4/elec/meter13',
 '/building4/elec/meter14',
 '/building4/elec/meter15',
 '/building4/elec/meter16',
 '/building4/elec/meter17',
 '/building4/elec/meter18',
 '/building4/elec/meter19',
 '/building4/elec/meter2',
 '/building4/elec/meter20',
 '/building4/elec/meter3',
 '/building4/elec/meter4',
 '/building4/elec/meter5',
 '/building4/elec/meter6',
 '/building4/elec/meter7',
 '/building4/elec/meter8',
 '/building4/elec/meter9',
 '/building3/elec/meter1',
 '/building3/elec/meter10',
 '/building3/elec/meter11',
 '/building3/elec/meter12',
 '/building3/elec/meter13',
 '/building3/elec/meter14',
 '/building3/elec/meter15',
 '/building3/elec/meter16',
 '/building3/elec/meter17',
 '/building3/elec/meter18',
 '/building3/elec/meter19',
 '/building3/elec/meter2',
 '/building3/elec/meter20',
 '/building3/elec/meter21',
 '/building3/elec/meter22',
 '/building3/elec/meter3',
 '/building3/elec/meter4',
 '/building3/elec/meter5',
 '/building3/elec/meter6',
 '/building3/elec/meter7',
 '/building3/elec/meter8',
 '/building3/elec/meter9',
 '/building3/elec/cache/meter7/good_sections',
 '/building3/elec/cache/meter7/total_energy',
 '/building2/elec/meter1',
 '/building2/elec/meter10',
 '/building2/elec/meter11',
 '/building2/elec/meter2',
 '/building2/elec/meter3',
 '/building2/elec/meter4',
 '/building2/elec/meter5',
 '/building2/elec/meter6',
 '/building2/elec/meter7',
 '/building2/elec/meter8',
 '/building2/elec/meter9',
 '/building2/elec/cache/meter9/good_sections',
 '/building2/elec/cache/meter9/total_energy',
 '/building1/elec/meter1',
 '/building1/elec/meter10',
 '/building1/elec/meter11',
 '/building1/elec/meter12',
 '/building1/elec/meter13',
 '/building1/elec/meter14',
 '/building1/elec/meter15',
 '/building1/elec/meter16',
 '/building1/elec/meter17',
 '/building1/elec/meter18',
 '/building1/elec/meter19',
 '/building1/elec/meter2',
 '/building1/elec/meter20',
 '/building1/elec/meter3',
 '/building1/elec/meter4',
 '/building1/elec/meter5',
 '/building1/elec/meter6',
 '/building1/elec/meter7',
 '/building1/elec/meter8',
 '/building1/elec/meter9',
 '/building1/elec/cache/meter9/total_energy',
 '/building1/elec/cache/meter8/total_energy',
 '/building1/elec/cache/meter7/total_energy',
 '/building1/elec/cache/meter6/total_energy',
 '/building1/elec/cache/meter5/dropout_rate',
 '/building1/elec/cache/meter5/good_sections',
 '/building1/elec/cache/meter5/total_energy',
 '/building1/elec/cache/meter4/total_energy',
 '/building1/elec/cache/meter3/total_energy',
 '/building1/elec/cache/meter20/total_energy',
 '/building1/elec/cache/meter2/total_energy',
 '/building1/elec/cache/meter19/total_energy',
 '/building1/elec/cache/meter18/total_energy',
 '/building1/elec/cache/meter17/total_energy',
 '/building1/elec/cache/meter16/total_energy',
 '/building1/elec/cache/meter15/total_energy',
 '/building1/elec/cache/meter14/total_energy',
 '/building1/elec/cache/meter13/total_energy',
 '/building1/elec/cache/meter12/total_energy',
 '/building1/elec/cache/meter11/total_energy',
 '/building1/elec/cache/meter10/total_energy',
 '/building1/elec/cache/meter1/good_sections',
 '/building1/elec/cache/meter1/total_energy']
# 及时关闭养成好习惯
store.close()

# 若为False则说明已关闭
print(store.is_open)
False
# 利用获取到的键进行读取
data1 = pd.read_hdf("data/test1.h5", key="/building6/elec/meter1")
# 结果
data1
physical_quantitypower
typeapparent
2011-05-21 15:39:19-04:0047.000000
2011-05-21 15:39:21-04:0046.919998
2011-05-21 15:39:22-04:0047.020000
2011-05-21 15:39:24-04:0046.970001
2011-05-21 15:39:26-04:0046.779999
......
2011-06-14 01:31:37-04:0021.820000
2011-06-14 01:31:38-04:0021.850000
2011-06-14 01:31:40-04:0021.900000
2011-06-14 01:31:42-04:0021.799999
2011-06-14 01:31:44-04:0021.870001

887457 rows × 1 columns

保存

data1.to_hdf("save/test.h5", key="test")
data2 = pd.read_hdf("save/test.h5")

data2
physical_quantitypower
typeapparent
2011-05-21 15:39:19-04:0047.000000
2011-05-21 15:39:21-04:0046.919998
2011-05-21 15:39:22-04:0047.020000
2011-05-21 15:39:24-04:0046.970001
2011-05-21 15:39:26-04:0046.779999
......
2011-06-14 01:31:37-04:0021.820000
2011-06-14 01:31:38-04:0021.850000
2011-06-14 01:31:40-04:0021.900000
2011-06-14 01:31:42-04:0021.799999
2011-06-14 01:31:44-04:0021.870001

887457 rows × 1 columns

JSON

在这里插入图片描述

读取

json_read = pd.read_json("data/UI.json")
json_read
widget
debugon
image{'src': 'Images/Sun.png', 'name': 'sun1', 'hOf...
text{'data': 'Click Here', 'size': 36, 'style': 'b...
window{'title': 'Sample Konfabulator Widget', 'name'...
json_read = pd.read_json("data/UI.json", orient="records")
json_read
widget
debugon
image{'src': 'Images/Sun.png', 'name': 'sun1', 'hOf...
text{'data': 'Click Here', 'size': 36, 'style': 'b...
window{'title': 'Sample Konfabulator Widget', 'name'...
json_read = pd.read_json("data/UI.json", orient="index")
json_read
debugwindowimagetext
widgeton{'title': 'Sample Konfabulator Widget', 'name'...{'src': 'Images/Sun.png', 'name': 'sun1', 'hOf...{'data': 'Click Here', 'size': 36, 'style': 'b...
json_read = pd.read_json("data/UI.json", orient="values")
json_read
widget
debugon
image{'src': 'Images/Sun.png', 'name': 'sun1', 'hOf...
text{'data': 'Click Here', 'size': 36, 'style': 'b...
window{'title': 'Sample Konfabulator Widget', 'name'...

保存

# 不做详细介绍
# xxx.to_json()
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值