【金融】【python】处理表格(*.xlsx)形式的期货数据
匆忙整理,直接从jupyter notebook上粘贴下来的。
数据读取
打开文件
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline
# %config InlineBackend.figure_format = 'svg'
data = pd.read_excel(r'F:\\data\\market_index_data\\恒生指数日线数据.xlsx', sheet_name=0)
# data = pd.read_excel(r'D:\\MINE_FILE\\dataSet\\market_index_data\\恒生指数日线数据.xlsx', sheet_name=0)
data.head()
截取特定列
data = pd.read_excel(r'F:\\data\\market_index_data\\恒生指数日线数据.xlsx', sheet_name=0, usecols=[0,1,2,3,4,10])
# data = pd.read_excel(r'D:\\MINE_FILE\\dataSet\\market_index_data\\恒生指数日线数据.xlsx', sheet_name=0, usecols=[0,1,2,3,4,10])
data.rename(columns={"交易日期": "date", "开盘点位": "Open", "最高点位": "High", "最低点位": "Low", "收盘价": "Close", "成交额(万)": "Volume"},inplace=True)
# 手动取有成交额的部分
data = data.loc[0:3678]
# 确认一下
data.tail()
处理脏数据/空数据
清理逗号和空数据"–"
data = data[~data['Volume'].isin(['--'])]
data['Open'] = data['Open'].str.replace(',', "")
data['High'] = data['High'].str.replace(',', "")
data['Low'] = data['Low'].str.replace(',', "")
data['Close'] = data['Close'].str.replace(',', "")
data['Volume'] = data['Volume'].str.replace(',', "")
转换数据格式
由于带逗号,读出来是字符串,需要进行格式转换。
data[u"Open"] = pd.to_numeric(data[u"Open"])
data[u"High"] = pd.to_numeric(data[u"High"])
data[u"Low"] = pd.to_numeric(data[u"Low"])
data[u"Close"] = pd.to_numeric(data[u"Close"])
data[u"Volume"] = pd.to_numeric(data[u"Volume"])
# 查看一下
data.head()
处理数据索引和顺序
原本是倒的。
data['newindex'] = np.arange(len(data)-1,-1,-1)
data.sort_values('newindex',inplace=True)
data.drop('newindex',axis=1,inplace=True)
data = data.reset_index(drop=True)
data.head()
显示收盘价折线图
plt.figure(figsize=(20,7))
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')
data['Close'].plot()
# plt.plot(data.loc[:,'Close'].values, 'b', label='Close Price')
plt.xlabel("time")
plt.ylabel("point")
plt.legend()
plt.show()
对读取后的数据进行处理
处理金融指标
featureExtraction
中的函数未放入本文,
from featureExtraction import *
priceSeq = data.iloc[:,1:5].values
data.insert(data.shape[1], 'RSI', RSI(priceSeq))
print('finish RSI')
data.insert(data.shape[1], 'SO', SO(priceSeq))
print('finish SO')
data.insert(data.shape[1], 'WR', WR(priceSeq))
print('finish WR')
data.insert(data.shape[1], 'MACD', MACD(priceSeq))
print('finish MACD')
data.insert(data.shape[1], 'PROC', PROC(priceSeq))
print('finish PROC')
data.insert(data.shape[1], 'Simple10MA', Simple10MA(priceSeq))
print('finish Simple10MA')
data.insert(data.shape[1], 'Weighted10MA', Weighted10MA(priceSeq))
print('finish Weighted10MA')
data.insert(data.shape[1], 'Momentum', Momentum(priceSeq))
print('finish Momentum')
data.insert(data.shape[1], 'ADOscillator', ADOscillator(priceSeq))
print('finish ADOscillator')
data.insert(data.shape[1], 'CCI', CCI(priceSeq))
print('finish CCI')
将表示时间的字符串拆分为数值
def DateSplit(df, col):
"""
split the object of '2010-01-02' into year(2010), month(1) and day(2).
:param df: to operate data (type:DataFrame)
:param col: column label of date object (type:str)
:return: converted date (type: DataFrame)
"""
year, month, day = [], [], []
data = df.loc[:, col].values
df = df.drop([col], axis=1)
for i in range(data.shape[0]):
year.append(int(data[i][:4]))
month.append(int(data[i][5:7]))
day.append(int(data[i][8:]))
date = pd.DataFrame({'year': year, 'month': month, 'day': day})
result = pd.concat([date, df], axis=1)
return result
data= DateSplit(df=data,col='date')
data.head(10)
剔除前20天数据
剔除各个金融指标计算的周期大小的数据。
# 剔除前20天数据
data = data.loc[20:]
data = data.reset_index(drop=True)
data.info()
可视化
plt.subplot(3,2,1)
plt.plot(data.loc[:,'Close'].values, 'r', label='Close')
plt.legend(loc='best')
plt.subplot(3,2,2)
plt.plot(data.loc[:,'RSI'].values, 'b', label='RSI')
plt.subplot(3,2,3)
plt.plot(data.loc[:,'SO'].values, 'g', label='SO')
plt.subplot(3,2,4)
plt.plot(data.loc[:,'WR'].values, 'r', label='WR')
plt.subplot(3,2,5)
plt.plot(data.loc[:,'MACD'].values, 'b', label='MACD')
plt.subplot(3,2,6)
plt.plot(data.loc[:,'PROC'].values, 'g', label='PROC')
plt.show()
plt.subplot(3,2,1)
plt.plot(data.loc[:,'Volume'].values, 'r', label='Close')
plt.legend(loc='best')
plt.subplot(3,2,2)
plt.plot(data.loc[:,'Simple10MA'].values, 'b', label='Simple10MA')
plt.subplot(3,2,3)
plt.plot(data.loc[:,'Weighted10MA'].values, 'g', label='Weighted10MA')
plt.subplot(3,2,4)
plt.plot(data.loc[:,'Momentum'].values, 'r', label='Momentum')
plt.subplot(3,2,5)
plt.plot(data.loc[:,'ADOscillator'].values, 'b', label='ADOscillator')
plt.subplot(3,2,6)
plt.plot(data.loc[:,'CCI'].values, 'g', label='CCI')
plt.show()