【Python】从新浪抓取美国天然气NG历史交易数据并进行可视化

import urllib.request
import re
import datetime
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
import time
import matplotlib.ticker as ticker
import re
import json

from urllib import request
from urllib import parse

# Module-level state used to hold the scraped data.
HISTORY_ALL = []  # every history row kept by the scraping loop
div_list = []  # scratch list: cell texts of the table row currently being parsed
# Column names, in table order, used as dict keys for each parsed row
# (date, close, open, high, low, volume).
infos = ['日期','收盘价', '开盘价','最高价', '最低价','成交量']
PER_PAGE_DAYS = 25    # number of days requested per page

cyc_val = 17.502  # cost price, drawn as a flat reference line (presumably in RMB - confirm)

def getHtml(url, max_attempts=None):
    """
    Download a page and return its body decoded as GBK text.

    Transport-level failures (timeouts, connection errors, HTTP protocol
    errors) are retried, printing a notice for every failed attempt. Any
    other exception - for example a malformed URL - propagates immediately
    instead of being retried forever.

    :param url: address to fetch
    :param max_attempts: maximum number of attempts before the last error is
        re-raised; ``None`` (the default) keeps retrying without limit
    :return: the response body decoded with the ``gbk`` codec
    :raises OSError: when ``max_attempts`` is reached (``URLError`` and
        socket timeouts are ``OSError`` subclasses)
    """
    # Local import: keeps this function self-contained without touching the
    # file-level import list.
    import http.client

    failures = 0
    while True:
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(url, timeout=5) as response:
                raw = response.read()
            break
        except (OSError, http.client.HTTPException):
            # A bare ``except`` here would also trap KeyboardInterrupt and
            # programming errors, making the loop impossible to leave.
            failures += 1
            print("超时重试")
            if max_attempts is not None and failures >= max_attempts:
                raise
    return raw.decode('gbk')
def time_long(time1, time2, type="day"):
    """
    Return the absolute number of calendar days between two dates.

    The order of the two arguments does not matter. The difference is
    computed on calendar dates, so it does not depend on the local time zone
    or on daylight-saving transitions.

    :param time1: a date, given as a ``YYYY-MM-DD`` string or as any object
        whose ``str()`` has that form (e.g. ``datetime.date``)
    :param time2: the other date, in the same form
    :param type: unit of the result; only ``'day'`` is supported
    :return: non-negative whole number of days between the two dates
    :raises ValueError: if a date does not match ``%Y-%m-%d`` or ``type`` is
        not ``'day'``
    """
    # ``type`` shadows the builtin, but the name is part of the public
    # signature and is kept for keyword callers.
    if type != 'day':
        raise ValueError("unsupported time unit: %r" % (type,))

    day1 = datetime.datetime.strptime(str(time1), '%Y-%m-%d').date()
    day2 = datetime.datetime.strptime(str(time2), '%Y-%m-%d').date()
    return abs((day2 - day1).days)
def get_end_point(start_time,days):
    """
    Return the date that lies ``days`` days after ``start_time``.

    :param start_time: start date as a ``year-month-day`` string
    :param days: number of days to add
    :return: the resulting ``datetime.date``
    """
    year_text, month_text, day_text = start_time.split("-")
    origin = datetime.date(int(year_text), int(month_text), int(day_text))
    return origin + datetime.timedelta(days)
def get_current_USDCNY(timeout=10):
    """
    Fetch the latest USD/CNY quote from the Bank of China rate search page.

    The search form is posted with currency code ``1316`` and the first data
    row of the resulting table is read. All seven cells of that row are
    printed.

    :param timeout: seconds to wait for the server before giving up
    :return: tuple ``(rate, published)`` where ``rate`` is the fourth cell of
        the row divided by 100, as a float, and ``published`` is the text of
        the seventh cell
    :raises RuntimeError: if the page does not contain the expected table
        layout
    """
    # NOTE(review): the fourth cell is presumably a price quoted per 100
    # units and the seventh the publication time - confirm against the page.
    url = "http://srh.bankofchina.com/search/whpj/search.jsp"
    form_data = {'erectDate': '', 'nothing': '', 'pjname': '1316'}
    data = parse.urlencode(form_data).encode('utf-8')

    # A timeout prevents an unresponsive server from hanging the script, and
    # the context manager releases the connection once the body is read.
    with request.urlopen(url, data, timeout=timeout) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Locate the quote table and fail with a clear message if the page
    # layout is not the expected one.
    div = soup.find('div', attrs={'class': 'BOC_main publish'})
    table = div.find('table') if div is not None else None
    rows = table.find_all('tr') if table is not None else []
    if len(rows) < 2:
        raise RuntimeError("exchange-rate table not found in the response")

    # rows[0] is skipped; rows[1] is the first row that is read for data.
    cells = [td.get_text() for td in rows[1].find_all('td')]
    if len(cells) < 7:
        raise RuntimeError("exchange-rate row has fewer than 7 cells")

    print(*cells[:7])
    return float(cells[3]) / 100, cells[6]

# Fetch the exchange rate once; it is used to convert every scraped price.
USDCNY, current_time = get_current_USDCNY()
print("实时汇率   "+current_time+"       USDCNY:"+str(USDCNY))

# Read the start and end dates (YYYY-MM-DD), asking again until both parse
# as dates and the start lies strictly before the end.
while True:
    print("----------------Please input the start time and the end time----------------")
    start_time = input()
    end_time = input()
    print("----------------The start time ----------------"+start_time)
    print("----------------The end time   ----------------"+end_time)

    try:
        start_date = datetime.datetime.strptime(start_time.strip(), '%Y-%m-%d').date()
        end_date = datetime.datetime.strptime(end_time.strip(), '%Y-%m-%d').date()
    except ValueError:
        # Malformed input would otherwise crash later, when the range length
        # is computed.
        print("----------------Input date invalided.Please try again ----------------")
        continue

    # Compare real dates: comparing the raw strings mis-orders values that
    # are not zero-padded (e.g. "2018-8-9" vs "2018-10-06").
    if start_date >= end_date:
        print("----------------Input date invalided.Please try again ----------------")
        continue

    # Normalise to zero-padded ISO text, the form used for the request URLs
    # and the output file name.
    start_time = start_date.isoformat()
    end_time = end_date.isoformat()
    break

#DEBUG
# start_time = "2018-08-09"
# end_time = "2018-10-06"

# Length of the requested range in days.
daysLen = time_long(end_time, start_time)
print(daysLen)

# Spacing between visible x-axis labels when plotting: every label for short
# ranges, otherwise roughly 20 labels over the whole range.
ylabel_dis = max(1, daysLen // 20)

# Split the range into windows of PER_PAGE_DAYS days:
#   per_loop_len - length in days of the window requested next
#   loop         - number of full-size windows
#   left         - days remaining after the full-size windows
if daysLen <= PER_PAGE_DAYS:
    per_loop_len = daysLen
    loop = 1
    left = 0
else:
    per_loop_len = PER_PAGE_DAYS
    loop, left = divmod(daysLen, PER_PAGE_DAYS)

print("爬取循环次数"+str(loop))
# This line used to repeat the loop-count label although it prints the
# leftover days.
print("剩余天数"+str(left))

# State of the scraping loop.
ended_point = start_time  # end of the previous window; the next one starts here
cnt = 0                   # number of windows fetched so far
end_flag = 0              # set to 1 once the final, shorter window is scheduled
total_ok = 0              # running count of rows kept

# Series collected for plotting.
xlabel = []          # 1-based running index of each kept row
date = []            # date text of each kept row
maxVal = []          # daily high converted with USDCNY
minVal = []          # daily low converted with USDCNY
PERP_PAGE_INFO = []  # rows parsed from the page currently being processed

# Fetch the history table one date window at a time, appending the parsed rows
# to the plotting series (xlabel/date/maxVal/minVal) and to HISTORY_ALL.
while 1:
    # ---- Date handling: advance the window by per_loop_len days.
    print("=============================PAGE"+str(cnt)+"=============================")
    # Each window starts on the date where the previous one ended, so the two
    # requests share that boundary date.
    # NOTE(review): if the server treats `end` as inclusive this yields a
    # duplicated row per page boundary - confirm.
    start_point = ended_point
    ended_point = get_end_point(str(start_point), per_loop_len);
    # print(str(cnt) + ":" + str(left) + ":" + str(per_loop_len))
    print("start_point:" + str(start_point))
    print("ended_point:" + str(ended_point))

    # ---- Build the request URL for this window and download the page.
    Url = 'https://vip.stock.finance.sina.com.cn/q/view/vFutures_History.php?jys=NYME&pz=NG&hy=&breed=NG&type=global&start=' + str(start_point) + '&end=' + str(ended_point)
    html = getHtml(Url)
    soup = BeautifulSoup(html, 'lxml')
    # ---- Parse the table inside <div class="historyList">.
    table = soup.find('div', attrs={'class': 'historyList'})
    trs = table.find_all('tr')
    for tr in trs:
        tds = tr.find_all('td')
        # div_list is reused as a scratch buffer for the current row.
        div_list.clear()
        for td in tds:
            divs = td.find_all('div', attrs={'align': 'center'})
            for div in divs:
                val = div.get_text()
                div_list.append(val)
        # Pair each cell text with the column name at the same position in
        # `infos`; a row with more cells than `infos` has names would raise
        # IndexError here.
        data = {}
        for i in range(0, len(div_list)):
            data.update({infos[i]: div_list[i]})
            # print(div_list[i])
        # One dict is appended per <tr>, including rows that produced no
        # cell text (those become empty dicts).
        PERP_PAGE_INFO.append(data)

    # Walk the parsed rows from index len-2 down to index 2, i.e. skipping
    # rows 0 and 1 and the last row, in reverse order.
    # NOTE(review): presumably the skipped rows are header/footer rows and the
    # page lists newest dates first - confirm against the page markup.
    for i in range(1, len(PERP_PAGE_INFO)-2):
        total_ok = total_ok+1
        xlabel.append(total_ok)
        date.append(PERP_PAGE_INFO[len(PERP_PAGE_INFO)-1-i].get("日期"))
        # Prices are converted with the USDCNY rate fetched at start-up.
        USD_maxVal = float(PERP_PAGE_INFO[len(PERP_PAGE_INFO)-1-i].get("最高价"))
        maxVal.append(USD_maxVal*USDCNY)
        USD_minVal = float(PERP_PAGE_INFO[len(PERP_PAGE_INFO)-1-i].get("最低价"))
        if USD_minVal == 0.020:# correction aimed only at the anomalous minimum on 2016-02-24
            USD_minVal = 1.7+0.020
        minVal.append(USD_minVal*USDCNY)
        HISTORY_ALL.append(PERP_PAGE_INFO[len(PERP_PAGE_INFO)-1-i])
    # xlabel is never cleared, so despite the message this prints the running
    # total of rows kept so far, not the size of this page.
    print("the length of per page : --->"+str(len(xlabel)))
    # print(xlabel)
    # print(date)
    # print(maxVal)
    # print(minVal)

    # ---- Loop control.
    # end_flag == 1 means the page just processed was the final, shorter one.
    if end_flag == 1:
        print("-----------------END-------------------")
        break
    cnt = cnt+1
    # After the last full-size window: stop if nothing is left, otherwise
    # shrink the window to the leftover days and schedule one final pass.
    if cnt == loop:
        if left == 0:
            print("-----------------END-------------------")
            break
        else :
            per_loop_len = left
            left = 0
            end_flag = 1
    # Drop this page's rows before parsing the next page.
    PERP_PAGE_INFO.clear()
print(HISTORY_ALL)

# Cost price as a flat series: one value per collected data point, so it
# plots as a horizontal line.
length = len(xlabel)
CYC = [cyc_val] * length

# --------------------------------------- Plotting ---------------------------------------
fig, ax = plt.subplots()
# NOTE(review): this is set after the figure already exists; rcParams are
# normally read when a figure is created, so it may not affect `fig` - confirm.
plt.rcParams['figure.dpi'] = 300  # resolution

# Daily high, daily low and the cost line, in that drawing order.
for series, colour, name, width in (
        (maxVal, 'b', 'maxVal', 0.4),
        (minVal, 'r', 'minVal', 0.4),
        (CYC, 'c', 'CYC', 0.3),
):
    ax.plot(date, series, colour, label=name, linewidth=width)

# Avoid an overcrowded x axis: show only every ylabel_dis-th tick label.
for position, tick_label in enumerate(ax.get_xticklabels()):
    tick_label.set_visible(position % ylabel_dis == 0)

# Size, rotation and font size of the x-axis tick labels.
plt.xticks(size='small', rotation=90, fontsize=5)

plt.title('NG Historical Data')  # chart title
plt.ylabel('Price/RMB')          # y-axis title
plt.xlabel('Date')               # x-axis title
plt.legend()                     # legend

# Save the picture at 300 dpi, then display it.
plt.savefig('NG-'+start_time+"-"+end_time+'.png', dpi=300)
plt.show()

使用方式:运行程序。运行环境:PyCharm、Python 3.6.5、Anaconda3

输入起始时间:如 2016-08-01

输入结束时间:如2019-09-06

格式要正确,等待爬取数据。

运行过程如下图

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值