使用selenium+BeautifulSoup爬取上证指数并画出K线图

先放完整代码,数据来源https://cn.investing.com/indices/shanghai-composite-historical-data。爬取时请遵守爬虫协议

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from bs4 import BeautifulSoup as bs
from urllib import request
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import re


# Make get() return immediately instead of waiting for the page to finish
# loading. Some sites load slowly; without this every later step would have to
# wait for the full page load.
# NOTE(review): this dict is never passed to webdriver.Chrome below, so the
# setting only applies if the driver picks up the mutated
# DesiredCapabilities.CHROME class attribute -- confirm for your selenium version.
desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities["pageLoadStrategy"] = "none"

wd = webdriver.Chrome(r'D:\chrome\chromedriver_86_0Ver\chromedriver.exe')  # create the webdriver object
wd.get('https://cn.investing.com/indices/shanghai-composite-historical-data')
print('open website')
# Fixed pause to give the page time to load (no explicit wait is used).
time.sleep(10)
# Open the date-range picker.
press = wd.find_element_by_id('widgetFieldDateRange')
press.click()
print('open calender')
time.sleep(1)
# Start date: locate the input, clear the old value, type the new one.
element1 = wd.find_element_by_id('startDate')
print('find startDate')
time.sleep(1)
element1.clear()
print("clear old date")
time.sleep(1)
element1.send_keys('2020/07/01')
print("input start date")
time.sleep(1)
# End date: same sequence as for the start date.
element2 = wd.find_element_by_id('endDate')
print('find endDate')
time.sleep(1)
element2.clear()
print("clear old date")
time.sleep(1)
element2.send_keys('2020/11/13')
# This step types the END date; the original message said "start date".
print("input end date")
time.sleep(1)
# Click the apply button to run the query.
press1 = wd.find_element_by_id('applyBtn')
print('find enter button')
time.sleep(1)
press1.click()
time.sleep(1)
print('click enter button. The data you want has been shown!')

## The commented-out block below is for debugging only: going through selenium
## on every debugging run is very slow.
# headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
# url = "https://cn.investing.com/indices/shanghai-composite-historical-data"
# req = request.Request(url=url, headers=headers)
# res = request.urlopen(req)
# print(res.status)
# html_data = res.read().decode("utf-8")

html_data = wd.page_source
# The browser is not used past this point; shut it down so the Chrome and
# chromedriver processes do not outlive the script.
wd.quit()

soup = bs(html_data, "html.parser")
# find_all returns every matching <table>; the result is stringified so the
# regular expressions below can be run over the raw markup.
currTable = soup.find_all("table", id="curr_table", class_="genTbl closedTbl historicalTbl")
currTable_str = str(currTable)
# print(currTable_str)
# Dates: cell text that begins with four digits.
patternDate = re.compile(r'">\d{4}.+</td>')
dateList = re.findall(patternDate, currTable_str)
# Closing price: the data-real-value attribute that follows a '...Font' class.
patternClosingValue = re.compile(r'Font" data-real-value="\d,\d{3}\.\d{2}')
closingPriceList = re.findall(patternClosingValue, currTable_str)
# Opening price, high and low: plain <td> cells carrying data-real-value.
# The loop that consumes this list reads them three per row, in that order.
patternOtherValue = re.compile(r'<td data-real-value="\d,\d{3}\.\d{2}')
otherPriceList = re.findall(patternOtherValue, currTable_str)
# Percentage change: text of a '...Font' cell, with an optional leading character.
patternPercentage = re.compile(r'Font">.?\d\.\d{2}')
percentageList = re.findall(patternPercentage, currTable_str)

# Lengths of the literal text each regex match starts with; slicing it off
# leaves only the numeric part of the match.
_CLOSING_PREFIX_LEN = len('Font" data-real-value="')  # 23
_OTHER_PREFIX_LEN = len('<td data-real-value="')  # 21
_PERCENT_PREFIX_LEN = len('Font">')  # 6


def _to_price(text):
    """Convert a price such as '3,310.10' to a float by dropping the commas."""
    return float(text.replace(',', ''))


result = []
colorList = []
Date = []
for i, date_match in enumerate(dateList):
    # Strip the leading '">' and the trailing '</td>' from the match.
    date_i = date_match[2:-5]
    # Closing price
    closingPrice_i = _to_price(closingPriceList[i][_CLOSING_PREFIX_LEN:])
    # otherPriceList holds three consecutive entries per row:
    # opening price, high, low.
    openingPrice_i = _to_price(otherPriceList[i * 3][_OTHER_PREFIX_LEN:])
    highPrice_i = _to_price(otherPriceList[i * 3 + 1][_OTHER_PREFIX_LEN:])
    lowPrice_i = _to_price(otherPriceList[i * 3 + 2][_OTHER_PREFIX_LEN:])

    percentage_i = float(percentageList[i][_PERCENT_PREFIX_LEN:])
    # p1 is the larger and p2 the smaller of the closing and opening prices.
    p1 = max(closingPrice_i, openingPrice_i)
    p2 = min(closingPrice_i, openingPrice_i)
    # Red when the percentage change is positive, green otherwise.
    colorList.append('r' if percentage_i > 0 else 'g')
    # Row layout:
    # [higher of open/close, lower of open/close,
    #  closing price, opening price, high, low, percentage change]
    lst = [p1, p2, closingPrice_i, openingPrice_i, highPrice_i, lowPrice_i, percentage_i]
    result.append(lst)
    Date.append(date_i)

# The scraped rows are ordered newest-first. Reverse them so the plot runs from
# the oldest date on the left to the newest date on the right.
Date_r = list(reversed(Date))
colorList_r = list(reversed(colorList))
print(result)
# With no rows, np.array(result) is one-dimensional and the 2-D slicing below
# would fail with an opaque IndexError; stop with a clear message instead.
if not result:
    raise SystemExit('no rows were parsed from the historical-data table; nothing to plot')
result = np.array(result)
result_r = result[::-1, :]
x = np.arange(result.shape[0])
# Create the figure and its axes.
fig, ax = plt.subplots(1, 1, dpi=160)
# subplot(111): the three digits mean 1 row, 1 column, subplot index 1.
plt.subplot(111)
# Column 0 is the higher of open/close, drawn with one colour per bar.
plt.bar(x, result_r[:, 0], width=0.5, color=colorList_r)
# Column 1 is the lower of open/close, drawn in white over the first bar so
# only the span between the two prices stays coloured.
plt.bar(x, result_r[:, 1], width=0.5, color='w')
# plt.scatter(x, result_r[:, 4], s=1, color='r')  # high
# plt.scatter(x, result_r[:, 5], s=1, color='g')  # low
# Column 2 is the closing price.
plt.plot(x, result_r[:, 2])
# Write each closing price just above its point on the line.
for x1, y1 in zip(x, result_r[:, 2]):
    plt.text(x1, y1 + 1, str(y1), ha='center', va='bottom', fontsize=4.5, rotation=60)

# Fixed y-range; values outside 3000-3500 are not visible.
plt.ylim([3000, 3500])
# Label the x axis with the dates, oldest first.
plt.xticks(x, Date_r, rotation=60, fontsize=8)

plt.show()

print('done')

主要内容有三点,第一点是selenium库进行点击,输入操作

# Look up the date-range widget by its element id and click it.
press = wd.find_element_by_id('widgetFieldDateRange')
press.click()
print('open calender')
# Fixed one-second pause before the next step.
time.sleep(1)

find_element_by_id找到元素,.click()方法进行点击操作,sleep是为了等待网站加载,如果没有加载出来,无法进行后续操作

# Locate the start-date text box by its element id.
element1 = wd.find_element_by_id('startDate')
print('find startDate')
time.sleep(1)
# Remove the existing value before typing the new one.
element1.clear()
print("clear old date")
time.sleep(1)
# Type the new start date into the box.
element1.send_keys('2020/07/01')
print("input start date")
time.sleep(1)

找到文本输入框,清除里面的原始内容,输入新内容

第二点是BeautifulSoup库+正则表达式找到所需内容

# Take the page source from the webdriver.
html_data = wd.page_source

# Parse it with BeautifulSoup using the "html.parser" backend.
soup = bs(html_data, "html.parser")

解析网站

# Collect every <table> with id "curr_table" and the given class string.
currTable = soup.find_all("table", id="curr_table", class_="genTbl closedTbl historicalTbl")
# Stringify the result so it can be searched with regular expressions.
currTable_str = str(currTable)

找到所需的标签

# Dates: cell text that begins with four digits
patternDate = re.compile(r'">\d{4}.+</td>')
dateList = re.findall(patternDate, currTable_str)
# Closing price
patternClosingValue = re.compile(r'Font" data-real-value="\d,\d{3}\.\d{2}')
closingPriceList = re.findall(patternClosingValue, currTable_str)
# Opening price, high, low
patternOtherValue = re.compile(r'<td data-real-value="\d,\d{3}\.\d{2}')
otherPriceList = re.findall(patternOtherValue, currTable_str)
# Percentage change
patternPercentage = re.compile(r'Font">.?\d\.\d{2}')
percentageList = re.findall(patternPercentage, currTable_str)

正则表达式匹配搜索,匹配规则参考这篇博文https://www.cnblogs.com/boyfyw/p/11911124.html

第三点是matplotlib画图,画出开盘价和收盘价这种“悬空”柱状图

plt.subplot(111) # the three digits of subplot(111) mean 1 row, 1 column, subplot index 1
plt.bar(x, result_r[:, 0], width=0.5, color=colorList_r)# passing a list as color gives each bar its own colour
plt.bar(x, result_r[:, 1], width=0.5, color='w')

方法为:画两个柱状图。先用颜色列表colorList_r画出开盘价与收盘价中较高的那个价格的柱子,再在它上面叠加一个白色的柱子,高度为两者中较低的价格,把下半部分盖住,这样K线图里开盘价和收盘价之间的这种“悬空”柱状图效果就出来了。底层柱形图的颜色通过名为colorList_r的颜色列表指定(涨跌幅为正时为红色,否则为绿色)。

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值