先放完整代码。数据来源:https://cn.investing.com/indices/shanghai-composite-historical-data 。爬取时请遵守网站的爬虫协议(robots.txt)。
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from bs4 import BeautifulSoup as bs
from urllib import request
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import re
# Make get() return immediately instead of blocking until the page has fully
# loaded; some sites load slowly, and without this every later step would
# have to wait for the complete page load.
# NOTE(review): the strategy is set on the shared DesiredCapabilities.CHROME
# dict and is not passed to webdriver.Chrome explicitly -- presumably it is
# picked up as the default capabilities; verify with the installed Selenium.
desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities["pageLoadStrategy"] = "none"
wd = webdriver.Chrome(r'D:\chrome\chromedriver_86_0Ver\chromedriver.exe')  # create the webdriver object
wd.get('https://cn.investing.com/indices/shanghai-composite-historical-data')
print('open website')
# Fixed pause so the page has time to render before elements are looked up
time.sleep(10)

# Open the date-range picker
press = wd.find_element_by_id('widgetFieldDateRange')
press.click()
print('open calender')
time.sleep(1)

# Start date: locate the input, clear the old value, type the new one
element1 = wd.find_element_by_id('startDate')
print('find startDate')
time.sleep(1)
element1.clear()
print("clear old date")
time.sleep(1)
element1.send_keys('2020/07/01')
print("input start date")
time.sleep(1)

# End date: same three steps as for the start date
element2 = wd.find_element_by_id('endDate')
print('find endDate')
time.sleep(1)
element2.clear()
print("clear old date")
time.sleep(1)
element2.send_keys('2020/11/13')
# Fixed: this message used to read "input start date" (copy-paste slip)
print("input end date")
time.sleep(1)

# Click the apply button to run the query
press1 = wd.find_element_by_id('applyBtn')
print('find enter button')
time.sleep(1)
press1.click()
time.sleep(1)
print('click enter button. The data you want has been shown!')
## The commented-out block below is for debugging only: going through
## Selenium on every debug run is very slow, so the page can be fetched
## with urllib instead.
# headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
# url = "https://cn.investing.com/indices/shanghai-composite-historical-data"
# req = request.Request(url=url, headers=headers)
# res = request.urlopen(req)
# print(res.status)
# html_data = res.read().decode("utf-8")
html_data = wd.page_source
# The rendered HTML is now held in html_data and the browser is not used
# again, so shut it down instead of leaving the browser session open.
wd.quit()
soup = bs(html_data, "html.parser")
currTable = soup.find_all("table", id="curr_table", class_="genTbl closedTbl historicalTbl")
# Stringify the find_all() result so the regexes below can scan the raw markup
currTable_str = str(currTable)
# print(currTable_str)
# Each pattern keeps a little surrounding markup in the match; that markup
# is sliced off later when the values are converted.

# Dates: cell text that starts with four digits, up to the closing </td>
patternDate = re.compile(r'">\d{4}.+</td>')
dateList = patternDate.findall(currTable_str)

# Closing price: the data-real-value attribute that follows a "...Font" class
patternClosingValue = re.compile(r'Font" data-real-value="\d,\d{3}\.\d{2}')
closingPriceList = patternClosingValue.findall(currTable_str)

# Opening price, high and low: plain <td> cells carrying data-real-value
patternOtherValue = re.compile(r'<td data-real-value="\d,\d{3}\.\d{2}')
otherPriceList = patternOtherValue.findall(currTable_str)

# Percentage change: text of a "...Font" cell, with an optional leading character
patternPercentage = re.compile(r'Font">.?\d\.\d{2}')
percentageList = patternPercentage.findall(currTable_str)
# Number of leading markup characters that each regex match carries in front
# of the value itself (derived from the literal prefix of each pattern).
_CLOSING_PREFIX_LEN = len('Font" data-real-value="')
_OTHER_PREFIX_LEN = len('<td data-real-value="')
_PERCENT_PREFIX_LEN = len('Font">')


def _price_to_float(text):
    """Convert a price string with thousands separators, e.g. '3,310.10', to float."""
    return float(text.replace(',', ''))


result = []
colorList = []
Date = []

# Every date needs one closing price, three other prices (open/high/low) and
# one percentage; fail with a clear message instead of a bare IndexError.
rowCount = len(dateList)
if (len(closingPriceList) < rowCount
        or len(otherPriceList) < 3 * rowCount
        or len(percentageList) < rowCount):
    raise ValueError(
        'parsed table is inconsistent: %d dates, %d closing prices, '
        '%d other prices, %d percentages'
        % (rowCount, len(closingPriceList), len(otherPriceList), len(percentageList))
    )

for i, dateCell in enumerate(dateList):
    # Strip the leading '">' and the trailing '</td>' from the match
    date_i = dateCell[2:-5]
    # Closing price
    closingPrice_i = _price_to_float(closingPriceList[i][_CLOSING_PREFIX_LEN:])
    # Opening price, high and low are three consecutive matches per row
    openingPrice_i, highPrice_i, lowPrice_i = (
        _price_to_float(cell[_OTHER_PREFIX_LEN:])
        for cell in otherPriceList[3 * i:3 * i + 3]
    )
    percentage_i = float(percentageList[i][_PERCENT_PREFIX_LEN:])
    # p1 / p2 are the larger / smaller of closing and opening price
    p1 = max(closingPrice_i, openingPrice_i)
    p2 = min(closingPrice_i, openingPrice_i)
    # Red for a positive change, green otherwise
    colorList.append('r' if percentage_i > 0 else 'g')
    # Row layout: [larger of open/close, smaller of open/close,
    #              closing, opening, high, low, percentage change]
    result.append([p1, p2, closingPrice_i, openingPrice_i,
                   highPrice_i, lowPrice_i, percentage_i])
    Date.append(date_i)
# The scraped rows are ordered newest -> oldest; reverse them so that the
# chart shows older dates on the left and more recent dates on the right.
Date_r = Date[::-1]
colorList_r = colorList[::-1]
print(result)
# An empty list would become a 1-D array and the 2-D slice below would fail
# with an obscure "too many indices" error, so stop with a clear message.
if not result:
    raise ValueError('no rows were parsed from the price table; nothing to plot')
result = np.array(result)
result_r = result[::-1, :]
# One x position per row
x = np.arange(result.shape[0])
# Create the figure to draw on
fig, ax = plt.subplots(1, 1, dpi=160)
# subplot() takes three digits: number of rows, number of columns, and the
# index of the subplot being drawn -- here subplot 1 of a 1 x 1 grid.
plt.subplot(111)

# Column 0 holds the larger of open/close, column 1 the smaller one.
# A list passed as color gives every bar its own colour.
plt.bar(x, result_r[:, 0], width=0.5, color=colorList_r)
# The white bar drawn on top hides everything below the smaller value,
# which makes the coloured part look like a floating candle body.
plt.bar(x, result_r[:, 1], width=0.5, color='w')
# plt.scatter(x, result_r[:, 4], s=1, color='r')
# plt.scatter(x, result_r[:, 5], s=1, color='g')

# Closing price as a line, with each value written just above its point
closingPrices = result_r[:, 2]
plt.plot(x, closingPrices)
for xPos, price in zip(x, closingPrices):
    plt.text(
        xPos,
        price + 1,
        str(price),
        ha='center',
        va='bottom',
        fontsize=4.5,
        rotation=60,
    )
plt.ylim([3000, 3500])

# Label the x axis with the dates, oldest first
plt.xticks(range(len(Date)), Date_r, rotation=60, fontsize=8)
plt.show()
print('done')
主要内容有三点。第一点是使用 selenium 库进行点击、输入操作:
# Locate the date-range widget by its id and click it
press = wd.find_element_by_id('widgetFieldDateRange')
press.click()
print('open calender')
# Pause before the next step
time.sleep(1)
find_element_by_id 用于找到元素,.click() 方法进行点击操作;sleep 是为了等待网页加载,如果页面还没有加载出来,就无法进行后续操作。
# Locate the start-date input by its id
element1 = wd.find_element_by_id('startDate')
print('find startDate')
time.sleep(1)
# Remove the value currently in the field
element1.clear()
print("clear old date")
time.sleep(1)
# Type the new start date
element1.send_keys('2020/07/01')
print("input start date")
time.sleep(1)
找到文本输入框,清除里面的原始内容,输入新内容
第二点是BeautifulSoup库+正则表达式找到所需内容
# Take the page source from the browser and parse it with BeautifulSoup
html_data = wd.page_source
soup = bs(html_data, "html.parser")
解析网站
# Select the table by tag name, id and class
currTable = soup.find_all("table", id="curr_table", class_="genTbl closedTbl historicalTbl")
# Convert the result to a string so it can be searched with regexes
currTable_str = str(currTable)
找到所需的标签
# Match the dates
patternDate = re.compile(r'">\d{4}.+</td>')
dateList = re.findall(patternDate, currTable_str)
# Closing price
patternClosingValue = re.compile(r'Font" data-real-value="\d,\d{3}\.\d{2}')
closingPriceList = re.findall(patternClosingValue, currTable_str)
# Opening price, high and low
patternOtherValue = re.compile(r'<td data-real-value="\d,\d{3}\.\d{2}')
otherPriceList = re.findall(patternOtherValue, currTable_str)
# Percentage change
patternPercentage = re.compile(r'Font">.?\d\.\d{2}')
percentageList = re.findall(patternPercentage, currTable_str)
用正则表达式进行匹配搜索,匹配规则可参考这篇博文:https://www.cnblogs.com/boyfyw/p/11911124.html
第三点是matplotlib画图,画出开盘价和收盘价这种“悬空”柱状图
plt.subplot(111) # the three digits are: number of rows, number of columns, index of the subplot being drawn (here 1 x 1, subplot 1)
plt.bar(x, result_r[:, 0], width=0.5, color=colorList_r)# passing a list as color lets one chart show bars in different colours
plt.bar(x, result_r[:, 1], width=0.5, color='w')
方法为:画两个柱状图,把后画的(叠在前面的)那个设置为白色,遮住先画柱形的下半部分,K线图里开盘价和收盘价之间这种“悬空”柱状图的效果就出来了。位于后面的柱形图的颜色通过一个名为 colorList_r 的颜色列表进行指定。