候选网站的选择:股票信息静态存在于HTML页面中,非js代码生成,没有Robots协议限制。
程序结构:
- 从东方财富网获取股票列表
- 根据股票列表逐个到百度股票获取个股信息
- 将结果存储到文件
import requests
from bs4 import BeautifulSoup
import re
import traceback
def getHTMLText(url):
    """Download a page and return its decoded text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text, or an empty string when the request
        fails (connection error, timeout, or a non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # The encoding is guessed from the response body itself.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print('连接网页失败')
        # Return '' instead of falling through to None so that callers can
        # test the result for emptiness before parsing it.
        return ''
#在东方财富网上获取所有股票的列表
def getStockList(lst, stockURL):
    """Collect stock codes from the links of a listing page.

    Args:
        lst: List that receives the codes; it is extended in place.
        stockURL: URL of the page whose ``<a>`` tags carry the codes.

    Returns:
        None. Results are delivered through ``lst``.
    """
    html = getHTMLText(stockURL)
    if not html:
        # The download failed, so there is nothing to parse.
        return

    soup = BeautifulSoup(html, 'html.parser')
    # "sh" or "sz" followed by six digits (Shanghai / Shenzhen codes).
    code_pattern = re.compile(r'[s][hz]\d{6}')
    for link in soup.find_all('a'):
        href = link.attrs.get('href')
        if not href:
            # Anchor without a target: skip it instead of relying on an
            # exception to do so.
            continue
        match = code_pattern.search(href)
        if match:
            lst.append(match.group(0))
#在百度股票上对单个股票进行数据查询
def getStockInfo(lst, stockURL, fpath):
    """Fetch the detail page of every stock and append one line per stock.

    Args:
        lst: Iterable of stock codes.
        stockURL: Base URL; the page address is ``stockURL + code + '.html'``.
        fpath: Output file, opened in append mode with UTF-8 encoding.

    Returns:
        None. For each stock either the ``str()`` of its info dict or an
        "unsupported" marker line is appended to ``fpath``.
    """
    for stock in lst:
        url = stockURL + str(stock) + '.html'
        html = getHTMLText(url)
        if not html:
            # Covers both '' and None, i.e. any failed download.
            continue
        try:
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            if stockInfo is None:
                # The page has no 'stock-bets' block for this code.
                line = str(stock) + ':该股票暂不支持查询'
            else:
                name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
                infoDict = {'股票名称': name.text.split()[0]}
                # Pair each <dt> label with its <dd> value; zip stops at the
                # shorter list, so a mismatch cannot raise IndexError.
                pairs = zip(stockInfo.find_all('dt'), stockInfo.find_all('dd'))
                for key_tag, value_tag in pairs:
                    infoDict[key_tag.text] = value_tag.text
                line = str(infoDict)
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(line + '\n')
        except Exception:
            # Best effort per stock: report the problem and move on.
            traceback.print_exc()
if __name__ == '__main__':
    # Page listing all stock codes, and base URL of the per-stock pages.
    stock_list_url = 'http://quote.eastmoney.com/stock_list.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    # Raw string: the Windows path is the same as before, but no longer
    # relies on invalid escape sequences (\T, \P, \d, \s) being passed
    # through, nor on a hand-escaped "\\t".
    output_file = r'P:\Tools\Pycharm\PycharmProjects\test\data\stock.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)
一开始对所有在东方财富网上查到的股票逐一进行查询,但发现有些股票在百度股票上并不存在,会返回“404”错误,所以中间做了如下处理:
# Excerpt: when the 'stock-bets' block was not found, record the stock code
# as unsupported. `is None` is the identity test for a missing result.
if stockInfo is None:
    with open(fpath, 'a', encoding='utf-8') as f:
        f.write(str(stock) + ':该股票暂不支持查询' + '\n')
优化:
- 提高速度:编码识别的优化
由于 r.apparent_encoding 需要分析响应内容才能推断出文本所用的编码方式,这一过程比较耗时;所以我们可以事先确定好网页的编码,直接把它赋值给 r.encoding。
为此给 getHTMLText 增加一个 code 参数。
def getHTMLText(url, code='utf-8'):
    """Download a page and decode it with a caller-supplied encoding.

    Args:
        url: Address of the page to fetch.
        code: Encoding used to decode the response body.

    Returns:
        The response body as text, or an empty string when the request
        fails (connection error, timeout, or a non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Use the known encoding directly instead of detecting it.
        r.encoding = code
        return r.text
    except requests.RequestException:
        print('连接网页失败')
        # Return '' instead of falling through to None so that callers can
        # test the result for emptiness before parsing it.
        return ''
def getStockList(lst, stockURL):
    # Excerpt: only the download line is shown here; the listing page is
    # decoded as GB2312 instead of having its encoding detected.
    html = getHTMLText(stockURL, code='GB2312')
- 增加用户体验:增加动态进度显示(不换行输出)
# Excerpt: append the record, then refresh the single-line progress display.
with open(fpath, 'a', encoding='utf-8') as f:
    f.write(str(infoDict) + '\n')
count += 1
percent = count * 100 / len(lst)
# '\r' moves the cursor back to the start of the line and end='' suppresses
# the newline, so each update overwrites the previous one.
print('\r当前进度:{:.2f}%'.format(percent), end='')
Python 不换行输出示例:
# Demo: overwrite one console line with a running percentage.
total = 1000000
for done in range(total):
    percent = done * 100 / total
    print("\r已完成 {:.2f}%".format(percent), end="")