# 技术路线: requests-bs4-re
# 使用场景: 股票信息存储在静态页面中,非 js 调用输出;本例使用东方财富网、百度股票。
from bs4 import BeautifulSoup
import requests
import re#获取html
def getHtmlText(url, code=None):
    """Fetch *url* and return the decoded HTML text, or "" on any failure.

    Parameters:
        url: page address to GET.
        code: optional known charset (e.g. "utf-8").  When supplied it is
            assigned directly, skipping the expensive full-body scan that
            ``apparent_encoding`` performs — useful when many pages of the
            same known encoding are fetched repeatedly.

    Returns:
        The response body as text, or an empty string when the request
        fails (the original returned an error message string, which the
        callers then fed to BeautifulSoup as if it were HTML).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # apparent_encoding walks the whole payload to guess the charset;
        # only fall back to it when the caller did not pass one.
        r.encoding = code if code else r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP errors are expected.
        return ""
#解析股票list清单 http://quote.eastmoney.com/sh500029.html
def resolveGupiaoList(glist, htext):
    """Append stock codes (e.g. "sh600000", "sz000001") found in *htext* to *glist*.

    Scans every <a href> in the page for links of the form
    http://quote.eastmoney.com/{sh|sz}NNNNNN.html and extracts the
    exchange-prefixed 6-digit code.

    Parameters:
        glist: output list, mutated in place.
        htext: HTML text of the stock-list page.
    """
    # '.' before 'html' is escaped (the original pattern matched any char);
    # capture group 1 yields the code directly, no split chain needed.
    pat = re.compile(r'http://quote\.eastmoney\.com/((?:sh|sz)\d{6})\.html')
    try:
        soup = BeautifulSoup(htext, "html.parser")
        for a in soup.find_all("a"):
            href = a.get('href')
            if not href:
                continue
            m = pat.search(href)
            if m:
                glist.append(m.group(1))
    except Exception as e:
        # Narrowed from a bare except; show what went wrong instead of hiding it.
        print("resolveGupiaoList异常:", e)
#解析个股交易详情,写入文件
def getGupiaoDetail(htext, fileAddress):
    """Parse one stock's detail page and append its fields to *fileAddress*.

    Extracts the stock name (a.bets-name), latest price (strong._close) and
    every <dt>/<dd> pair under div.stock-bets, then appends the resulting
    dict as one line of text to the file.

    Parameters:
        htext: HTML text of a single stock's detail page.
        fileAddress: path of the output file (opened in append mode, utf-8).
    """
    try:
        gpdict = {}
        soup = BeautifulSoup(htext, "html.parser")
        gpname = soup.find('a', attrs={'class': 'bets-name'})
        if gpname:
            gpdict['股票名称'] = gpname.text.split()[0]
        gpprice = soup.find('strong', attrs={'class': '_close'})
        if gpprice:
            gpdict['当前股价'] = gpprice.text.split()[0]
        stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
        # Guard: find() returns None when the section is missing — the
        # original crashed with AttributeError on such pages.
        if stockInfo:
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            # zip stops at the shorter list, avoiding IndexError when the
            # dt/dd counts are mismatched (original indexed valueList by
            # keyList's length).
            for key, value in zip(keyList, valueList):
                gpdict[key.text] = value.text
        # 'with' guarantees the handle is closed; the original leaked it.
        with open(fileAddress, 'a', encoding='utf-8') as f:
            f.write(str(gpdict) + '\n')
    except Exception:
        print("获取个股信息失败")
def main():
    """Crawl the East Money stock list, then fetch and store every stock's
    Baidu detail page, showing a live progress percentage."""
    glist = []
    baiduGupiaoUrl = "https://gupiao.baidu.com/stock/"
    gupiaoListUrl = "http://quote.eastmoney.com/stocklist.html"
    fileAddress = 'D:/gupiao.txt'
    htext = getHtmlText(gupiaoListUrl)
    resolveGupiaoList(glist, htext)
    total = len(glist)
    for count, gpitem in enumerate(glist, start=1):
        baiduhtext = getHtmlText(baiduGupiaoUrl + gpitem + '.html')
        getGupiaoDetail(baiduhtext, fileAddress)
        # Progress display (planned in optimization note 2): '\r' rewrites the
        # same console line.  Printed only inside the loop, so total >= 1 here.
        print("\r当前进度: {:.2f}%".format(count * 100 / total), end="")


if __name__ == "__main__":
    # Guard so importing this module does not kick off the crawl.
    main()
# 优化方向:
# 1、优化 requests 编码解析:
#    r.apparent_encoding 需要全文遍历 url 返回的 html 文本来推断可能使用的编码,执行效率受到影响。
#    改进:当返回的页面内容较大,或者需要反复请求同一类页面时,可以提前确认页面的编码,从而直接赋值 encoding:
#        code = "utf-8"
#        r.encoding = code
# 2、动态显示股票解析进度,提高用户体验:
#        print("\r当前进度: {:.2f}%".format(count * 100 / len(glist)), end="")