全代码
仍然是对静态网页的数据爬取
#CrowBaiduStocks.py
import requests
from bs4 import BeautifulSoup
import traceback
import re
def getHTMLText(url):
    """Fetch *url* and return its decoded body text, or "" on any request failure.

    Parameters:
        url: the page URL to download.
    Returns:
        The response text on success, "" if the request fails or the
        server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # The header charset is often wrong for these pages; guess the real
        # encoding from the body bytes instead.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP errors are expected here; a bare except would
        # also swallow KeyboardInterrupt and real bugs.
        return ""
def getStocksList(lst, stocksurl):
    """Append every stock code (e.g. 'sh600000', 'sz000001') linked from the
    listing page at *stocksurl* to *lst* (mutated in place).

    Parameters:
        lst: output list that receives the matched codes.
        stocksurl: URL of the stock-list page to scan for <a href> links.
    """
    html = getHTMLText(stocksurl)
    soup = BeautifulSoup(html, "html.parser")
    for anchor in soup.find_all('a'):
        # Some anchors have no href at all; .get avoids the KeyError the
        # original bare except was papering over.
        href = anchor.get('href', '')
        matches = re.findall(r'[s][hz]\d{6}', href)
        if matches:
            # Keep only the first code per link, as before.
            lst.append(matches[0])
def StocksParse(lst, stocksurl, fpath):
    """Scrape each stock's Baidu quote page and append one dict-per-line
    record to the file at *fpath*.

    Parameters:
        lst: iterable of stock codes such as 'sh600000'.
        stocksurl: unused; kept so existing callers' signatures still work.
        fpath: path of the output text file (opened in append mode so a
            re-run continues adding records).
    """
    for stock in lst:
        onestockurl = 'https://gupiao.baidu.com/stock/' + stock + '.html'
        onestock = getHTMLText(onestockurl)
        try:
            soup = BeautifulSoup(onestock, "html.parser")
            div = soup.find('div', class_='stock-bets')
            infodic = {}
            # First bets-name anchor holds the stock name; split() drops the
            # trailing code that shares the same element.
            name = div.find_all('a', class_='bets-name')[0]
            infodic.update({'股票名称': name.text.split()[0]})
            # <dt> labels pair positionally with <dd> values.
            keylist = div.find_all('dt')
            valuelist = div.find_all('dd')
            for dt, dd in zip(keylist, valuelist):
                infodic[dt.text] = dd.text
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infodic) + '\n')
        except Exception:
            # A page that failed to download or lacks the expected layout is
            # skipped, but the traceback is printed so failures stay visible.
            # (Exception, not bare except, so Ctrl-C still stops the crawl.)
            traceback.print_exc()
            continue
def main():
    """Crawl the stock listing page, then scrape every stock's detail page
    into a local text file."""
    stocksurl = 'http://quote.eastmoney.com/stocklist.html'
    lst = []
    getStocksList(lst, stocksurl)
    fpath = 'F://baidustocks.txt'
    StocksParse(lst, stocksurl, fpath)


# Guard the entry point so importing this module does not start the crawl.
if __name__ == '__main__':
    main()
加深了对find()和find_all()的认识
但仍不熟悉数据处理这一方面,比如字典类型
运行后,爬了很久也没有结束,可见这种逐页串行请求的爬虫不适合较大数据量的爬取