- 花了一晚上的时间调试、练习，终于把这个程序改好了。
- 通过这个程序更熟悉了 BeautifulSoup 库的用法，也练习了正则表达式的使用。
import requests
from bs4 import BeautifulSoup
import bs4
import traceback
import re
def getHTMLText(url, code="utf-8"):
    """Fetch a page over HTTP and return its body as text.

    Args:
        url: Address of the page to download.
        code: Encoding name assigned to the response before its text is
            read. Defaults to "utf-8".

    Returns:
        The decoded response body, or the sentinel string "网页访问失败"
        when the request fails or the server answers with an error status.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    }
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are converted into the sentinel;
        # KeyboardInterrupt and genuine programming errors now propagate
        # instead of being silently swallowed by a bare ``except``.
        return "网页访问失败"
def getFundList(lst, fundURL):
    """Collect six-digit codes from the ``id`` attributes of table rows.

    The page at ``fundURL`` is downloaded with GB2312 decoding and parsed
    with BeautifulSoup. For every ``<tr>`` element that has an ``id``
    attribute containing a run of six digits, the first such run is
    appended to ``lst``.

    Args:
        lst: List that receives the extracted codes (mutated in place).
        fundURL: URL of the listing page.

    Returns:
        None. Results are delivered through ``lst``.
    """
    html = getHTMLText(fundURL, "GB2312")
    soup = BeautifulSoup(html, 'html.parser')
    for row in soup.find_all('tr'):
        # Rows without an id (or with a non-string id) are skipped
        # explicitly rather than via a bare ``except``.
        row_id = row.attrs.get('id')
        if not isinstance(row_id, str):
            continue
        match = re.search(r"\d{6}", row_id)
        if match is not None:
            lst.append(match.group(0))
def _extract_fund_info(detail):
    """Build the info dict for one fund from its detail ``<div>`` tag.

    Raises:
        IndexError: if the title element or an expected ``<dd>`` is missing.
        ValueError: if the first ``<dt>``/``<dd>`` pair does not match the
            expected text patterns.
    """
    title = detail.find_all(attrs={'class': "fundDetail-tit"})[0]
    info = {'基金名称': title.text}

    labels = detail.find_all('dt')
    values = detail.find_all('dd')
    # Each <dt> label is paired with three consecutive <dd> values.
    for index, label in enumerate(labels):
        base = 3 * index
        first = values[base].text
        second = values[base + 1].text
        third = values[base + 2].text
        key = label.text
        if index == 0:
            # The first label is trimmed to "净值估算" plus the 16 characters
            # that follow it (presumably a timestamp -- confirm against the
            # live page).
            key_match = re.search(r"^净值估算.{16}", key)
            # NOTE(review): the "." between the digits is unescaped and only
            # one leading digit is accepted, so a change of 10% or more will
            # not match -- confirm whether that is intended.
            pct_match = re.search(r"[\+\-]\d.\d\d+%$", first)
            if key_match is None or pct_match is None:
                raise ValueError("unexpected layout in first dt/dd pair")
            key = key_match.group(0)
            first = pct_match.group(0)
        info[key] = (first, second, third)
    return info


def getFundInfo(lst, fundURL, fpath, max_count=100):
    """Download detail pages for fund codes and append their data to a file.

    For every code in ``lst`` the page ``fundURL + code + ".html"`` is
    fetched. Pages that could not be fetched, or that lack the
    ``merchandiseDetail`` div, are skipped. For the remaining pages the
    extracted dict is appended to ``fpath`` as one ``str(dict)`` line; pages
    whose layout cannot be parsed are counted but not written. A progress
    percentage is printed after each counted page.

    Args:
        lst: Iterable of fund code strings.
        fundURL: Base URL that the code and ".html" are appended to.
        fpath: Output file path, opened in append mode with UTF-8 encoding.
        max_count: Maximum number of pages to count before stopping.
            Defaults to 100, the limit that was previously hard-coded.

    Returns:
        None.

    Raises:
        OSError: if the output file cannot be opened or written. This used
            to be swallowed silently, hiding a bad output path.
    """
    if max_count <= 0:
        return

    count = 0
    for fund_code in lst:
        html = getHTMLText(fundURL + fund_code + ".html")
        # getHTMLText reports failure with a sentinel string, not "", so
        # both are treated as "nothing to parse".
        if not html or html == "网页访问失败":
            continue

        detail = BeautifulSoup(html, 'html.parser').find(
            'div', attrs={'class': 'merchandiseDetail'})
        if not isinstance(detail, bs4.element.Tag):
            continue

        try:
            info = _extract_fund_info(detail)
        except (IndexError, ValueError, AttributeError):
            # Unexpected page layout: count the page but write nothing.
            info = None

        if info is not None:
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(info) + '\n')

        count += 1
        print("\r当前进度: {:.2f}%".format(count * 100 / max_count), end="")
        # Checked on every counted page (success or failure) so the loop
        # stops at exactly max_count and progress never exceeds 100%.
        if count >= max_count:
            break
def main():
    """Run the crawl: gather fund codes, then save each fund's details.

    The codes are collected from the listing page into a list, and that
    list is handed to ``getFundInfo`` together with the site base URL and
    the output file path.
    """
    fund_list_url = 'https://fund.eastmoney.com/fund.html#os_0;isall_0;ft_;pt_1'
    fund_info_url = 'https://fund.eastmoney.com/'
    output_file = 'D:/DayDayFund.txt'
    fund_codes = []
    getFundList(fund_codes, fund_list_url)
    getFundInfo(fund_codes, fund_info_url, output_file)


# Only start the crawl when executed as a script, so importing this module
# does not trigger network requests or file writes.
if __name__ == "__main__":
    main()