# 一、获得银行官网网址信息 (Part 1: obtain the bank official-website URL information)
from urllib import request
from urllib.request import urlopen
import re
# CBRC (China Banking Regulatory Commission) page listing regulated banking
# institutions — the page this script scrapes and caches locally.
url = 'http://www.cbrc.gov.cn/chinese/jrjg/index.html'
def get_content(url, fileName):
    """Fetch *url* and cache the page HTML to the local file *fileName*.

    The CBRC server may temporarily block our IP after repeated crawling,
    so a single successful fetch is saved to disk and reused afterwards.

    :param url: page URL to download
    :param fileName: path of the local file the HTML is written to
    :return: None; on fetch failure only an error message is printed
    """
    try:
        # A browser-like User-Agent reduces the chance of being rejected.
        headers = {'User-agent': 'Chrome/23.0'}
        req = request.Request(url, headers=headers)
        with urlopen(req) as urlObj:
            content = urlObj.read().decode('utf-8')
    except Exception as Error:
        print('爬取网页信息失败', Error)
    else:
        # Write with explicit UTF-8: the content was decoded as UTF-8 above,
        # and the platform default encoding (e.g. GBK on Windows) would raise
        # UnicodeEncodeError on the Chinese page text.
        with open(fileName, 'w', encoding='utf-8') as f:
            f.write(content)
            print('write success')
def get_file_content(fileName, url):
    """Return the cached page text with all tab characters removed.

    Fetches *url* into *fileName* first (via get_content), then reads the
    cached file back.

    :param fileName: path of the local cache file
    :param url: page URL passed through to get_content
    :return: file contents as str, with every '\t' stripped out
    """
    get_content(url, fileName)
    # Read with explicit UTF-8 to match the encoding of the fetched page;
    # relying on the platform default can mis-decode the Chinese text.
    with open(fileName, encoding='utf-8') as f:
        return f.read().replace('\t', '')  # strip the many tabs in the markup
def get_bank_info(filName,url,New_filename):
# <a href="http://www.jcfc.cn/" target="_blank" style="color:#08619D">
# 晋商消费金融股份有限公司
content = get_file_content(filName,url)
bank_infor = re.findall(r'<a