# Regular expressions (regex)
from urllib import request
from urllib import error
class CollegateRank(object):
def get_page_data(self, url):
    """Fetch one page, save a copy of it to disk, and hand it to the parser.

    The page text comes from ``self.send_request(url=url)``. If that call
    returns a falsy value, nothing further is done. Otherwise the text is
    written to ``page.html`` (encoded as GBK, path relative to the current
    working directory) and then passed to ``self.parse_page_data``.

    Args:
        url: Address of the page to request; forwarded unchanged to
            ``self.send_request``.
    """
    page_text = self.send_request(url=url)
    if not page_text:
        # The request produced nothing usable, so there is nothing to
        # save or parse.
        return

    # Keep a local copy of the fetched page.
    with open('page.html', mode='w', encoding='gbk') as dump:
        dump.write(page_text)

    self.parse_page_data(page_text)
    # TODO: go on to extract the next page and issue further requests.
def parse_page_data(self,response):
pattern = re.compile('<div\sclass="scores_List">(.*?)<ul\sclass="fany">',re.S)
sub_str = re.findall(pattern,response)[0]
#解析分页列表数据
pattern = re.compile(
'<dl>.*?<a\shref="(.*?)".*?>'+
'.*?<img.*?src="(.*?)".*?>'+
'.*?<a.*?>(.*?)</a>'+
'.*?<li>.*?:(.*?)</li>'+
'.*?<li>(.*?)</li>'+
'.*?<li>.*?:(.*?)</li>'+
'.*?<li>.*?:(.*?)</li>'+
'.*?<li>.*?:(.*?)</li>'+
'.*?<li>.*?:(.*?)</li>.*?</dl>',
re.S
)
ranks = re.findall(pattern,sub_str)
print(ranks)
for item in ranks:
school_info = {}
school_info['url'] = item[0] #详情url地址
school_info['icon'] = item[1]
school_info['name'] = item[2]
school_info['adress'] = item[3]
school_info['tese'] = '、'.join(re.findall('<span.*?>(.*?)</span>',item[4]))
school_info['type'] = item[5]
school_info['belong'] = item[6]
school_info['level'] = item[7]
school_info['weburl'] = item[8]
print(school_info)
xiangqing = self.send_request(url=school_info['url'])
pattern = re.compile(
'<dd\sclass="left">.*?<li>.*?</li>.*?<li>.*?</li>.*?<li>.*?</li>.*?<li>(.*?)</li>.*?<p>.*?:(.*?)<br />.*?:(.*?)<br />.*?:(.*?)<br