爬虫复习一(parse里面的几种用法/正则的用法/多字符匹配,贪婪模式/多字符匹配,非贪婪模式/正则的五种用法)

正则
import re

from urllib import error
from urllib import request

class CollegateRank(object):

def get_page_data(self,url):
    """Fetch one ranking page, snapshot the raw HTML locally, then parse it.

    Args:
        url: Address of the ranking page to download.
    """
    html = self.send_request(url=url)
    if not html:
        # Request failed or returned nothing; nothing to parse.
        return

    # Keep a local copy of the raw page for offline inspection/debugging.
    with open('page.html','w',encoding='gbk') as snapshot:
        snapshot.write(html)

    self.parse_page_data(html)

    # TODO: extract the next-page link and issue a follow-up request.

def parse_page_data(self,response):
    pattern = re.compile('<div\sclass="scores_List">(.*?)<ul\sclass="fany">',re.S)
    sub_str = re.findall(pattern,response)[0]
    #解析分页列表数据
    pattern = re.compile(
        '<dl>.*?<a\shref="(.*?)".*?>'+
        '.*?<img.*?src="(.*?)".*?>'+
        '.*?<a.*?>(.*?)</a>'+
        '.*?<li>.*?:(.*?)</li>'+
        '.*?<li>(.*?)</li>'+
        '.*?<li>.*?:(.*?)</li>'+
        '.*?<li>.*?:(.*?)</li>'+
        '.*?<li>.*?:(.*?)</li>'+
        '.*?<li>.*?:(.*?)</li>.*?</dl>',
        re.S
    )
    ranks = re.findall(pattern,sub_str)
    print(ranks)

    for item in ranks:
        school_info = {}
        school_info['url'] = item[0] #详情url地址
        school_info['icon'] = item[1]
        school_info['name'] = item[2]
        school_info['adress'] = item[3]
        school_info['tese'] = '、'.join(re.findall('<span.*?>(.*?)</span>',item[4]))
        school_info['type'] = item[5]
        school_info['belong'] = item[6]
        school_info['level'] = item[7]
        school_info['weburl'] = item[8]

        print(school_info)

        xiangqing = self.send_request(url=school_info['url'])
        pattern = re.compile(
            '<dd\sclass="left">.*?<li>.*?</li>.*?<li>.*?</li>.*?<li>.*?</li>.*?<li>(.*?)</li>.*?<p>.*?:(.*?)<br />.*?:(.*?)<br />.*?:(.*?)<br
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值