Python 三种方法爬取糗事百科的耗时对比

# -*- coding: utf-8 -*-
"""
Created on Fri Jan 19 22:59:33 2018

@author: Administrator
"""

import requests
import time
# Request headers sent with every download; the User-Agent string is that of
# a desktop Chrome 62 browser on Windows.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/62.0.3202.94 Safari/537.36'
    ),
}

import re
def re_scraper(url, timeout=10):
    """Download ``url`` and extract the first post using regular expressions.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds ``requests`` waits on the server before raising;
            without a timeout a stalled connection would block forever.

    Returns:
        A dict with the keys ``'id'``, ``'text'``, ``'haoxiao'`` and
        ``'pinglun'`` whose values are the strings captured by the patterns,
        or ``None`` when at least one pattern produced no usable match.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    html = res.text

    names = re.findall(r'<h2>\n(.*?)\n</h2>', html)
    # The final <span> match is discarded, as in the original pattern usage.
    texts = re.findall(r'<span>\s+(.*?)\s+</span>', html)[:-1]
    votes = re.findall(r'<span class="stats-vote"><i class="number">(\d+)</i> 好笑</span>', html)
    comments = re.findall(r'<i class="number">(\d+)</i> 评论', html)

    # NOTE(review): only the first aligned record is returned, which keeps the
    # historical return shape; collecting every record would change the
    # return type and should be agreed with callers first.
    first = next(zip(names, texts, votes, comments), None)
    if first is None:
        return None
    name, text, vote, comment = first
    return {'id': name,
            'text': text,
            'haoxiao': vote,
            'pinglun': comment}

from bs4 import BeautifulSoup
def bs_scraper(url, timeout=10):
    """Download ``url`` and extract the first post using BeautifulSoup.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds ``requests`` waits on the server before raising;
            without a timeout a stalled connection would block forever.

    Returns:
        A dict with the keys ``'id'`` and ``'text'`` (stripped strings) and
        ``'haoxiao'`` and ``'pinglun'`` (ints), or ``None`` when any of the
        four extracted lists is empty.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(res.text, 'html.parser')

    # Post text
    texts = [node.text.strip() for node in soup.find_all('div', class_='content')]
    # Nickname
    names = [node.text.strip() for node in soup.find_all('h2')]
    # "Funny" vote count
    votes = [int(node.i.text) for node in soup.find_all('span', class_='stats-vote')]
    # Comment count: some matched anchors carry no <i class="number"> child,
    # so those lookups come back as None and are skipped.
    # NOTE(review): skipping entries may shift later counts relative to the
    # other lists - confirm against the page markup.
    counters = [anchor.find('i', 'number')
                for anchor in soup.find_all('a', class_='qiushi_comments')]
    comments = [int(counter.text) for counter in counters if counter is not None]

    # NOTE(review): only the first aligned record is returned, which keeps the
    # historical return shape; collecting every record would change the
    # return type and should be agreed with callers first.
    first = next(zip(names, texts, votes, comments), None)
    if first is None:
        return None
    name, text, vote, comment = first
    return {'id': name,
            'text': text,
            'haoxiao': vote,
            'pinglun': comment}

from lxml import etree
def xp_scraper(url, timeout=10):
    """Download ``url`` and extract the first post using lxml XPath queries.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds ``requests`` waits on the server before raising;
            without a timeout a stalled connection would block forever.

    Returns:
        A dict with the keys ``'id'``, ``'text'``, ``'haoxiao'`` and
        ``'pinglun'`` holding stripped strings, or ``None`` when no article
        element matches any of the three class variants.

    Raises:
        IndexError: If the first matched article has no text, vote-count or
            comment-count node at the expected path.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    tree = etree.HTML(res.text)

    # Article containers come in three class variants; each XPath selects the
    # element that wraps one complete post.
    article_paths = [
        '//div[@class="article block untagged mb15 typs_{}"]'.format(kind)
        for kind in ('long', 'hot', 'old')
    ]

    for path in article_paths:
        for article in tree.xpath(path):
            name_nodes = article.xpath('div[1]/a[2]/h2/text()')
            # Posts without a nickname node are labelled '匿名' ("anonymous").
            name = name_nodes[0].strip() if name_nodes else '匿名'
            text = article.xpath('a[1]/div/span/text()')[0].strip()
            vote = article.xpath('div[2]/span[1]/i/text()')[0].strip()
            comment = article.xpath('div[2]/span[2]/a/i/text()')[0].strip()
            # NOTE(review): returns on the first article found, which keeps
            # the historical return shape; collecting every record would
            # change the return type and should be agreed with callers first.
            return {'id': name,
                    'text': text,
                    'haoxiao': vote,
                    'pinglun': comment}
    return None

if __name__ == '__main__':
    # Run each scraper over the same 30 listing pages and print its name
    # followed by the elapsed time in seconds.
    scrapers = [('re', re_scraper), ('bs', bs_scraper), ('xp', xp_scraper)]
    for name, scraper in scrapers:
        # perf_counter() is the clock intended for measuring intervals;
        # time.time() follows the wall clock, which can be adjusted mid-run.
        start = time.perf_counter()
        for page in range(1, 31):
            scraper('https://www.qiushibaike.com/text/page/{}/'.format(page))
        elapsed = time.perf_counter() - start
        print(name, elapsed)

结果如下(各方法爬取 30 页的总耗时,单位:秒):
re 11.39765214920044
bs 21.445226430892944
xp 11.944683313369751

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值