python 爬虫工具 butter_xpath re bs4 等爬虫解析器的性能比较

# -*- coding: utf-8 -*-

import re

import time

import scrapy

from bs4 import BeautifulSoup

class NewsSpider(scrapy.Spider):
    """Benchmark spider comparing several HTML extraction approaches.

    For the single page in ``start_urls`` the spider extracts the article
    title and the article body 500 times with each of the following
    approaches and prints the accumulated time of each one, followed by
    each one's ratio to the regular-expression timing:

    * ``re.findall`` on ``response.text``
    * ``response.xpath`` selectors
    * BeautifulSoup + html5lib, timing only the ``select_one`` lookups
    * BeautifulSoup + html5lib, timing document construction + lookups
    * BeautifulSoup + lxml, timing only the ``select_one`` lookups
    * BeautifulSoup + lxml, timing document construction + lookups
    """

    name = 'news'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baijiahao.baidu.com/s?id=1644707202199076031']

    def parse(self, response):
        """Run the extraction benchmark on ``response`` and print the results.

        Args:
            response: The response object handed to the callback by Scrapy;
                this method reads ``response.text`` and calls
                ``response.xpath``.

        Raises:
            IndexError: If the title regular expression finds no match.
            AttributeError: If a CSS selector matches no element
                (``select_one`` returned ``None``).
        """
        # NOTE(review): the original regex literals lost their HTML tags when
        # this file was extracted from a web page (only "(.*?)" survived, and
        # the first literal was split across lines). The patterns below are
        # reconstructed from the XPath/CSS selectors used further down --
        # confirm them against the real page markup.
        title_pattern = '<div class="article-title">\\s*<h2[^>]*>(.*?)</h2>'
        # NOTE(review): "bjh-p" is presumed to be the paragraph class inside
        # the #article container -- TODO confirm.
        content_pattern = '<span class="bjh-p">(.*?)</span>'

        re_time_list = []
        xpath_time_list = []
        lxml_time_list = []
        bs4_lxml_time_list = []
        html5lib_time_list = []
        bs4_html5lib_time_list = []

        # perf_counter() is used instead of time() because it is monotonic
        # and has the highest available resolution for short intervals.
        for _ in range(500):
            # re
            re_start_time = time.perf_counter()
            news_title = re.findall(pattern=title_pattern, string=response.text)[0]
            news_content = "".join(re.findall(pattern=content_pattern, string=response.text))
            re_time_list.append(time.perf_counter() - re_start_time)

            # xpath
            xpath_start_time = time.perf_counter()
            news_title = response.xpath("//div[@class='article-title']/h2/text()").extract_first()
            news_content = response.xpath('string(//*[@id="article"])').extract_first()
            xpath_time_list.append(time.perf_counter() - xpath_start_time)

            # bs4 html5lib without BeautifulSoup: the document is built
            # before the timer starts, so only the lookups are measured.
            soup = BeautifulSoup(response.text, "html5lib")
            html5lib_start_time = time.perf_counter()
            news_title = soup.select_one("div.article-title > h2").text
            news_content = soup.select_one("#article").text
            html5lib_time_list.append(time.perf_counter() - html5lib_start_time)

            # bs4 html5lib with BeautifulSoup: document construction is
            # included in the measured interval.
            bs4_html5lib_start_time = time.perf_counter()
            soup = BeautifulSoup(response.text, "html5lib")
            news_title = soup.select_one("div.article-title > h2").text
            news_content = soup.select_one("#article").text
            bs4_html5lib_time_list.append(time.perf_counter() - bs4_html5lib_start_time)

            # bs4 lxml without BeautifulSoup: only the lookups are measured.
            soup = BeautifulSoup(response.text, "lxml")
            lxml_start_time = time.perf_counter()
            news_title = soup.select_one("div.article-title > h2").text
            news_content = soup.select_one("#article").text
            lxml_time_list.append(time.perf_counter() - lxml_start_time)

            # bs4 lxml with BeautifulSoup: document construction is included
            # in the measured interval.
            bs4_lxml_start_time = time.perf_counter()
            soup = BeautifulSoup(response.text, "lxml")
            news_title = soup.select_one("div.article-title > h2").text
            news_content = soup.select_one("#article").text
            bs4_lxml_time_list.append(time.perf_counter() - bs4_lxml_start_time)

        # Total seconds spent per approach over all iterations.
        re_result = sum(re_time_list)
        xpath_result = sum(xpath_time_list)
        lxml_result = sum(lxml_time_list)
        html5lib_result = sum(html5lib_time_list)
        bs4_lxml_result = sum(bs4_lxml_time_list)
        bs4_html5lib_result = sum(bs4_html5lib_time_list)

        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
        print(f"re 使用时间:{re_result}")
        print(f"xpath 使用时间:{xpath_result}")
        print(f"lxml 纯解析使用时间:{lxml_result}")
        print(f"html5lib 纯解析使用时间:{html5lib_result}")
        print(f"bs4_lxml 转换解析使用时间:{bs4_lxml_result}")
        print(f"bs4_html5lib 转换解析使用时间:{bs4_html5lib_result}")
        print("\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")

        # Ratios relative to the regular-expression baseline.
        print(f"xpath/re :{xpath_result / re_result}")
        print(f"lxml/re :{lxml_result / re_result}")
        print(f"html5lib/re :{html5lib_result / re_result}")
        print(f"bs4_lxml/re :{bs4_lxml_result / re_result}")
        print(f"bs4_html5lib/re :{bs4_html5lib_result / re_result}")
        print("\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值