python 爬虫工具 butter_xpath re bs4 等爬虫解析器的性能比较

最新推荐文章于 2022-06-28 17:28:37 发布

weixin_39849800

最新推荐文章于 2022-06-28 17:28:37 发布

阅读量195

点赞数

文章标签： python 爬虫工具 butter

版权声明：本文为博主原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。

本文链接：https://blog.csdn.net/weixin_39849800/article/details/114919563

版权

# -*- coding: utf-8 -*-

import re

import time

import scrapy

from bs4 import BeautifulSoup

class NewsSpider(scrapy.Spider):
    """Benchmark spider comparing several ways of extracting text from one page.

    The spider downloads a single article page and, inside ``parse``,
    extracts the article title and body repeatedly with:

    * plain regular expressions (``re``),
    * the response's own XPath selectors,
    * BeautifulSoup with the ``html5lib`` parser,
    * BeautifulSoup with the ``lxml`` parser.

    For both BeautifulSoup parsers two figures are collected: the time for
    the selection alone (the soup object is built outside the timed region)
    and the time including construction of the soup object.
    """

    name = 'news'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baijiahao.baidu.com/s?id=1644707202199076031']

    def parse(self, response):
        """Time each extraction strategy 500 times and print totals and ratios.

        Args:
            response: The downloaded page; ``response.text`` and
                ``response.xpath`` are used.

        Raises:
            IndexError: If the title regex finds no match.
            AttributeError: If a CSS selector matches no element.
        """
        re_time_list = []
        xpath_time_list = []
        lxml_time_list = []
        bs4_lxml_time_list = []
        html5lib_time_list = []
        bs4_html5lib_time_list = []

        # perf_counter() is the high-resolution clock intended for measuring
        # short intervals; time.time() can be too coarse, letting tiny
        # intervals sum to 0 and breaking the ratio computation below.
        clock = time.perf_counter

        for _ in range(500):
            # --- re ---------------------------------------------------------
            # NOTE(review): the HTML tags inside both patterns were lost in the
            # copy this file was recovered from. They are reconstructed here
            # from the XPath/CSS selectors used below (title in an <h2>) and an
            # assumed <span class="bjh-p"> paragraph wrapper - confirm against
            # the real page markup.
            re_start_time = clock()
            news_title = re.findall(pattern=r"<h2>(.*?)</h2>", string=response.text)[0]
            news_content = "".join(
                re.findall(pattern=r'<span class="bjh-p">(.*?)</span>', string=response.text)
            )
            re_time_list.append(clock() - re_start_time)

            # --- xpath (selectors provided by the response object) ----------
            xpath_start_time = clock()
            news_title = response.xpath("//div[@class='article-title']/h2/text()").extract_first()
            news_content = response.xpath('string(//*[@id="article"])').extract_first()
            xpath_time_list.append(clock() - xpath_start_time)

            # --- bs4 + html5lib, selection only -----------------------------
            # The soup is built before the clock starts, so only the CSS
            # selection is measured.
            soup = BeautifulSoup(response.text, "html5lib")
            html5lib_start_time = clock()
            news_title = soup.select_one("div.article-title > h2").text
            news_content = soup.select_one("#article").text
            html5lib_time_list.append(clock() - html5lib_start_time)

            # --- bs4 + html5lib, including BeautifulSoup construction -------
            bs4_html5lib_start_time = clock()
            soup = BeautifulSoup(response.text, "html5lib")
            news_title = soup.select_one("div.article-title > h2").text
            news_content = soup.select_one("#article").text
            bs4_html5lib_time_list.append(clock() - bs4_html5lib_start_time)

            # --- bs4 + lxml, selection only ---------------------------------
            soup = BeautifulSoup(response.text, "lxml")
            lxml_start_time = clock()
            news_title = soup.select_one("div.article-title > h2").text
            news_content = soup.select_one("#article").text
            lxml_time_list.append(clock() - lxml_start_time)

            # --- bs4 + lxml, including BeautifulSoup construction -----------
            bs4_lxml_start_time = clock()
            soup = BeautifulSoup(response.text, "lxml")
            news_title = soup.select_one("div.article-title > h2").text
            news_content = soup.select_one("#article").text
            bs4_lxml_time_list.append(clock() - bs4_lxml_start_time)

        # Total time spent by each strategy over all iterations.
        re_result = sum(re_time_list)
        xpath_result = sum(xpath_time_list)
        lxml_result = sum(lxml_time_list)
        html5lib_result = sum(html5lib_time_list)
        bs4_lxml_result = sum(bs4_lxml_time_list)
        bs4_html5lib_result = sum(bs4_html5lib_time_list)

        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
        print(f"re 使用时间：{re_result}")
        print(f"xpath 使用时间：{xpath_result}")
        print(f"lxml 纯解析使用时间：{lxml_result}")
        print(f"html5lib 纯解析使用时间：{html5lib_result}")
        print(f"bs4_lxml 转换解析使用时间：{bs4_lxml_result}")
        print(f"bs4_html5lib 转换解析使用时间：{bs4_html5lib_result}")

        # Ratios use the regex total as the baseline.
        print("\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
        print(f"xpath/re ：{xpath_result / re_result}")
        print(f"lxml/re ：{lxml_result / re_result}")
        print(f"html5lib/re ：{html5lib_result / re_result}")
        print(f"bs4_lxml/re ：{bs4_lxml_result / re_result}")
        print(f"bs4_html5lib/re ：{bs4_html5lib_result / re_result}")
        print("\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")

weixin_39849800

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。