# -*- coding: utf-8 -*-
import re
import time
import scrapy
from bs4 import BeautifulSoup
class NewsSpider(scrapy.Spider):
    """Spider that benchmarks several ways of extracting a title and body.

    The single start page is fetched once; ``parse`` then repeats the same
    title/content extraction with ``re``, Scrapy XPath selectors, and
    BeautifulSoup on the ``html5lib`` and ``lxml`` parsers, and prints the
    accumulated wall-clock time of each approach plus its ratio to ``re``.
    """

    name = 'news'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baijiahao.baidu.com/s?id=1644707202199076031']

    # Number of timed repetitions of every extraction method.
    benchmark_rounds = 500

    # NOTE(review): the markup inside both regex literals was lost in the
    # reviewed source (only "(.*?)" survived). The title pattern mirrors the
    # <h2> targeted by the XPath/CSS selectors below; the content pattern is
    # a best-effort reconstruction -- confirm both against the live page.
    title_pattern = r"<h2>(.*?)</h2>"
    content_pattern = r'<span class="bjh-p">(.*?)</span>'

    @staticmethod
    def _ratio(elapsed, baseline):
        """Return ``elapsed / baseline``, or NaN when ``baseline`` is zero."""
        return elapsed / baseline if baseline else float("nan")

    def parse(self, response):
        """Run the extraction benchmark on ``response`` and print a report.

        Args:
            response: the downloaded page; its ``text`` attribute and
                ``xpath`` method are used.

        Raises:
            IndexError: if the title regex finds no match.
            AttributeError: if a CSS selector matches nothing, because
                ``select_one`` then returns ``None``.
        """
        # Read the page text once so that any one-time cost of the first
        # ``response.text`` access is not charged to the first timed method.
        html = response.text
        # perf_counter is the clock intended for measuring short durations.
        clock = time.perf_counter

        re_result = 0.0
        xpath_result = 0.0
        lxml_result = 0.0
        html5lib_result = 0.0
        bs4_lxml_result = 0.0
        bs4_html5lib_result = 0.0

        for _ in range(self.benchmark_rounds):
            # re
            started = clock()
            news_title = re.findall(pattern=self.title_pattern, string=html)[0]
            news_content = "".join(
                re.findall(pattern=self.content_pattern, string=html)
            )
            re_result += clock() - started

            # xpath
            started = clock()
            news_title = response.xpath(
                "//div[@class='article-title']/h2/text()"
            ).extract_first()
            news_content = response.xpath(
                'string(//*[@id="article"])'
            ).extract_first()
            xpath_result += clock() - started

            # bs4 + html5lib, queries only (tree is built before the clock starts)
            soup = BeautifulSoup(html, "html5lib")
            started = clock()
            news_title = soup.select_one("div.article-title > h2").text
            news_content = soup.select_one("#article").text
            html5lib_result += clock() - started

            # bs4 + html5lib, including construction of the BeautifulSoup tree
            started = clock()
            soup = BeautifulSoup(html, "html5lib")
            news_title = soup.select_one("div.article-title > h2").text
            news_content = soup.select_one("#article").text
            bs4_html5lib_result += clock() - started

            # bs4 + lxml, queries only (tree is built before the clock starts)
            soup = BeautifulSoup(html, "lxml")
            started = clock()
            news_title = soup.select_one("div.article-title > h2").text
            news_content = soup.select_one("#article").text
            lxml_result += clock() - started

            # bs4 + lxml, including construction of the BeautifulSoup tree
            started = clock()
            soup = BeautifulSoup(html, "lxml")
            news_title = soup.select_one("div.article-title > h2").text
            news_content = soup.select_one("#article").text
            bs4_lxml_result += clock() - started

        # Absolute totals per method.
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
        print(f"re 使用时间:{re_result}")
        print(f"xpath 使用时间:{xpath_result}")
        print(f"lxml 纯解析使用时间:{lxml_result}")
        print(f"html5lib 纯解析使用时间:{html5lib_result}")
        print(f"bs4_lxml 转换解析使用时间:{bs4_lxml_result}")
        print(f"bs4_html5lib 转换解析使用时间:{bs4_html5lib_result}")
        print("\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
        # Totals relative to the regex baseline; guarded so a zero baseline
        # prints NaN instead of raising ZeroDivisionError.
        print(f"xpath/re :{self._ratio(xpath_result, re_result)}")
        print(f"lxml/re :{self._ratio(lxml_result, re_result)}")
        print(f"html5lib/re :{self._ratio(html5lib_result, re_result)}")
        print(f"bs4_lxml/re :{self._ratio(bs4_lxml_result, re_result)}")
        print(f"bs4_html5lib/re :{self._ratio(bs4_html5lib_result, re_result)}")
        print("\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")