# -*- coding: utf-8 -*-
"""
Created on Fri Jan 19 22:59:33 2018
@author: Administrator
"""
import requests
import time
# Browser-like User-Agent header passed to requests.get() by the scrapers below.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    ),
}
import re
def re_scraper(url, timeout=10):
    """Download one page and extract a post record with regular expressions.

    Args:
        url: Address of the page to download.
        timeout: Seconds ``requests`` waits for the server before raising;
            without a timeout a stalled connection blocks indefinitely.

    Returns:
        A dict with the keys ``'id'``, ``'text'``, ``'haoxiao'`` and
        ``'pinglun'`` (all values are the matched strings) for the first
        record found on the page, or ``None`` when no record is found.

    NOTE(review): the indentation of this file was lost, so it cannot be
    told from the text whether the original ``return`` fired on the first
    or on the last record. The first record is assumed here -- confirm.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    html = res.text

    authors = re.findall(r'<h2>\n(.*?)\n</h2>', html)
    # The last <span> match is discarded, exactly as the original slice did.
    bodies = re.findall(r'<span>\s+(.*?)\s+</span>', html)[:-1]
    votes = re.findall(
        r'<span class="stats-vote"><i class="number">(\d+)</i> 好笑</span>',
        html,
    )
    comments = re.findall(r'<i class="number">(\d+)</i> 评论', html)

    # zip() stops at the shortest list, so if any pattern matched nothing
    # there is no record to return.
    first = next(zip(authors, bodies, votes, comments), None)
    if first is None:
        return None

    author, body, vote, comment = first
    return {
        'id': author,
        'text': body,
        'haoxiao': vote,
        'pinglun': comment,
    }
from bs4 import BeautifulSoup
def bs_scraper(url, timeout=10):
    """Download one page and extract a post record with BeautifulSoup.

    Args:
        url: Address of the page to download.
        timeout: Seconds ``requests`` waits for the server before raising;
            without a timeout a stalled connection blocks indefinitely.

    Returns:
        A dict with the keys ``'id'`` and ``'text'`` (stripped strings) and
        ``'haoxiao'`` and ``'pinglun'`` (ints) for the first record found on
        the page, or ``None`` when no record is found.

    NOTE(review): the indentation of this file was lost, so it cannot be
    told from the text whether the original ``return`` fired on the first
    or on the last record. The first record is assumed here -- confirm.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(res.text, 'html.parser')

    # Post bodies.
    text_list = [tag.text.strip() for tag in soup.find_all('div', class_='content')]
    # Author nicknames.
    id_list = [tag.text.strip() for tag in soup.find_all('h2')]
    # "Funny" vote counts.
    xiao_list = [int(tag.i.text) for tag in soup.find_all('span', class_='stats-vote')]

    # Comment counts. Some comment links have no <i class="number"> child,
    # in which case find() returns None and the entry is skipped.
    # NOTE(review): skipping shortens only this list, so later positions may
    # be zipped with a different post than they belong to -- confirm whether
    # that matters to callers.
    ping_list = []
    for link in soup.find_all('a', class_='qiushi_comments'):
        counter = link.find('i', 'number')
        if counter is not None:
            ping_list.append(int(counter.text))

    # zip() stops at the shortest list; an empty list means no record.
    first = next(zip(id_list, text_list, xiao_list, ping_list), None)
    if first is None:
        return None

    author, body, vote, comment = first
    return {
        'id': author,
        'text': body,
        'haoxiao': vote,
        'pinglun': comment,
    }
from lxml import etree
def xp_scraper(url, timeout=10):
    """Download one page and extract a post record with lxml XPath queries.

    Args:
        url: Address of the page to download.
        timeout: Seconds ``requests`` waits for the server before raising;
            without a timeout a stalled connection blocks indefinitely.

    Returns:
        A dict with the keys ``'id'``, ``'text'``, ``'haoxiao'`` and
        ``'pinglun'`` (all stripped strings) for the first post container
        found, or ``None`` when the page has no matching container.

    Raises:
        IndexError: If a matched container lacks the text, vote or comment
            node that the relative XPath expressions expect.

    NOTE(review): the indentation of this file was lost, so it cannot be
    told from the text whether the original ``return`` fired on the first
    or on the last record. The first record is assumed here -- confirm.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    tree = etree.HTML(res.text)

    # Post containers come in three class variants; each XPath selects the
    # element that wraps one whole post.
    for variant in ('long', 'hot', 'old'):
        container_xpath = '//div[@class="article block untagged mb15 typs_{}"]'.format(variant)
        for article in tree.xpath(container_xpath):
            names = article.xpath('div[1]/a[2]/h2/text()')
            # Posts without a nickname node get the placeholder name.
            author = names[0].strip() if names else '匿名'
            body = article.xpath('a[1]/div/span/text()')[0].strip()
            vote = article.xpath('div[2]/span[1]/i/text()')[0].strip()
            comment = article.xpath('div[2]/span[2]/a/i/text()')[0].strip()
            return {
                'id': author,
                'text': body,
                'haoxiao': vote,
                'pinglun': comment,
            }
    return None
if __name__ == '__main__':
    # Time each scraper over the same 30 pages and print the elapsed seconds.
    for name, scraper in [('re', re_scraper), ('bs', bs_scraper), ('xp', xp_scraper)]:
        # perf_counter() is the clock intended for measuring intervals;
        # time.time() follows the wall clock and can jump if it is adjusted.
        start = time.perf_counter()
        for page in range(1, 31):
            page_url = 'https://www.qiushibaike.com/text/page/{}/'.format(page)
            scraper(page_url)
        end = time.perf_counter()
        print(name, end - start)
# Results (elapsed time in seconds for each scraper):
# re 11.39765214920044
# bs 21.445226430892944
# xp 11.944683313369751