增加了耗时统计,并将结果直接写入 Excel;目前还有一些问题需要解决:
使用 XPath 爬取时发现,作者名有些位于 <span> 标签下,有些位于 <a> 标签下,不知道如何才能把这两种情况合并到一起提取,发帖求问。
import requests
import re
import time
import random
import xlwt
from bs4 import BeautifulSoup
from lxml import etree
def get_text(url: str, code: str) -> str:
    """Download *url* and return the response body decoded with *code*.

    Args:
        url: Address of the page to fetch.
        code: Name of the text encoding assigned to the response before
            its body is decoded (for example ``'utf-8'``).

    Returns:
        The decoded response body, or an empty string when the request
        fails or the server replies with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx replies into an exception so they take the same
        # "return empty string" path as connection failures.
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: network and HTTP errors yield an empty page.
        # Only requests' own errors are caught, so KeyboardInterrupt and
        # SystemExit still propagate and the script can be stopped.
        return ''
    r.encoding = code
    return r.text
def re_crawl(html,ls):
start = time.time()
authors = re.findall('<h2>(.*?)</h2>', html, re.S)
contents = re.findall('<span>(.*?)</span>', html, re.S)
laughs = re.findall('<i class="number">(\d{1,5})</i>', html, re.S)
for author, content, laugh in zip(authors, contents, laughs):
ls.append([author.strip(), laugh, content.strip().replace(