The pages are crawled with three different methods, and the scraped data is cleaned into lists. The next step would be writing the results to an Excel file or similar, which is not covered here. The code is as follows:
import requests
import re
import time
import random
from bs4 import BeautifulSoup
from lxml import etree
def get_text(url, code):
    # Fetch a page and return its text decoded with the given encoding,
    # or an empty string if the request fails.
    try:
        r = requests.get(url, timeout=30)
        r.encoding = code
        r.raise_for_status()
        return r.text
    except Exception:
        return ''
def re_crawl(html):
    # Method 1: extract author, laugh count and joke text with regular expressions.
    ls = []
    authors = re.findall(r'<h2>(.*?)</h2>', html, re.S)
    contents = re.findall(r'<span>(.*?)</span>', html, re.S)
    laughs = re.findall(r'<i class="number">(\d{1,5})</i>', html, re.S)
    for author, content, laugh in zip(authors, contents, laughs):
        ls.append([author.replace('\n', ''), laugh,
                   content.replace('\n', '').replace('<br/>', '')])
    return ls
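As noted above, writing the cleaned list to a file is left out of the original code. A minimal sketch of that step using Python's built-in csv module is shown below; the function name save_to_csv, the output filename, and the column headers are my own assumptions, and the URL in the usage comment is only a placeholder.
import csv

def save_to_csv(rows, path='result.csv'):
    # Write the [author, laugh count, content] rows produced by re_crawl
    # to a CSV file that Excel can open directly.
    # utf-8-sig adds a BOM so Excel detects the encoding correctly.
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['author', 'laughs', 'content'])  # assumed header names
        writer.writerows(rows)

# Example usage (URL and encoding are placeholders, not from the original post):
# html = get_text('https://example.com/page/1/', 'utf-8')
# save_to_csv(re_crawl(html))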