路线: requests + BeautifulSoup + re
关键之处:
具体估计是为了让爬取者麻烦一点,他在评论内容里加了'<br>'标签
但是,这是阻止不了我们的,嘻嘻。因为内容里加了'<br>',所以内容部分我们用BeautifulSoup的get_text()方法,它会把所有子节点中的文本拼接成一个完整的字符串返回(因此不必再用join方法合并),这样就得到了一个完整的段子。这一步是与之前的实例不同之处,其他的都用re.findall+正则表达式就完事了。
上马!
from bs4 import BeautifulSoup
import requests
import time
import re
# Request headers passed to requests.get(); the User-Agent is a desktop
# Chrome identification string.
key_value = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    ),
}
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The decoded page text, or ``None`` when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        web_data = requests.get(url, headers=key_value, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so they are reported as None.
        web_data.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no page"; programming
        # errors and KeyboardInterrupt are no longer swallowed.
        return None
    # Use the encoding detected from the body rather than the header default.
    web_data.encoding = web_data.apparent_encoding
    return web_data.text
def judge_sex(sex):
    """Translate a gender marker string into its display label.

    Returns '男' only when *sex* equals 'manIcon'; every other value
    yields '女'.
    """
    return '男' if sex == 'manIcon' else '女'
def get_info(html, mylist):
    """Parse one listing page and append a record per joke to *mylist*.

    Each record is a dict with the keys '昵称' (nickname), '性别' (gender),
    '年龄' (age), '内容' (content), '笑数' (laugh count) and '评论数'
    (comment count). All values are strings.

    Args:
        html: Page source as text. ``None`` or an empty string is accepted
            and produces no records.
        mylist: List that receives the extracted records (mutated in place).

    Returns:
        None.
    """
    if not html:
        # A failed download yields no page text; there is nothing to parse.
        return

    ids = re.findall(r'<h2>(.*?)</h2>', html, re.S)
    sexs = re.findall(r'<div class="articleGender (\w*?)">', html, re.S)
    # Accept any gender class here; matching only "manIcon" dropped the ages
    # of all other users and shifted every following row.
    ages = re.findall(r'<div class="articleGender \w*?">(\d+?)</div>', html, re.S)
    # The joke text contains <br> tags, so it is taken from the parsed tree
    # instead of being captured with a regular expression.
    soup = BeautifulSoup(html, 'lxml')
    contents = soup.select('div.content > span')
    laughs = re.findall(r'<span class="stats-vote"><i class="number">(\d+?)</i>', html, re.S)
    # Vote counts use the same <i class="number"> element; the lookbehind
    # skips them so they are not reported a second time as comment counts.
    comments = re.findall(
        r'(?<!<span class="stats-vote">)<i class="number">(\d+?)</i>', html, re.S
    )

    # NOTE(review): the fields are collected independently and paired by
    # position, so an entry that lacks one of them (for example no gender
    # block) would still misalign later rows -- confirm against the page
    # markup, or parse per-article containers instead.
    for nickname, sex, age, content, laugh, comment in zip(
            ids, sexs, ages, contents, laughs, comments):
        mylist.append({
            '昵称': nickname,
            '性别': judge_sex(sex),
            '年龄': age,
            # get_text() already returns a single string with the text of
            # all child nodes, so no join is required.
            '内容': content.get_text().strip(),
            '笑数': laugh,
            '评论数': comment,
        })
def write_file(mylist, path='D:/duanzi.txt'):
    """Append every record in *mylist* to a UTF-8 text file.

    Each record is written as six lines (nickname, gender, age, content,
    laugh count, comment count) followed by two blank lines.

    Args:
        mylist: Iterable of dicts holding the keys '昵称', '性别', '年龄',
            '内容', '笑数' and '评论数' with string values.
        path: Destination file; defaults to the original hard-coded location.

    Returns:
        None. Records that are malformed or cannot be encoded are skipped
        as a whole, so the file never contains a partially written record.
    """
    if not mylist:
        # Nothing to write: do not create or touch the file.
        return

    fields = ('昵称', '性别', '年龄', '内容', '笑数', '评论数')
    # Open the file once instead of once per record.
    with open(path, 'a', encoding='UTF-8') as myobject:
        for part in mylist:
            try:
                # Build the full record first so that a bad field cannot
                # leave a half-written entry behind.
                entry = '\n'.join(part[name] for name in fields) + '\n\n\n'
                myobject.write(entry)
            except (KeyError, TypeError, UnicodeError):
                # Best effort: skip records with missing or non-string
                # fields or unencodable text, and keep writing the rest.
                continue
if __name__ == '__main__':
    # Script entry point: ask how many listing pages to crawl, collect the
    # records from each page, then write them all to disk in one go.
    mylist = []
    depth = int(input('请输入你要爬取的页数'))
    url = 'https://www.qiushibaike.com/text/'
    # Start at page 1 so that asking for N pages fetches pages 1..N;
    # range(depth) requested "page/0/" and never reached page N.
    # NOTE(review): assumes the site numbers its pages from 1 -- confirm.
    for page in range(1, depth + 1):
        real_url = url + 'page/' + str(page) + '/'
        html = get_html(real_url)
        # get_html() returns None on failure; skip that page instead of
        # handing None to the parser.
        if html is not None:
            get_info(html, mylist)
        # Short pause between requests to avoid hammering the server.
        time.sleep(0.5)
    write_file(mylist)
Result!!!
热爱生活,热爱编程