1、说明:利用requests和正则表达式方法,爬取糗事百科网中“文字”专题的段子信息,并把爬取的数据存储在本地文件中。
2、爬虫分析:
(1)网址信息及规律:
https://www.qiushibaike.com/text/page/1/
https://www.qiushibaike.com/text/page/2/
https://www.qiushibaike.com/text/page/3/
(2)爬取信息:用户ID、用户等级、用户性别、发表段子文字信息、好笑数量和评论数量
(3)将数据保存在txt文件中
3、编写代码:
D:\Anaconda\Python_Workplace\Chapter01\QiuShiBaiKe.py
import requests as rs
import os
import time as t
import re
# HTTP headers passed to every request made by get_info(). Only the
# User-Agent is set; the value identifies the client as a desktop browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134'
    ),
}
# Text file that get_info() writes the scraped records to.
OUTPUT_PATH = 'C:/Users/huake/Desktop/QiuShiBaiKe.txt'

# Start every run from an empty file by deleting output left over from a
# previous run. Opening in 'a+' below creates the file when it is missing,
# so no separate create-and-close step is needed.
if os.path.exists(OUTPUT_PATH):
    os.remove(OUTPUT_PATH)

# Module-level handle shared with get_info() and closed at the end of the
# script. UTF-8 is set explicitly because the records contain Chinese text.
f = open(OUTPUT_PATH, 'a+', encoding='utf-8')
def judgement_sex(class_name):
    """Translate a gender CSS class name into a Chinese gender label.

    Args:
        class_name: Class-name fragment taken from the page markup.

    Returns:
        '男' (male) when ``class_name`` is exactly ``'manIcon'``;
        '女' (female) for every other value, including an empty string.
    """
    return '男' if class_name == 'manIcon' else '女'
def _parse_page(content):
    """Return a list of per-post record dicts extracted from one page of HTML.

    Each field is collected with its own page-wide regular expression and
    the five result lists are then paired up by position.
    """
    # Raw strings keep ``\d`` as a regex escape rather than an (invalid)
    # Python string escape. re.S lets ``.`` match newlines, so a capture
    # may span several lines of markup.
    titles = re.findall(r'<h2>(.*?)</h2>', content, re.S)
    sexs_levels = re.findall(
        r'<div class="articleGender(.*?)</div>', content, re.S)
    # The lazy ``.*?`` skips whatever markup sits between the content div
    # and its first <span>; only that first span's text is captured.
    texts = re.findall(
        r'<div class="content">.*?<span>(.*?)</span>', content, re.S)
    laughs = re.findall(
        r'<span class="stats-vote"><i class="number">(\d+)</i>',
        content, re.S)
    comments = re.findall(r'<i class="number">(\d+)</i> 评论', content, re.S)

    # NOTE(review): zip() pairs the lists by position and stops at the
    # shortest one. If any pattern matches a different number of times on a
    # page, fields from different posts get combined -- confirm against the
    # live markup.
    records = []
    for title, sex_level, text, laugh, comment in zip(
            titles, sexs_levels, texts, laughs, comments):
        # The capture is expected to be `<gender class>"><level>`; splitting
        # on `">` separates the two parts. A missing separator yields an
        # empty level instead of an IndexError.
        parts = sex_level.strip().split('">')
        level = parts[1] if len(parts) > 1 else ''
        records.append({
            '用户ID': title.strip(),
            '用户等级': level,
            '用户性别': judgement_sex(parts[0]),
            '段子': text.strip(),
            '搞笑数': laugh.strip(),
            '评论数': comment.strip()
        })
    return records


def get_info(url):
    """Fetch one listing page, then save and print every record found on it.

    Each record is written to the module-level file ``f`` as the ``str()``
    of a dict followed by a blank line, and is also printed to stdout.

    Args:
        url: Address of the listing page to download.

    Returns:
        None.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    # A timeout keeps an unresponsive server from blocking the script forever.
    wb_data = rs.get(url, headers=headers, timeout=10)
    # Decode the raw bytes explicitly as UTF-8 instead of relying on the
    # encoding that requests guesses for ``.text``.
    content = wb_data.content.decode('utf-8')
    for data in _parse_page(content):
        f.write(str(data) + '\n\n')
        print(data)
if __name__ == '__main__':
    # Listing pages 1 and 2 (the upper bound of range() is exclusive).
    urls = ['https://www.qiushibaike.com/text/page/{}/'.format(i)
            for i in range(1, 3)]
    try:
        for url in urls:
            get_info(url)
            # Pause one second between page requests.
            t.sleep(1)
    finally:
        # Close the shared output file even when a fetch or parse fails,
        # so records already written are flushed to disk.
        f.close()