"""糗事百科爬虫

爬取糗事百科的网页并提取数据保存为json格式
-工具:
-python
"""
import json
import re
import time

import requests
from lxml import etree
class QiuShi(object):
    """Scraper for qiushibaike.com list pages.

    Fetches paginated list pages, extracts per-item fields via XPath and
    appends each item as one JSON object per line to a local file.
    """

    def __init__(self):
        # List-page URL template; {} is filled with the 1-based page number.
        self.url = 'https://www.qiushibaike.com/8hr/page/{}'
        # Site root (kept for building absolute links).
        self.url_page = 'https://www.qiushibaike.com'
        # Desktop UA so the server returns the regular HTML layout.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }

    def get_url(self, end):
        """Return list-page URLs for pages 1..end (inclusive)."""
        return [self.url.format(num) for num in range(1, end + 1)]

    def get_data(self, url):
        """GET *url* and return the raw response body as bytes."""
        return requests.get(url, headers=self.headers).content

    def dispose_data(self, response):
        """Parse one list page (HTML bytes) into a list of item dicts.

        Missing optional fields fall back to the same defaults the file
        has always written ('null', None, anonymous avatar/user).
        """
        element = etree.HTML(response)
        # One <div> per item under the content column.
        divs = element.xpath('//*[@id="content-left"]/div')
        data_list = []
        for div in divs:
            item = {}
            # Avatar src is scheme-relative in the page; anonymous users get
            # the site's stock avatar.
            head = div.xpath('./div[1]/a//@src')
            item['head_img'] = 'https:' + head[0] if head else "https://static.qiushibaike.com/images/thumb/anony.png?v=b61e7f5162d14b7c0d5f419cd6649c87"
            # Username comes from the avatar's alt attribute.
            name = div.xpath('./div[1]/a//@alt')
            item['author_name'] = name[0].strip() if name else "匿名用户"
            # Age (left as-is, not stripped, to match the stored format).
            age = div.xpath('./div/div/text()')
            item['age'] = age[0] if age else 'null'
            # Gender is encoded in a CSS class like "articleGender manIcon".
            # re.search instead of findall[0] so an unexpected class string
            # degrades to 'null' rather than raising IndexError.
            genders = div.xpath('./div/div/@class')
            match = re.search(r'articleGender (\w+)Icon', genders[0]) if genders else None
            item['gender'] = match.group(1) if match else 'null'
            # Top comment text, if any.
            comment = div.xpath('.//a[2]/div/div/text()')
            item['content_text'] = comment[0].strip() if comment else "null"
            # Comment count.
            count = div.xpath('.//span[2]//i[1]/text()')
            item['content'] = count[0].strip() if count else None
            # The joke body itself.
            text = div.xpath('./a/div/span[1]//text()')
            item['text'] = text[0].strip() if text else None
            # "Funny" (vote) count.
            funny = div.xpath('.//span[1]/i/text()')
            item['funny'] = funny[0].strip() if funny else None
            data_list.append(item)
        return data_list

    def save_data(self, data_list):
        """Append each item dict as one JSON line to 糗事百科.json."""
        file_name = "糗事百科.json"
        with open(file_name, 'a', encoding='utf8') as f:
            for data in data_list:
                json.dump(data, f, ensure_ascii=False)
                f.write('\n')

    def run(self, end):
        """Scrape pages 1..end: fetch, parse and save each list page."""
        url_list = self.get_url(end)
        for url in url_list:
            print('正在生成%s页' % re.findall(r'page/\w+', url)[0])
            # Throttle requests to avoid hammering the site.
            time.sleep(1)
            # 1. fetch the raw page
            response = self.get_data(url)
            # 2. parse it into item dicts
            data_dict = self.dispose_data(response)
            # 3. append them to the JSON-lines file
            self.save_data(data_dict)
# Script entry point: removed the stray trailing backtick (SyntaxError) and
# guarded execution so importing this module does not start the scrape.
if __name__ == '__main__':
    qs = QiuShi()
    qs.run(13)