使用xpath爬取糗事百科的数据
from lxml import etree
# Extract per-post nodes from the page (snippet excerpt from the full code below).
# NOTE(review): `html` is assumed to hold the fetched page source — it is defined
# elsewhere; this fragment is not runnable on its own.
html_data= etree.HTML(html)
list_data= html_data.xpath('.//div[@id="content-left"]/div')
代码详情
# -*- coding:utf-8 -*-
import requests
from lxml import etree
class Spider_QSBK(object):
    '''Qiushibaike (糗事百科) text-joke spider.

    Fetches the /text/ listing pages, extracts author, content, vote stats and
    comment count for each post via XPath, and appends one dict per post to
    qiubai.txt.
    '''

    def __init__(self):
        '''Initialize the request headers and the page-URL template.'''
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36",
        }
        self.start_url = 'https://www.qiushibaike.com/text/page/{}/'

    def get_url(self):
        '''Build the list of page URLs to crawl (currently page 1 only).'''
        self.list_url = [self.start_url.format(i) for i in range(1, 2)]

    def send_request(self):
        '''Send a GET request for each URL and hand the decoded HTML to get_data.'''
        for url in self.list_url:
            # BUG FIX: the original passed self.headers positionally, which
            # requests.get() binds to `params` (query string), so the
            # User-Agent header was never actually sent. Pass it by keyword.
            response = requests.get(url, headers=self.headers)
            html = response.content.decode()
            self.get_data(html)

    @staticmethod
    def _first_text(node, path):
        '''Return the stripped first text hit of `path` under `node`, or None.

        Returns None both when the xpath matches nothing and when the match
        strips to an empty string, so missing fields never raise IndexError.
        '''
        hits = node.xpath(path)
        return (hits[0].strip() or None) if hits else None

    def get_data(self, html):
        '''Extract one record per post from the page HTML and save each one.'''
        html_data = etree.HTML(html)
        list_data = html_data.xpath('.//div[@id="content-left"]/div')
        for data in list_data:
            data_dict = dict(
                author=self._first_text(data, './div/a/h2/text()'),
                content=self._first_text(data, './a/div/span/text()'),
                # BUG FIX: the original indexed [0] unconditionally for
                # stats/count (and inside the content guard), raising
                # IndexError whenever a post lacked those nodes.
                stats=self._first_text(data, './div/span/i/text()'),
                count=self._first_text(data, './div/span/a/i/text()'),
            )
            self.save_data(data_dict)

    def save_data(self, data_dict):
        '''Append one record to qiubai.txt, one dict repr per line.'''
        with open('qiubai.txt', 'a', encoding='utf8') as f:
            f.write(str(data_dict) + '\n')

    def run(self):
        '''Entry point: build the URL list, then crawl every page in it.'''
        # Build the URL list
        self.get_url()
        # Send the requests
        self.send_request()
# Script entry point: instantiate the spider and start the crawl.
if __name__ == '__main__':
    spider = Spider_QSBK()
    spider.run()