import requests
from lxml import etree
import json
class QiuSpider:
    """Scrape the text-post listing pages of qiushibaike.com.

    For every listing page the spider extracts one dict per post (content,
    author name/age/gender, images, vote count) and appends it as indented
    JSON to the file ``糗事百科.text`` in the current working directory.
    """

    def __init__(self, max_page=13, timeout=10):
        """Set up URL templates, request headers and crawl limits.

        Args:
            max_page: Number of the last listing page to fetch; pages
                ``1..max_page`` are crawled. Defaults to 13.
            timeout: Timeout in seconds passed to every HTTP request.
        """
        # Template of a listing page; the placeholder is the page number.
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"
        # Template used to turn a site-relative href into a full URL.
        self.url_home = "https://www.qiushibaike.com{}"
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"}
        self.max_page = max_page
        self.timeout = timeout

    @staticmethod
    def _first(values, default=None):
        """Return the first element of *values*, or *default* if it is empty."""
        return values[0] if values else default

    def get_url_list(self):
        """Return the listing-page URLs for pages 1 through ``max_page``."""
        return [self.url_temp.format(page) for page in range(1, self.max_page + 1)]

    def parse_url(self, url):
        """Fetch *url* and return the response body decoded as UTF-8 text.

        The URL is printed before the request is sent. A timeout is passed
        to ``requests.get`` so an unresponsive server cannot block the
        crawl indefinitely.
        """
        print(url)
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        return response.content.decode()

    def supplement_text(self, href_str):
        """Fetch the detail page of a truncated post and return its text.

        Args:
            href_str: List of hrefs as returned by an XPath ``@href`` query;
                only the first entry is used.

        Returns:
            The first text node of the ``div`` with class ``content`` on the
            detail page, or ``None`` when no href was supplied or the page
            contains no such text.
        """
        if not href_str:
            # No link to follow: report missing content instead of raising.
            return None
        supplement_url = self.url_home.format(href_str[0])
        html_sup = etree.HTML(self.parse_url(supplement_url))
        return self._first(html_sup.xpath("//div[@class='content']/text()"))

    def _parse_item(self, div):
        """Build the record dict for a single post element *div*."""
        item = {}

        # Post content: a "查看全文" (view full text) span marks a truncated
        # post, whose content is fetched from its detail page instead.
        if div.xpath(".//div[@class='content']/span[text()='查看全文']"):
            supplement_href = div.xpath(".//a[@class='contentHerf']/@href")
            item["content"] = self.supplement_text(supplement_href)
        else:
            text = self._first(div.xpath(".//div[@class='content']/span/text()"))
            item["content"] = text.replace("\n", "") if text is not None else None

        # Author name, taken from the alt text of an image inside a link.
        item["author_name"] = self._first(div.xpath(".//div/a/img/@alt"))

        # Author age: text of the div whose class contains 'articleGender'.
        item["author_age"] = self._first(
            div.xpath(".//div[contains(@class,'articleGender')]/text()")
        )

        # Author gender: last token of that div's class attribute with the
        # "Icon" part removed.
        gender_class = self._first(
            div.xpath(".//div[contains(@class,'articleGender')]/@class")
        )
        item["author_gender"] = (
            gender_class.split(" ")[-1].replace("Icon", "")
            if gender_class is not None
            else None
        )

        # Image attached to the post; the src is prefixed with a scheme.
        content_img = self._first(div.xpath(".//div[@class='thumb']/a/img/@src"))
        item["content_img"] = "https:" + content_img if content_img is not None else None

        # Author avatar; the src is prefixed with a scheme.
        author_img = self._first(div.xpath(".//div[@class='author clearfix']//img/@src"))
        item["author_img"] = "https:" + author_img if author_img is not None else None

        # Vote count shown for the post.
        item["stats_vote"] = self._first(div.xpath(".//span[@class='stats-vote']/i/text()"))
        return item

    def get_content_list(self, html_str):
        """Parse a listing page and return one dict per post.

        Each dict has the keys ``content``, ``author_name``, ``author_age``,
        ``author_gender``, ``content_img``, ``author_img`` and
        ``stats_vote``; a value is ``None`` when the corresponding element
        is missing from the page.
        """
        html = etree.HTML(html_str)
        # Every direct child div of #content-left is treated as one post.
        div_list = html.xpath("//div[@id='content-left']/div")
        return [self._parse_item(div) for div in div_list]

    def save_content_list(self, content_list):
        """Append every item of *content_list* to the output file as JSON.

        The file is opened once for the whole batch. Nothing is opened or
        created when *content_list* is empty.
        """
        if not content_list:
            return
        with open('糗事百科.text', 'a', encoding='utf-8') as f:
            for c in content_list:
                f.write(json.dumps(c, ensure_ascii=False, indent=2))
                f.write("\n")

    def run(self):
        """Crawl every listing page, extract its posts and save them."""
        # 1. Build the list of listing-page URLs.
        url_list = self.get_url_list()
        # 2. Fetch each page.
        for url in url_list:
            html_str = self.parse_url(url)
            # 3. Extract the post data.
            content_list = self.get_content_list(html_str)
            # 4. Persist it.
            self.save_content_list(content_list)
# Script entry point: run a full crawl only when executed directly.
if __name__ == '__main__':
    QiuSpider().run()