目标网址:https://mil.news.sina.com.cn/roll/index.d.html?cid=57918&page=1
1.导入模块
import json
import re
import requests
from lxml import etree
2.初始化函数
class Sina:
    """Crawler for Sina military news list pages (XPath-based)."""

    def __init__(self):
        # Paged list URL; "{}" is filled with a page number by get_url_list().
        self.start_url = "https://mil.news.sina.com.cn/roll/index.d.html?cid=57918&page={}"
        # Desktop Chrome User-Agent so the site serves the regular HTML page.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"}
3.构造url列表
def get_url_list(self):#构造url列表
url_list=[]
for i in range(1,5):
url_list.append(self.start_url.format(i))
return url_list
4.发送请求,获取响应
def parse_url(self, url):
    """GET *url* with the crawler's headers and return the body as text.

    A timeout is set so a stalled connection cannot block the crawler
    forever (the original call had no timeout and could hang indefinitely).
    """
    response = requests.get(url, headers=self.headers, timeout=10)
    # Decode the raw bytes; decode() defaults to UTF-8, which the site serves.
    return response.content.decode()
5.提取数据
def get_content_list(self, html_str):
    """Parse one list page and return a list of per-article dicts.

    Each dict carries the keys 标题 (title), 时间 (publish time),
    链接 (article URL) and 段落 (list of body-paragraph strings).

    Fix: the original indexed every XPath result with [0]; a <li> that
    is not a regular news item (separator, ad) has no <a> and raised
    IndexError.  Such items are now skipped, and a missing timestamp
    degrades to an empty string instead of crashing.
    """
    html = etree.HTML(html_str)
    li_list = html.xpath("//div[@class='fixList']//li")
    content_list = []
    for li in li_list:
        titles = li.xpath(".//a/text()")
        links = li.xpath(".//a/@href")
        times = li.xpath(".//span[@class='time']/text()")
        if not titles or not links:
            # Not a regular news entry; skip instead of raising IndexError.
            continue
        item = {}
        item["标题"] = titles[0]
        item["时间"] = times[0] if times else ""
        item["链接"] = links[0]
        # Fetch the article page itself and extract its body paragraphs.
        detail_html = self.parse_url(item["链接"])
        item["段落"] = etree.HTML(detail_html).xpath("//div[@id='article']/p/text()")
        content_list.append(item)
    return content_list
6.保存
def save_content(self,content_list,page_num):#保存
with open("新浪.txt","a",encoding="utf-8")as f:
for content in content_list:
f.write(json.dumps(content,ensure_ascii=False))
f.write("\n")
print("Page"+str(page_num)+"保存成功")
7.主要逻辑
def run(self):
    """Main loop: fetch every list page, extract its articles, save them.

    Fix: replaces the original ``url_list.index(url) + 1`` page counter,
    which rescanned the list on every iteration (O(n) per page) and would
    report the wrong page if a URL ever appeared twice.
    """
    for page_num, url in enumerate(self.get_url_list(), start=1):
        html_str = self.parse_url(url)          # fetch the list page
        content_list = self.get_content_list(html_str)  # extract articles
        self.save_content(content_list, page_num)       # persist to disk
8.主函数
if __name__=="__main__":
sina=Sina()
sina.run()
9.完整代码
#使用XPath爬取新浪新闻及其正文段落
import json
import re
import requests
from lxml import etree
class Sina:
    """Crawl Sina military news list pages and each article's body text.

    Results are appended to 新浪.txt, one JSON object per line.

    Fixes over the original: requests now carry a timeout (no indefinite
    hang), XPath results are guarded before indexing (no IndexError on
    non-article <li> items), and the page counter uses enumerate instead
    of a repeated O(n) list.index lookup.
    """

    def __init__(self):
        # Paged list URL; "{}" is filled with a page number by get_url_list().
        self.start_url = "https://mil.news.sina.com.cn/roll/index.d.html?cid=57918&page={}"
        # Desktop Chrome User-Agent so the site serves the regular HTML page.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"}

    def get_url_list(self):
        """Return the list-page URLs for pages 1 through 4."""
        return [self.start_url.format(page) for page in range(1, 5)]

    def parse_url(self, url):
        """GET *url* with the crawler's headers and return the body as text.

        A timeout prevents a stalled connection from blocking forever.
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        # decode() defaults to UTF-8, which the site serves.
        return response.content.decode()

    def get_content_list(self, html_str):
        """Parse one list page and return a list of per-article dicts.

        Each dict carries the keys 标题 (title), 时间 (publish time),
        链接 (article URL) and 段落 (list of body-paragraph strings).
        List items without a link/title (ads, separators) are skipped.
        """
        html = etree.HTML(html_str)
        li_list = html.xpath("//div[@class='fixList']//li")
        content_list = []
        for li in li_list:
            titles = li.xpath(".//a/text()")
            links = li.xpath(".//a/@href")
            times = li.xpath(".//span[@class='time']/text()")
            if not titles or not links:
                # Not a regular news entry; skip instead of raising IndexError.
                continue
            item = {}
            item["标题"] = titles[0]
            item["时间"] = times[0] if times else ""
            item["链接"] = links[0]
            # Fetch the article page itself and extract its body paragraphs.
            detail_html = self.parse_url(item["链接"])
            item["段落"] = etree.HTML(detail_html).xpath("//div[@id='article']/p/text()")
            content_list.append(item)
        return content_list

    def save_content(self, content_list, page_num):
        """Append each item of *content_list* to 新浪.txt as one JSON line."""
        with open("新浪.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print("Page" + str(page_num) + "保存成功")

    def run(self):
        """Main loop: fetch every list page, extract its articles, save them."""
        for page_num, url in enumerate(self.get_url_list(), start=1):
            html_str = self.parse_url(url)
            content_list = self.get_content_list(html_str)
            self.save_content(content_list, page_num)
if __name__=="__main__":
sina=Sina()
sina.run()