文章目录
前言
爬虫流程
- 获得请求地址
- python发送请求request
- 服务器返回响应response
# Import the third-party HTTP library and fetch the article page with GET.
import requests

# Target article URL
url = """https://mil.news.sina.com.cn/world/2023-04-20/doc-imyqzkcr5138902.shtml"""
res = requests.get(url=url)  # send the GET request
# Decode using the auto-detected encoding (could also hardcode "utf8")
res.encoding = res.apparent_encoding

# Third-party HTML parser: pip install beautifulsoup4
from bs4 import BeautifulSoup

# Parse the response body. Passing features="html.parser" makes the choice
# of parser explicit, avoiding bs4's "no parser specified" warning and
# installation-dependent behavior.
soup = BeautifulSoup(markup=res.text, features="html.parser")

# Locate the article body container once; fail loudly if the page layout
# changed and the container is missing (previously this would crash later
# with an opaque AttributeError on None).
wrapper = soup.find(name="div", attrs={"class": "article"})
if wrapper is None:
    raise ValueError("article container <div class='article'> not found")

# Save each paragraph of the article to a local text file,
# with a blank line between paragraphs.
with open("paragraph.txt", mode="w", encoding="utf8") as f:
    for paragraph in wrapper.find_all(name="p"):
        print(paragraph.text, file=f)
        print(file=f)
# Extract the article title: the second <h1> inside the main-content container.
title = soup.find(name="div", attrs={"class": "main-content w1240"}).find_all(name="h1")[1].text

# Extract the first image of the article. The src attribute is
# protocol-relative ("//..."), so prepend "https:" to make it fetchable.
img_url = "https:" + soup.find_all(name="div", attrs={"class": "img_wrapper"})[0].find(name="img").get("src")
res1 = requests.get(url=img_url)

# Save the raw image bytes to disk (binary mode — no text decoding).
with open("abc.png", mode="wb") as f:
    f.write(res1.content)
# URL of a single listing page (page 5), shown as an example.
url = "https://mil.news.sina.com.cn/roll/index.d.html?cid=57919&page=5"

# Pagination: build and print the listing-page URL for pages 1 through 5.
page_template = "https://mil.news.sina.com.cn/roll/index.d.html?cid=57919&page={}"
for page_no in range(1, 6):
    url = page_template.format(page_no)
    print(url)
# Listing page: fetch page 1 of the article index and print every link in it.
url = """https://mil.news.sina.com.cn/roll/index.d.html?cid=57919&page=1"""
res = requests.get(url=url)
res.encoding = "utf8"
# Explicit parser avoids bs4's "no parser specified" warning.
soup = BeautifulSoup(markup=res.text, features="html.parser")
# All anchor tags inside the fixed article list container.
li = soup.find(name="div", attrs={"class": "fixList"}).find_all(name="a")
for ti in li:
    print(ti.get("href"))