一、环境准备
'''
1、创建虚拟环境:python -m venv venv
2、选择解释器:File/Settings/Project: getPicture(项目名)/Python Interpreter
3、进入虚拟环境目录 cd .\venv\Scripts\,然后激活虚拟环境 .\activate.bat
模块使用:
requests >>> pip install requests
parsel >>> pip install parsel
待爬取的壁纸网站:http://www.netbian.com/
'''
二、爬取步骤
'''
爬虫操作步骤:
1、发送请求
2、获取数据
3、提取数据
4、二次发送请求(请求详情页,获取图片地址后再请求图片)
5、保存数据
6、批量操作
'''
三、爬取并保存数据
import requests
import parsel
# Standard-library helpers used below for URL resolution and safe file paths.
import os
import re
from urllib.parse import urljoin

# Listing page to scrape.
url = 'http://www.netbian.com/1920x1080/'
# Send a browser-like User-Agent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
}
# Seconds to wait for the server; without a timeout requests may block forever.
timeout = 10
# Output directory; created up front so open() below cannot fail on a
# missing folder.
save_dir = 'image'
os.makedirs(save_dir, exist_ok=True)

# requests.get returns a Response object.
response = requests.get(url=url, headers=headers, timeout=timeout)
# Decode the body with the encoding detected from its content rather than
# the one guessed from the HTTP headers.
response.encoding = response.apparent_encoding
# response.text is the decoded HTML text; wrap it in a Selector for CSS queries.
selector = parsel.Selector(response.text)
lis = selector.css('#main > div.list > ul > li')
# Walk every <li> entry of the listing.
for li in lis:
    link = li.css('a::attr(href)').get()
    # Text of the <b></b> tag, used as the image title / file name.
    title = (li.css('b::text').get() or '').strip()
    # Skip entries that lack a link or a title; concatenating a missing
    # href would raise TypeError.
    if not link or not title:
        continue
    # urljoin handles both site-relative and already-absolute links.
    href = urljoin(url, link)
    # Second request: fetch the detail page as decoded HTML text.
    html_data = requests.get(url=href, headers=headers, timeout=timeout).text
    sel = parsel.Selector(html_data)
    # src attribute of the full-size <img> on the detail page.
    pic_url = sel.css('#main > div.endpage > div > p > a > img::attr(src)').get()
    if not pic_url:
        # The detail page has no matching image; nothing to download.
        continue
    pic_url = urljoin(href, pic_url)
    print(title, pic_url)
    # Third request: .content is the raw bytes of the image.
    img_content = requests.get(url=pic_url, headers=headers, timeout=timeout).content
    # Replace characters that are not allowed in file names.
    file_name = re.sub(r'[\\/:*?"<>|]', '_', title) + '.jpg'
    # os.path.join picks the right separator for the current platform.
    with open(os.path.join(save_dir, file_name), mode='wb') as f:
        f.write(img_content)
四、效果展示
![](https://img-blog.csdnimg.cn/4b6682d901aa4236b9dc168cee43049a.png)
![](https://img-blog.csdnimg.cn/1b5cc28b9650401592e4ed01a91fb422.jpeg)
五、批量爬取
import requests
import parsel
# Standard-library helpers used below for URL resolution and safe file paths.
import os
import re
from urllib.parse import urljoin

# Browser-like User-Agent; identical for every request, so built once
# outside the page loop.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
}
# Seconds to wait for the server; without a timeout requests may block forever.
timeout = 10
# Output directory; created up front so open() below cannot fail on a
# missing folder.
save_dir = 'image'
os.makedirs(save_dir, exist_ok=True)

# Crawl listing pages 2 through 10 (range() excludes its end value).
for page in range(2, 11):
    url = f'http://www.netbian.com/1920x1080/index_{page}.htm'
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Decode the body with the encoding detected from its content.
    response.encoding = response.apparent_encoding
    selector = parsel.Selector(response.text)
    lis = selector.css('#main > div.list > ul > li')
    for li in lis:
        link = li.css('a::attr(href)').get()
        # Text of the <b></b> tag, used as the image title / file name.
        title = (li.css('b::text').get() or '').strip()
        # Skip entries that lack a link or a title; concatenating a
        # missing href would raise TypeError.
        if not link or not title:
            continue
        # urljoin handles both site-relative and already-absolute links.
        href = urljoin(url, link)
        # Fetch the detail page as decoded HTML text.
        html_data = requests.get(url=href, headers=headers, timeout=timeout).text
        sel = parsel.Selector(html_data)
        # src attribute of the full-size <img> on the detail page.
        pic_url = sel.css('#main > div.endpage > div > p > a > img::attr(src)').get()
        if not pic_url:
            # The detail page has no matching image; nothing to download.
            continue
        pic_url = urljoin(href, pic_url)
        print(title, pic_url)
        # .content is the raw bytes of the image.
        img_content = requests.get(url=pic_url, headers=headers, timeout=timeout).content
        # Replace characters that are not allowed in file names.
        file_name = re.sub(r'[\\/:*?"<>|]', '_', title) + '.jpg'
        # os.path.join picks the right separator for the current platform.
        with open(os.path.join(save_dir, file_name), mode='wb') as f:
            f.write(img_content)