Scraping 图片之家 (tupianzj.com) Images with Python
1. The crawler's approach
1.1 Request the 图片之家 homepage and find the sections we want to crawl
1.2 Request the pages that contain the images we want to crawl
1.3 Parse out the image URLs with a tool such as BeautifulSoup, XPath, or regular expressions
1.4 Request the image URLs (i.e., download the images)
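Before walking through the real code, here is a minimal sketch of how the four steps chain together. The XPath expressions below are placeholders, not the site's real structure; the actual selectors appear in section 2.
import requests
from lxml import etree

resp = requests.get('https://www.tupianzj.com/meinv/')            # 1.1 request the homepage
resp.encoding = 'gb2312'
tree = etree.HTML(resp.text)
for page_href in tree.xpath('//ul/li/a/@href'):                   # placeholder selector
    page = requests.get('https://www.tupianzj.com' + page_href)   # 1.2 request a list page
    page.encoding = 'gb2312'
    for src in etree.HTML(page.text).xpath('//img/@src'):         # 1.3 parse out image URLs
        img = requests.get(src)                                   # 1.4 download the image
        with open(src.split('/')[-1], 'wb') as f:
            f.write(img.content)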
2. Implementing the crawler (Python code)
Following the approach above, we implement the image crawler step by step.
2.1 Request the 图片之家 homepage and collect the sections to crawl
Opening the 图片之家 homepage, we find eight sections, each with its own id. We store these ids in a list so they are easy to iterate over later.
def pic_male():
    url = 'https://www.tupianzj.com/meinv/'
    # Request the page with requests.get()
    response = requests.get(url=url, headers=headers)
    # Decode the response as gb2312; utf-8 produces mojibake on this site
    response.encoding = "gb2312"
    # Get the HTML text of the page as a string
    data = response.text
    # Parse the HTML document from the string; returns the root node
    tree = etree.HTML(data)
    # ids of the eight sections
    pic_list = ['tag6166a8b4e5bb4a43dc708a3c4c26383e', 'tagece4a5f04199957553b4a8be39952bc7',
                'tag245af89317f0f3aeb7d0c7a37b8bc416', 'tagedc722867ded49f2abb0dcd02f280d57',
                'tag638ab696c93b18632c5c257274827aae', 'tag2e618e33ce4be001221a80b12483ebdc',
                'tag922c19b5321c51a0dc40dde0338921d4', 'tag35acf75bfbf410b548b3b043d09d2c31']
    for j in pic_list:
        # Build the XPath for this section and collect the href of each <a> tag
        xpath_list = '//*[@id="' + j + '"]/li/a/@href'
        url_list = tree.xpath(xpath_list)
        for i in url_list:
            first_url = "https://www.tupianzj.com" + i
            print(first_url)
            photo_male(first_url)
            time.sleep(1.5)
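A side note on the encoding: if you would rather not hard-code gb2312, requests can guess the charset from the response body via its apparent_encoding attribute. A small sketch:
response = requests.get(url=url, headers=headers)
# Let requests guess the charset from the body instead of hard-coding gb2312
response.encoding = response.apparent_encoding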
2.2 Request the pages that contain the images, following the pagination
We strip out the first two pagination buttons (they are not real pages) and collect the page each remaining button points to.
def photo_male(url):
    # Strip the trailing file name (e.g. "217273.html", 11 characters)
    # to keep the directory part of the URL
    my_url = url[:-11]
    # print(my_url)
    response = requests.get(url=url, headers=headers)
    response.encoding = "gb2312"
    data = response.text
    tree = etree.HTML(data)
    url_list = tree.xpath('//*[@id="container"]/div/div/div[2]/div[2]/div[3]/ul/li/a/@href')
    # Drop the first two entries, which are not fetchable pages:
    # "javascript:dPlayPre();" and "#". The rest (e.g. "210612_2.html") are real pages.
    del url_list[:2]
    for i in url_list:
        # Build the URL of each image page, e.g.
        # https://www.tupianzj.com/meinv/20200910/ + 217273_2.html
        page_url = my_url + i
        # print(page_url)
        page(page_url)
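Slicing off a fixed 11 characters assumes every detail page ends in a name exactly like 217273.html. A more defensive alternative (my suggestion, not in the original code) is to let urllib.parse.urljoin resolve the relative link against the current page URL:
from urllib.parse import urljoin

# Resolves "217273_2.html" against the current page URL, no matter
# how long the trailing file name is
page_url = urljoin(url, i)  # e.g. https://www.tupianzj.com/meinv/20200910/217273_2.html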
2.3 Extract the src attribute of each image
We grab the src attribute of each image so that the next step can persist it to disk.
def page(img_url_01):
    # Request the image page
    img_res = requests.get(url=img_url_01, headers=headers)
    # Decode as gb2312; utf-8 produces mojibake
    img_res.encoding = "gb2312"
    # Get the page text as a string
    img_data = img_res.text
    # Parse the string into an HTML tree
    img_tree = etree.HTML(img_data)
    # Extract the src attribute of the big image
    img_url = img_tree.xpath('//*[@id="bigpicimg"]/@src')
    for i in img_url:
        # Take the image's file name from the end of the URL
        img_name = i.split('/')[-1]
        # print(img_name)
        print(i)
        download(i, img_name)
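For reference, here is what the file-name slice does, using a made-up image URL:
# Hypothetical URL, just to show what split('/')[-1] returns
src = 'https://img.tupianzj.com/pic/217273.jpg'
print(src.split('/')[-1])  # 217273.jpg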
2.4 Persist the images to disk
def download(pic_url, name):
    pic = requests.get(pic_url, headers=headers)
    # Use forward slashes (or a raw string) so backslashes are not
    # treated as escape sequences on Windows
    my_dir = "D:/FireFoxDownload/SpiderPic/" + name
    with open(my_dir, "wb") as f:
        f.write(pic.content)
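One caveat: open() raises FileNotFoundError if the target folder does not exist. A variant of download() (my addition, with an assumed default folder) that creates it on demand:
import os

def download(pic_url, name, out_dir="D:/FireFoxDownload/SpiderPic"):
    # Create the folder on first use; exist_ok=True avoids an error
    # if it already exists
    os.makedirs(out_dir, exist_ok=True)
    pic = requests.get(pic_url, headers=headers)
    with open(os.path.join(out_dir, name), "wb") as f:
        f.write(pic.content)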
2.5 Complete code
# -*- coding:utf-8 -*-
import requests
from lxml import etree
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 '
                  'Safari/537.36 Edg/87.0.664.66',
}

def pic_male():
    url = 'https://www.tupianzj.com/meinv/'
    # Request the page with requests.get()
    response = requests.get(url=url, headers=headers)
    # Decode the response as gb2312; utf-8 produces mojibake on this site
    response.encoding = "gb2312"
    # Get the HTML text of the page as a string
    data = response.text
    # Parse the HTML document from the string; returns the root node
    tree = etree.HTML(data)
    # ids of the eight sections
    pic_list = ['tag6166a8b4e5bb4a43dc708a3c4c26383e', 'tagece4a5f04199957553b4a8be39952bc7',
                'tag245af89317f0f3aeb7d0c7a37b8bc416', 'tagedc722867ded49f2abb0dcd02f280d57',
                'tag638ab696c93b18632c5c257274827aae', 'tag2e618e33ce4be001221a80b12483ebdc',
                'tag922c19b5321c51a0dc40dde0338921d4', 'tag35acf75bfbf410b548b3b043d09d2c31']
    for j in pic_list:
        # Build the XPath for this section and collect the href of each <a> tag
        xpath_list = '//*[@id="' + j + '"]/li/a/@href'
        url_list = tree.xpath(xpath_list)
        for i in url_list:
            # Visit every list page in the section
            first_url = "https://www.tupianzj.com" + i
            print(first_url)
            photo_male(first_url)
            time.sleep(1.5)

# Visit the image pages, following the pagination
def photo_male(url):
    # Strip the trailing file name (e.g. "217273.html", 11 characters)
    my_url = url[:-11]
    # print(my_url)
    response = requests.get(url=url, headers=headers)
    response.encoding = "gb2312"
    data = response.text
    tree = etree.HTML(data)
    url_list = tree.xpath('//*[@id="container"]/div/div/div[2]/div[2]/div[3]/ul/li/a/@href')
    # Drop the first two entries, which are not fetchable pages:
    # "javascript:dPlayPre();" and "#". The rest (e.g. "210612_2.html") are real pages.
    del url_list[:2]
    for i in url_list:
        # Build the URL of each image page, e.g.
        # https://www.tupianzj.com/meinv/20200910/ + 217273_2.html
        page_url = my_url + i
        # print(page_url)
        page(page_url)

# Extract the src attribute of each image
def page(img_url_01):
    img_res = requests.get(url=img_url_01, headers=headers)
    img_res.encoding = "gb2312"
    img_data = img_res.text
    img_tree = etree.HTML(img_data)
    img_url = img_tree.xpath('//*[@id="bigpicimg"]/@src')
    for i in img_url:
        img_name = i.split('/')[-1]
        # print(img_name)
        print(i)
        download(i, img_name)

# Download an image
def download(pic_url, name):
    pic = requests.get(pic_url, headers=headers)
    # Forward slashes avoid backslash escape problems on Windows
    my_dir = "D:/FireFoxDownload/SpiderPic/" + name
    with open(my_dir, "wb") as f:
        f.write(pic.content)

if __name__ == '__main__':
    print("Downloading images...")
    pic_male()
    print("Download complete")
3. Summary
All in all, writing a crawler is not that hard. For a long time I could not get one working, and I think that came down to a few things:
1. An unclear mental model: I did not understand how all the pages relate to one another
2. Not knowing how to extract attributes from a page
3. Limited fluency with Python, including (but not limited to) list operations and slicing
These were the problems I ran into while learning to write crawlers. With steady practice, I gradually learned how to use tools such as XPath and BeautifulSoup to extract page attributes.