1. Project Introduction
This project uses Python 3.7 and a few common libraries to crawl images from mzitu.com: the requests library for HTTP, the os module for file handling, and lxml's XPath support for extraction. Images are stored by writing the response bytes with write(); alternatively, the urlretrieve() method from urllib.request can be used to save them.
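As a minimal sketch of that urlretrieve() alternative (the image URL and filename below are hypothetical placeholders): urlretrieve() takes no headers argument, so the Referer header this site demands (see section 2) has to be installed through a global opener first.
```
from urllib import request

# urlretrieve() cannot take per-request headers, so install a global
# opener that carries the Referer the site expects.
opener = request.build_opener()
opener.addheaders = [
    ('User-Agent', 'Mozilla/5.0'),
    ('Referer', 'http://www.mzitu.com/'),
]
request.install_opener(opener)

# Hypothetical image URL; the real URLs are extracted with XPath below.
request.urlretrieve('http://i.meizitu.net/2019/01/example.jpg', 'example.jpg')
```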
2. Preparation
Before crawling a site, we must analyze it first: figure out where the content we want lives, and decide which method to use to extract it. In this article I extract content with XPath, because it is noticeably simpler than bs4 or regular expressions for this kind of job.
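For example, pulling a gallery title out of the listing markup takes a single XPath expression; the snippet below runs against a simplified stand-in for the real page structure:
```
from lxml import etree

# Simplified stand-in for the site's listing markup.
html = etree.HTML('<ul id="pins"><li><span><a>Some Gallery</a></span></li></ul>')

# One XPath expression reaches the node directly; with bs4 or re the
# same extraction would take several chained calls or a fragile pattern.
print(html.xpath('//ul[@id="pins"]/li/span/a/text()')[0])  # -> 'Some Gallery'
```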
Also note: the images on this site cannot be downloaded directly. Each image request must carry the Referer of its own gallery page, or the anti-hotlinking check rejects the request and you will not get the intact file. The details are explained in the code below.
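In outline, a download request with the Referer attached looks like this (both URLs are hypothetical placeholders; the real ones are extracted further down):
```
import requests

# Without the per-image Referer, the site's anti-hotlinking check
# rejects the request; with it, the original file comes back intact.
img_headers = {
    'user-agent': 'Mozilla/5.0',
    'referer': 'http://www.mzitu.com/12345/1',  # the gallery page the image sits on
}
resp = requests.get('http://i.meizitu.net/2019/01/example.jpg', headers=img_headers)
with open('example.jpg', 'wb') as fp:
    fp.write(resp.content)
```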
3. Analyzing the Page
Opening the home page, we can see that the images are grouped under several different themes. Let's warm up by crawling the home page first:
```
import requests
import os
from lxml import etree

headers = {
    'cookie': 'Hm_lvt_dbc355aef238b6c32b43eacbbf161c3c=1556805731; Hm_lpvt_dbc355aef238b6c32b43eacbbf161c3c=1556805746',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
base_url = 'http://www.mzitu.com/'
response = requests.get(base_url, headers=headers)
# print(response.status_code)
response.encoding = response.apparent_encoding
html = response.text
html_x_1 = etree.HTML(html)
# with open('zhuye.html', 'w', encoding='utf-8') as fp:
#     fp.write(html)

# Grab every gallery entry on the listing page ('.' selects the current node)
link_url = html_x_1.xpath('.//ul[@id="pins"]/li')
# print(link_url)
for info in link_url:
    title = info.xpath('./span[1]/a')[0].text  # gallery title, used as the folder name
    # Create the storage folder for this gallery
    if not os.path.exists(title):
        os.mkdir(title)
    detail_url = info.xpath('./a/@href')[0]  # URL of the gallery's detail page
    print(title, detail_url)
    # Visit the gallery detail page
    response = requests.get(detail_url, headers=headers)
    print(response.status_code)
    response.encoding = response.apparent_encoding  # decode with the detected encoding
    html_det = response.text
    html_x = etree.HTML(html_det)
    # with open('detail.html', 'w', encoding='utf-8') as fp:
    #     fp.write(html_det)

    # Number of image pages (one image per page) under detail_url
    total = html_x.xpath('//div[@class="pagenavi"]/a/span')[-2].text
    print(total)
    # Visit each page, extract the image URL, and download it
    for i in range(1, int(total) + 1):
        # Build the URL of page i
        pj_url = detail_url + '/' + str(i)
        print('Visiting page %s: ' % i + pj_url)
        response = requests.get(pj_url, headers=headers)
        response.encoding = response.apparent_encoding
        html_end = response.text
        html_x_end = etree.HTML(html_end)
        pic_url = html_x_end.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
        print(pic_url)
        # Download the image; a separate dict keeps the page-level
        # headers (with the cookie) intact for later page requests
        img_headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
            'referer': pj_url,  # each image needs its own page as Referer
        }
        filename = pic_url.split('/')[-1]  # name the file after the last segment of its URL
        response = requests.get(pic_url, headers=img_headers)
        with open(title + '/' + filename, 'wb') as fp:
            fp.write(response.content)
```
The code above crawls the home page.
To crawl the different categories, we use a dictionary to map each category name to its URL path:
```
import requests
import os
from lxml import etree

# Crawl the listing pages of the chosen category
def get_type(url, headers):
    first_url = url + 'page/{}/'  # paginate within the selected category
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text
    with open("1.html", "w", encoding="utf-8") as fp:
        fp.write(html)
    html_x = etree.HTML(html)
    # Total number of listing pages
    total = html_x.xpath('//div[@class="nav-links"]/a/text()')[-2]
    for i in range(1, int(total) + 1):
        end_url = first_url.format(i)
        print("Visiting: " + end_url)
        xiangqqing(end_url)
        break  # stop after the first listing page while testing; remove to crawl them all

# Parse one listing page and walk into each gallery
def xiangqqing(url):
    print("Entering the listing page...")
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text
    html_x = etree.HTML(html)
    link_url = html_x.xpath('.//ul[@id="pins"]/li')  # '.' selects the current node
    # print(link_url)
    # a bit rough here; needs some tidying...
    for link_info in link_url:
        title = link_info.xpath('./span[1]/a')[0].text
        if not os.path.exists(title):
            os.mkdir(title)
        tuurl = link_info.xpath('./a/@href')[0]
        print(tuurl)
        get_downurl(tuurl, title)
        # break

# Find out how many image pages a gallery has
def get_downurl(url, title):
    print('Entering the gallery page...')
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text
    html_x = etree.HTML(html)
    # Number of image pages in this gallery
    total = html_x.xpath('//div[@class="pagenavi"]/a/span')[-2].text
    print(total)
    # Visit each page, extract the image URL, and download it
    for i in range(1, int(total) + 1):
        # Build the URL of page i
        endurl = url + '/' + str(i)
        print(endurl)
        downloadtu(endurl, title)

# Extract the image URL from a single page
def downloadtu(url, title):
    print('Extracting the image link...')
    # html_x = get_url(url)
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text
    html_x = etree.HTML(html)
    # print(html)
    # with open("2.html", "w", encoding="utf-8") as fp:
    #     fp.write(html)
    tupian_url = html_x.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
    down_photo(tupian_url, url, title)

# Download one image
def down_photo(tupian_url, url, title):
    img_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'referer': url,  # the gallery page this image belongs to, as a plain string
    }
    filename = tupian_url.split('/')[-1]
    response = requests.get(tupian_url, headers=img_headers)
    with open(title + '/' + filename, 'wb') as fp:
        fp.write(response.content)

if __name__ == '__main__':
    category = input("Enter the category to crawl (性感, 日本, 台湾, 清纯): ")
    type_map = {"性感": "xinggan/", "日本": "japan/", "台湾": "taiwan/", "清纯": "mm/"}
    pj = type_map[category]
    print(pj)
    base_url = 'http://www.mzitu.com/' + pj
    print(base_url)
    headers = {
        'cookie': 'Hm_lvt_dbc355aef238b6c32b43eacbbf161c3c=1556805731; Hm_lpvt_dbc355aef238b6c32b43eacbbf161c3c=1556805746',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
    }
    get_type(base_url, headers)
```
Note: every request must include the request headers…
If you run into any problems, I'd be happy to discuss them together. Thanks!