# 上个案例粘贴乱码了，重发一个。(The previous paste of this example was garbled; re-posting it here.)
import requests
from lxml import etree
import os
def girl_spider(base_url, headers):
    """Fetch one listing page and process every detail page linked from it.

    Args:
        base_url: URL of the listing page to fetch.
        headers: Mapping of HTTP request headers to send with the request.
    """
    # Pass the mapping by keyword: the second positional parameter of
    # requests.get() is ``params``, so a positional dict would be encoded
    # into the query string instead of being sent as HTTP headers.
    res = requests.get(base_url, headers=headers)
    html = etree.HTML(res.text)
    # Collect the detail-page links of every entry in the post list.
    img_src = html.xpath('//div[@class="postlist"]/ul/li/a/@href')
    for img_url in img_src:
        img_parse(img_url)
def img_parse(img_url):
    """Parse an image detail page and download each of its sub-pages.

    Reads the title and the page count from the detail page, then calls
    download_img() once for every sub-page URL ``<img_url>/<n>`` with
    n running from 1 to the page count.

    Pages that lack the expected title or pagination markup are reported
    on stdout and skipped instead of raising IndexError.

    Args:
        img_url: URL of the detail page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
        'Referer': 'http://www.mzitu.com'
    }
    # Pass the mapping by keyword: the second positional parameter of
    # requests.get() is ``params``, so a positional dict would end up in
    # the query string rather than in the HTTP headers.
    res = requests.get(img_url, headers=headers)
    html = etree.HTML(res.text)

    titles = html.xpath('//div[@class="content"]/h2/text()')
    page_labels = html.xpath('//div[@class="pagenavi"]/a/span/text()')
    # The page count is taken from the second-to-last pagination label, so
    # at least two labels are required (presumably the last label is a
    # "next page" link -- confirm against the site's markup).
    if not titles or len(page_labels) < 2:
        print(img_url + '---页面解析失败，已跳过')
        return

    title = titles[0]
    page_num = page_labels[-2]
    # Visit every sub-page of the detail page.
    for num in range(1, int(page_num) + 1):
        img_src = img_url + "/" + str(num)
        download_img(img_src, title, num)
def download_img(img_src, title, num, headers=None):
    """Download the image shown on one sub-page and save it to disk.

    The file is written to ``girl_img/<title>/<title><num>.jpg`` (spaces
    removed from the title), relative to the current working directory.
    If the page contains no matching image element, the page is reported
    on stdout and skipped.

    Args:
        img_src: URL of the sub-page that embeds the image.
        title: Title used for both the sub-directory and the file name.
        num: Page number, appended to the title to form the file name.
        headers: Optional mapping of HTTP request headers. When omitted, a
            built-in default is used, so the function no longer depends on
            a global ``headers`` that exists only when the file is run as
            a script.
    """
    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Referer': 'http://www.mzitu.com'
        }

    # Send the same headers for the page request as for the image request,
    # consistent with the other page fetches in this file.
    res = requests.get(img_src, headers=headers)
    html = etree.HTML(res.text)
    # Locate the image link inside the main-image container.
    img_urls = html.xpath('//div[@class="main-image"]/p/a/img/@src')
    if not img_urls:
        print(img_src + '---未找到图片，已跳过')
        return
    img_url = img_urls[0]

    title = title.replace(' ', '')
    img_name = title + str(num) + '.jpg'
    # Build the paths with os.path.join so the separator matches the
    # platform instead of being a hard-coded backslash.
    root_dir = os.path.join('girl_img', title)
    # Create the target directory if it does not exist yet.
    if not os.path.exists(root_dir):
        os.makedirs(root_dir)

    res = requests.get(img_url, headers=headers)
    # The with-statement closes the file; no explicit close() is needed.
    with open(os.path.join(root_dir, img_name), 'wb') as f:
        f.write(res.content)
    print(title + '---' + img_name + '文件保存成功')
if __name__ == '__main__':
    # Script entry point: crawl the listing pages and hand each one to
    # girl_spider(). ``headers`` is assigned at module level here, so it is
    # visible as a global to the functions in this file.
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
    headers['Referer'] = 'http://www.mzitu.com'

    # range(1, 2) yields only page 1.
    for page_no in range(1, 2):
        girl_spider('http://www.mzitu.com/page/{}/'.format(page_no), headers)