# -*- coding:utf8 -*-
import os
import time

import requests
from lxml import etree
class DouTu:
    """Scraper that downloads pictures from doutula.com photo listings.

    For one listing page it collects the detail-page links, opens every
    detail page to read the picture title and image URL, and stores the
    image bytes as ``<save_dir>/<title>.jpg``. Failures on a single page or
    picture are reported on stdout and skipped so the crawl keeps going.
    """

    # Characters that cannot be used in file names on common filesystems.
    _INVALID_FILENAME_CHARS = '\\/:*?"<>|'

    def __init__(self, save_dir='图片', delay=3, timeout=10):
        """Set up request headers and crawl settings.

        Args:
            save_dir: Directory the images are written to; created on demand.
            delay: Seconds to pause after each detail page is processed.
            timeout: Seconds to wait for any single HTTP request.
        """
        # Browser-like User-Agent sent with every request.
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0"
        }
        self.save_dir = save_dir
        self.delay = delay
        self.timeout = timeout

    @staticmethod
    def _safe_filename(name):
        """Return *name* with characters unusable in file names replaced."""
        invalid = DouTu._INVALID_FILENAME_CHARS
        cleaned = "".join(
            "_" if ch in invalid or ord(ch) < 32 else ch for ch in name
        ).strip(" .")
        # A title made only of stripped characters still needs a file name.
        return cleaned or "untitled"

    def _fetch(self, url):
        """Return the successful response for *url*, or None after reporting a failure."""
        try:
            response = requests.get(url=url, headers=self.header, timeout=self.timeout)
            # Treat HTTP error statuses as failures instead of parsing error pages.
            response.raise_for_status()
        except requests.RequestException as err:
            print('请求失败,跳过: %s (%s)' % (url, err))
            return None
        return response

    @staticmethod
    def _parse_html(text):
        """Parse HTML *text* into an lxml tree, or return None when it is blank."""
        if not text or not text.strip():
            return None
        return etree.HTML(text)

    def get_url(self, page):
        """Fetch one listing page and process every picture linked from it.

        Args:
            page: Query-string suffix appended to the listing URL,
                e.g. ``'?page=3'``.
        """
        url = "https://www.doutula.com/photo/list/" + page
        response = self._fetch(url)
        if response is None:
            return
        self.parse_url(response.text)

    def parse_url(self, response):
        """Extract detail-page links from listing HTML and visit each one.

        Args:
            response: HTML text of a listing page.
        """
        html = self._parse_html(response)
        if html is None:
            return
        # Links to the individual picture detail pages.
        links = html.xpath('//div[@class="page-content text-center"]/div/a/@href')
        for link in links:
            self.parse(link)
            # Pause between detail pages to throttle the crawl.
            time.sleep(self.delay)

    def parse(self, i):
        """Read the title and image URL from one detail page and save the image.

        Args:
            i: URL of the detail page.
        """
        response = self._fetch(i)
        if response is None:
            return
        html = self._parse_html(response.text)
        if html is None:
            print('图片有问题,跳过!')
            return
        # Picture title, used as the file name.
        names = html.xpath('//div[@class="pic-title"]/h1/a/text()')
        # Image source URL.
        # NOTE(review): this path contains `tbody`, which browsers add to
        # tables but which may be missing from the raw HTML - confirm
        # against the live page.
        images = html.xpath(".//*[@id='detail']/div/div[1]/li/div[3]/div/div/div/div[1]/table/tbody/tr[1]/td/img/@src")
        if not names or not images:
            # A page without a title or image is skipped, not fatal.
            print('图片有问题,跳过!')
            return
        self.write(names[0], images[0])

    def write(self, name, img):
        """Download the image at *img* and store it as ``<save_dir>/<name>.jpg``.

        Args:
            name: Picture title; sanitized before being used as a file name.
            img: URL of the image to download.
        """
        try:
            response = requests.get(url=img, headers=self.header, timeout=self.timeout)
            response.raise_for_status()
            # Make sure the target directory exists before the first write.
            os.makedirs(self.save_dir, exist_ok=True)
            path = os.path.join(self.save_dir, self._safe_filename(name) + '.jpg')
            with open(path, 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError):
            # Only network and filesystem problems are expected here; other
            # errors are real bugs and are allowed to surface.
            print('图片有问题,跳过!')
            return
        print('成功')
if __name__ == '__main__':
    # Crawl listing pages 1 through 50 in order, announcing the start and
    # end of each page on stdout.
    crawler = DouTu()
    first_page, last_page = 1, 50
    for page_no in range(first_page, last_page + 1):
        print("第%s页开始" % page_no)
        # The listing URL takes the page number as a query string.
        query = '?page=%s' % page_no
        crawler.get_url(query)
        print("第%s页结束" % page_no)