互联网时代,难免要和别人在线上聊天,而现在的年轻人一言不合就开始斗图。难道我只能默默看着别人秀表情包吗?当然不!所以借此机会,我们找一个表情包网站,爬一批图片下来。
斗图网链接: https://www.doutula.com
由于网站结构比较简单,没有异步加载,直接从 HTML 中就能提取所需信息,所以这里不做详细分析。
#coding:utf-8
import requests
import os
from lxml import html
from multiprocessing import Pool
class doutula():
    """Crawler that downloads picture sets ("albums") from doutula.com.

    A listing page links to several albums. Each album is stored in its own
    directory, created under the current working directory and named after
    the album title, with the pictures saved as 1.jpg, 2.jpg, ...
    """

    base_url = 'https://www.doutula.com/'
    headers = {
        # Only advertise encodings that requests/urllib3 can always decode:
        # 'sdch' is not supported at all and 'br' needs an optional package,
        # so offering them risks receiving a body that cannot be decompressed.
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'zh-CN,zh;q=0.8',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36'
    }
    # Seconds to wait for the server; requests applies no timeout by default
    # and would otherwise block forever on a stalled connection.
    timeout = 10
    # Characters replaced in album titles before they are used as a
    # directory name (path separators and other commonly reserved characters).
    _unsafe_chars = '\\/:*?"<>|'

    def get_selector(self, url):
        """Download *url* and return the page parsed by ``lxml.html``."""
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        return html.fromstring(response.text)

    def get_page_link(self, num):
        """Return the list of album entry URLs found on listing page *num*."""
        # base_url already ends with '/', strip it so the URL does not
        # contain a doubled slash ("...com//article/...").
        now_url = "{}/article/list/?page={}".format(self.base_url.rstrip('/'), num)
        selector = self.get_selector(now_url)
        return list(selector.xpath('//ul[@class="list-group"]/a/@href'))

    def get_page_detail(self, url):
        """Return ``(title, picture_links)`` for the album page at *url*.

        When the page has no title element, the last path segment of *url*
        is used as the title instead of failing with ``IndexError``.
        """
        selector = self.get_selector(url)
        titles = selector.xpath('//li[@class="list-group-item"]/h3/blockquote/a/text()')
        title = titles[0] if titles else url.rstrip('/').rsplit('/', 1)[-1]
        pic_link = selector.xpath('//div[@class="artile_des"]/table/tbody/tr/td/a/img/@src')
        return title, pic_link

    def _safe_name(self, title):
        """Return *title* converted into a usable single directory name."""
        cleaned = ''.join('_' if ch in self._unsafe_chars else ch for ch in title)
        # Surrounding whitespace and trailing dots are dropped so that names
        # such as '.' or '..' can never be produced.
        cleaned = cleaned.strip().rstrip('.')
        return cleaned or 'untitled'

    def _album_dir(self, title):
        """Return the absolute directory path used to store album *title*."""
        return os.path.join(os.path.abspath('.'), self._safe_name(title))

    def _absolute_url(self, link):
        """Return *link* as an absolute URL (handles '//host/...' and relative links)."""
        if link.startswith('//'):
            return 'https:' + link
        if link.startswith(('http://', 'https://')):
            return link
        return "{}/{}".format(self.base_url.rstrip('/'), link.lstrip('/'))

    def Make_dir(self, title):
        """Create the directory for album *title* under the working directory.

        Returns ``True`` when the directory was created and ``False`` when it
        already existed (the caller then skips the album).
        """
        future_dir = self._album_dir(title)
        if os.path.exists(future_dir):
            print(u'文件夹已存在,跳过')
            return False
        os.mkdir(future_dir)
        print(title, u'文件夹创建完成')
        return True

    def down_load(self, page_info):
        """Download every picture of one album.

        *page_info* is the ``(title, picture_links)`` pair produced by
        ``get_page_detail``. Nothing is downloaded when the album directory
        already exists.
        """
        title = page_info[0]
        pic_link = page_info[1]
        if not self.Make_dir(title):
            return
        album_dir = self._album_dir(title)
        # NOTE: every picture is stored with a .jpg suffix regardless of its
        # real format, as before; numbering follows the position in the list.
        for count, link in enumerate(pic_link, 1):
            now_path = os.path.join(album_dir, "{}.jpg".format(count))
            print(now_path)
            page_link = self._absolute_url(link)
            # Fetch before opening the file so a failed request does not
            # leave an empty or bogus image behind.
            response = requests.get(page_link, headers=self.headers, timeout=self.timeout)
            if not response.ok:
                print(u'下载失败,跳过', page_link)
                continue
            with open(now_path, 'wb') as f:
                f.write(response.content)

    def run(self, num=1):
        """Download all albums linked from listing page *num*."""
        for link in self.get_page_link(num):
            self.down_load(self.get_page_detail(link))
if __name__ == '__main__':
    # Script entry point: crawl the albums listed on the first page only.
    doutula().run(num=1)