抓取表情包
网址:https://www.doutula.com
语言:Python 3.7
编译器: PyCharm 2020.1 专业版
浏览器:Chrome 81.0.4044.122
使用到的库:requests、lxml、queue(Queue)、concurrent.futures(ThreadPoolExecutor)
大概思路
分析网页构成,发现有很多页,找到翻页规律,如下:
https://www.doutula.com/article/list/?page=页数
最后的页码可以通过网页来获取到。
通过每一页的链接来获取到每一个图集的链接,通过每一个图集来获取图片的链接和名称。
然后进行下载。
不说废话,直接开搞
1、首先创建一个类,并且需要传入首页链接,代码如下:
# Scraper class for doutula.com meme galleries.
class BiaoQB:
# start_url: URL of the first list page. A minimal User-Agent header is
# sent so the site does not reject the default requests UA.
def __init__(self, start_url):
self.start_url = start_url
self.headers = {'user-agent': 'Mozilla/5.0'}
2、定义一个解析网页的方法,代码如下:
def get_response(self, url):
    """GET *url* with the scraper's headers and return the Response.

    Fix: the original returned None implicitly on any non-200 status,
    so callers crashed later with AttributeError on ``.text``/``.content``.
    Now a non-200 status raises a clear RuntimeError, and a timeout is
    set so a stalled request cannot hang a worker thread forever.

    :param url: URL to fetch
    :return: requests.Response with status 200
    :raises RuntimeError: on any non-200 HTTP status
    """
    response = requests.get(url, headers=self.headers, timeout=10)
    if response.status_code != 200:
        raise RuntimeError(f'request failed ({response.status_code}): {url}')
    return response
3、通过第一页的链接获取到最后一页的页数并返回,代码如下:
def lase_page(self, url):
'''
Get the number of the last list page.  (Method name keeps the
original "lase" typo of "last" -- renaming would break callers.)
:param url: URL of the initial list page
:return: number of the last page, as an int
'''
html = etree.HTML(self.get_response(url).text)
# NOTE(review): the final pagination link is presumably the "next page"
# arrow, so [-2] holds the last page number -- confirm against live markup.
last_page_num = html.xpath('//a[@class="page-link"]/text()')[-2]
return int(last_page_num)
4、通过最后一页的页码来生成所有页的链接,并加入队列中去,注意:这里的队列需要提前创建好,后面的完整代码有创建队列的语句。本次代码如下:
def create_all_page_url(self, last_page_num):
'''
Build the URL of every list page from the page count and put each one
on the module-level page_queue (created in the __main__ section).
:param last_page_num: number of the last page (inclusive)
:return:
'''
for page in range(1, last_page_num+1):
the_page_url = f'https://www.doutula.com/article/list/?page={page}'
page_queue.put(the_page_url)
5、通过页码来获取当前页的图集链接,代码如下:
def get_image_list_url(self, page_url):
    """Collect every gallery link on one list page and process each gallery.

    Fix: iterate the result list directly instead of the
    ``for i in range(len(...))`` anti-idiom.

    :param page_url: URL of one list page
    :return: None (delegates each gallery to get_image_info)
    """
    html = etree.HTML(self.get_response(page_url).text)
    # Two anchor variants appear on list pages; the union xpath catches both.
    gallery_urls = html.xpath(
        '//a[@class="list-group-item random_list tg-article"]/@href'
        '|//a[@class="list-group-item random_list"]/@href')
    for gallery_url in gallery_urls:
        self.get_image_info(gallery_url)
6、通过图集链接来获取图片的信息,包括链接和名称,代码如下:
def get_image_info(self, image_list_url):
    """Extract (url, name) for every image in one gallery and download them.

    Fix: pair the src and alt lists with zip() instead of indexing
    image_name[i] -- if the two xpath results ever differ in length the
    original raised IndexError; zip stops at the shorter list.

    :param image_list_url: URL of one gallery page
    :return: None (delegates each image to download_img)
    """
    html = etree.HTML(self.get_response(image_list_url).text)
    image_urls = html.xpath('//img[@referrerpolicy="no-referrer"]/@src')
    image_names = html.xpath('//img[@referrerpolicy="no-referrer"]/@alt')
    for img_url, img_name in zip(image_urls, image_names):
        self.download_img(img_url, img_name)
7、通过获取到的图片的信息来下载图片,代码如下:
def download_img(self, img_url, img_name):
    """Download one image to d:/SpiderData/Img/.

    Fix: create the target directory up front -- open() does not create
    missing directories, so the original crashed with FileNotFoundError
    on a fresh machine.  ``if img_name:`` also tolerates a None name.

    :param img_url: direct URL of the image
    :param img_name: alt text of the image; may be empty on some pages
    """
    import os  # local import keeps the method drop-in compatible
    # Some pages carry no alt text; fall back to the tail of the URL.
    if img_name:
        file_path = 'd:/SpiderData/Img/' + img_name + '.jpg'
    else:
        file_path = 'd:/SpiderData/Img/' + img_url.split('_')[-1]
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'wb') as f:
        f.write(self.get_response(img_url).content)
    print(file_path)
完成
完整代码如下:
import requests
from lxml import etree
from queue import Queue
from concurrent.futures import ThreadPoolExecutor
class BiaoQB:
    """Scraper for meme galleries on https://www.doutula.com.

    Workflow: read the last page number from the first list page, enqueue
    every list-page URL, then fan the pages out to a thread pool that
    walks each gallery and downloads its images.

    NOTE: the methods read the module-level ``page_queue`` and
    ``page_pool`` globals created in the ``__main__`` section.
    """

    def __init__(self, start_url):
        # start_url: URL of the first list page (used to discover the page count).
        self.start_url = start_url
        # Minimal UA so the site does not reject the default requests UA.
        self.headers = {'user-agent': 'Mozilla/5.0'}

    def get_response(self, url):
        """GET *url* and return the Response.

        Fix: the original returned None implicitly on non-200 statuses,
        so callers crashed later with AttributeError; now the failure is
        explicit, and a timeout prevents a worker hanging forever.

        :raises RuntimeError: on any non-200 HTTP status
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        if response.status_code != 200:
            raise RuntimeError(f'request failed ({response.status_code}): {url}')
        return response

    def lase_page(self, url):
        """Return the number of the last list page as an int.

        (Name keeps the original "lase" typo of "last" so existing
        callers keep working.)

        :param url: URL of the initial list page
        """
        html = etree.HTML(self.get_response(url).text)
        # NOTE(review): the final pagination link is presumably the "next
        # page" arrow, so [-2] holds the last page number -- confirm.
        return int(html.xpath('//a[@class="page-link"]/text()')[-2])

    def create_all_page_url(self, last_page_num):
        """Build every list-page URL and put each on page_queue.

        :param last_page_num: number of the last page (inclusive)
        """
        for page in range(1, last_page_num + 1):
            page_queue.put(f'https://www.doutula.com/article/list/?page={page}')

    def get_image_list_url(self, page_url):
        """Collect every gallery link on one list page and process each.

        :param page_url: URL of one list page
        """
        html = etree.HTML(self.get_response(page_url).text)
        # Two anchor variants appear on list pages; the union xpath catches both.
        gallery_urls = html.xpath(
            '//a[@class="list-group-item random_list tg-article"]/@href'
            '|//a[@class="list-group-item random_list"]/@href')
        for gallery_url in gallery_urls:
            self.get_image_info(gallery_url)

    def get_image_info(self, image_list_url):
        """Extract (url, name) for every image in a gallery and download.

        Fix: zip() pairs the src/alt lists safely -- the original indexed
        image_name[i] and raised IndexError when the lists differed.
        """
        html = etree.HTML(self.get_response(image_list_url).text)
        image_urls = html.xpath('//img[@referrerpolicy="no-referrer"]/@src')
        image_names = html.xpath('//img[@referrerpolicy="no-referrer"]/@alt')
        for img_url, img_name in zip(image_urls, image_names):
            self.download_img(img_url, img_name)

    def download_img(self, img_url, img_name):
        """Download one image to d:/SpiderData/Img/.

        Fix: create the target directory up front (open() does not), so a
        fresh machine no longer crashes with FileNotFoundError.

        :param img_url: direct URL of the image
        :param img_name: alt text of the image; may be empty on some pages
        """
        import os  # local import keeps the class drop-in compatible
        # Some pages carry no alt text; fall back to the tail of the URL.
        if img_name:
            file_path = 'd:/SpiderData/Img/' + img_name + '.jpg'
        else:
            file_path = 'd:/SpiderData/Img/' + img_url.split('_')[-1]
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'wb') as f:
            f.write(self.get_response(img_url).content)
        print(file_path)

    def main(self):
        """Entry point: discover pages, enqueue them, fan out to the pool."""
        last_page_num = self.lase_page(self.start_url)
        self.create_all_page_url(last_page_num)
        # Plain loop instead of the original side-effect list comprehension,
        # which built and discarded a throwaway list of futures.
        for _ in range(page_queue.qsize()):
            page_pool.submit(self.get_image_list_url, page_queue.get())
if __name__ == '__main__':
# First list page; also used to discover the total page count.
url = 'https://www.doutula.com/article/list/?page=1'
# Module-level queue and pool: the BiaoQB methods read these globals.
page_queue = Queue()
page_pool = ThreadPoolExecutor(max_workers=10)
bqb = BiaoQB(url)
bqb.main()
如有疑问请私信我或者评论。