"""
Function: crawl images for a given keyword across a specified page range and save
them under a target directory (keyword as the folder name, image URL as the file name).

Params:
    word            search keyword
    begin_page_num  first page to fetch
    end_page_num    last page to fetch

TODO:
function: the lsp's redemption
author: limenghua
createTime: 2020-12-25 17:20
"""
import os
import re
import urllib.parse

import requests
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  # silence the security warning triggered by verify=False


class LspFighting:
    def __init__(self, word, begin_page_num, end_page_num):
        """
        Initialize the crawler.
        :param word: search keyword
        :param begin_page_num: first page to fetch
        :param end_page_num: last page to fetch
        """
        quoted_word = urllib.parse.quote(word)  # percent-encode the keyword for the query string
        self.url = "https://image.baidu.com/search/acjson?tn=resultjson_com&logid=11432824844719315390&ipn=rj&" \
                   "ct=&is=&fp=result&queryWord=" + quoted_word + "&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&" \
                   "st=-1&z=&ic=0&hd=&latest=&copyright=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&" \
                   "expermode=&force=&rn=30&1608887774681=" + "&word=" + quoted_word
        self.word = word
        self.headers = {  # request headers
            "User-agent": "Mozilla/5.0",
        }
        self.begin_page_num = begin_page_num  # first page
        self.end_page_num = end_page_num  # last page
        self.page_num = 30  # step of the pn offset (results per page)

def decode_url(self, url):
"""
对百度加密后的地址进行解码
:param url:百度加密的url
:return:解码后的url
"""
table = {'w': "a", 'k': "b", 'v': "c", '1': "d", 'j': "e", 'u': "f", '2': "g", 'i': "h",
't': "i", '3': "j", 'h': "k", 's': "l", '4': "m", 'g': "n", '5': "o", 'r': "p",
'q': "q", '6': "r", 'f': "s", 'p': "t", '7': "u", 'e': "v", 'o': "w", '8': "1",
'd': "2", 'n': "3", '9': "4", 'c': "5", 'm': "6", '0': "7",
'b': "8", 'l': "9", 'a': "0", '_z2C$q': ":", "_z&e3B": ".", 'AzdH3F': "/"}
        # Restore the multi-character tokens first, then map the single characters
        url = re.sub(r'(?P<value>_z2C\$q|_z&e3B|AzdH3F)', lambda matched: table.get(matched.group('value')), url)
        return re.sub(r'(?P<value>[0-9a-w])', lambda matched: table.get(matched.group('value')), url)

    def filter_url(self, url):
        """
        Filter out addresses that do not point to an image.
        :param url: url
        :return: bool
        """
        return re.search(r'\.(jpg|jpeg|gif|png)', url) is not None
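    # e.g. filter_url("http://host/a.png?x=1") -> True; filter_url("http://host/a") -> False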

    def run(self):
        """
        Public entry point: fetch each result page and download its images.
        :return:
        """
        while self.begin_page_num <= self.end_page_num:
            print("Requesting page {}:".format(self.begin_page_num + 1))
            # Baidu's acjson endpoint serves rn=30 results per request; page through
            # it with pn = page_index * 30 and gsm = hex(pn).
            current_page_num = self.begin_page_num * self.page_num
            url = self.url + "&pn=" + str(current_page_num) + "&gsm=" + hex(current_page_num)  # append paging params
            res = requests.get(url, headers=self.headers, verify=False, timeout=10)  # fetch the result page
            pic_urls = re.findall('"objURL":"(.*?)",', res.text, re.S)  # pull out the obfuscated image URLs
            save_dir = os.path.join("LearningMaterials", self.word)
            os.makedirs(save_dir, exist_ok=True)  # create the target directory if it is missing
            num = 0  # downloads completed on this page
            for i in range(len(pic_urls)):
                pic_urls[i] = self.decode_url(pic_urls[i])  # decode the obfuscated address
                pic_urls[i] = re.search('src=.*?&', pic_urls[i])  # extract the src source address
                if pic_urls[i] is None:  # skip entries without a src parameter
                    continue
                pic_urls[i] = pic_urls[i].group()[4:-1]  # strip the wrapper: src=xxx& -> xxx
                if not self.filter_url(pic_urls[i]):  # skip non-image addresses
                    continue
                save_path = os.path.join(save_dir, pic_urls[i])
                if os.path.exists(save_path):  # skip files that were already downloaded
                    continue
                url = urllib.parse.unquote(pic_urls[i])  # the extracted value is still percent-encoded
                res = requests.get(url, headers=self.headers, verify=False, timeout=10)  # download the image
                with open(save_path, 'wb') as pic:
                    pic.write(res.content)
                num += 1
            print("Downloaded {} pieces of study material".format(num))
            self.begin_page_num += 1

if __name__ == '__main__':
    word = "美女"  # search keyword ("美女" = "beautiful women")
    begin_page_num = 0  # first page
    end_page_num = 3  # last page
    lsp = LspFighting(word, begin_page_num, end_page_num)
    lsp.run()
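    # The constructor takes any keyword and page range, e.g. (hypothetical values):
    # LspFighting("cats", 0, 2).run()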