Result:
Analysis
We send a GET request with requests.get() and receive the HTML page via response.text. XPath then picks out the <a> tags we need; we call requests.get() again on each image link to fetch the resource, write it to disk with file I/O, and increment the page number in the URL with a for loop to fetch every page.
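A minimal sketch of that flow (the album URL and the bare //img/@src XPath here are illustrative placeholders, not the site's real structure; the full working sources follow below):

import requests
from lxml import etree

# any common browser User-Agent string works here
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

for page in range(1, 4):                               # for loop over page numbers
    response = requests.get('https://example.com/album/%d' % page, headers=headers)
    html = etree.HTML(response.text)                   # parse the HTML page
    img_url = html.xpath('//img/@src')[0]              # XPath picks out the image link
    image = requests.get(img_url, headers=headers)     # second GET fetches the resource
    with open(img_url.rsplit('/', 1)[-1], 'wb') as f:  # I/O writes the bytes to a file
        f.write(image.content)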
Source code
Method 1
# author: LiuShihao
# date: 2020/12/4 12:57 PM
# youknow: Folks, someone once offered 300 million for this code and I didn't sell. Now I'm sharing it with everyone, asking for nothing but a free little red heart (a like). Thanks in advance.
# desc:
import requests
import time
from lxml import etree
import os.path
# multithreading
from multiprocessing.dummy import Pool as ThreadPool
"""
目标网站:https://www.mzitu.com
https://www.mzitu.com/211011
https://www.mzitu.com/225958
211050、217280
[201645,211050,210888,251535,217479]
"""
# driverfile_path = r'/Users/LiuShihao/Downloads/ChromeDriver/chromedriver'
# option = webdriver.ChromeOptions()
# option.add_experimental_option('useAutomationExtension', False)
# option.add_experimental_option('excludeSwitches', ['enable-automation'])
# driver = webdriver.Chrome(executable_path=driverfile_path, options=option)
# headers = {
# "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
# }
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
           'Referer': 'https://www.mzitu.com/222157'}
# url = r'https://www.mzitu.com/205245/%d'
def download():
    start = time.time()
    # start 4 workers; with no argument the pool defaults to the number of CPU cores
    pool = ThreadPool(4)
    # map(): the first argument is a function, the second an iterable of its arguments
    pool.map(downloadfile, range(51))
    pool.close()
    pool.join()
    print('time consume %s' % (time.time() - start))
def downloadfile(page):
    try:
        url = r'https://www.mzitu.com/222157/%d'
        print(url)
        url1 = url % page
        response = requests.get(url1, headers=headers)
        # fetch the page HTML
        # content = driver.page_source
        content = response.text
        # tip: change text() to @href in an XPath to get a link directly, no text() needed
        h2 = etree.HTML(content).xpath('//h2/text()')[0]
        title = h2.rsplit('(')[0]
        # print("title:", title)
        img_url = etree.HTML(content).xpath('//img[@class="blur"]/@src')[0]
        print(f"downloading {img_url}")
        # download the image
        image = requests.get(img_url, headers=headers)
        file = image.content
        # build the file name
        file_name = img_url.rsplit('/')[-2] + img_url.rsplit('/')[-1]
        # create the directory if it does not exist (makedirs also creates ./images itself)
        if not os.path.exists("./images/" + title + "/"):
            os.makedirs("./images/" + title + "/")
        # write the file
        with open('./images/' + title + "/" + file_name, 'wb') as wfile:
            wfile.write(file)
        print(f"{file_name} downloaded")
        # time.sleep(2)
    except IndexError:
        # the first page past the album's end has no matching elements
        print(f"page {page} not found; album finished")
if __name__ == '__main__':
    download()
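multiprocessing.dummy wraps a thread pool in the multiprocessing API; the same four-worker fan-out could also be written with the standard library's concurrent.futures, roughly like this (a sketch, reusing the downloadfile above):

from concurrent.futures import ThreadPoolExecutor
import time

def download():
    start = time.time()
    with ThreadPoolExecutor(max_workers=4) as pool:
        # list() drains the lazy iterator so worker exceptions surface here
        list(pool.map(downloadfile, range(51)))
    print('time consume %s' % (time.time() - start))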
Method 2
# author: LiuShihao
# date: 2020/12/4 2:38 PM
# youknow: Folks, someone once offered 300 million for this code and I didn't sell. Now I'm sharing it with everyone, asking for nothing but a free little red heart (a like). Thanks in advance.
# desc:
import requests
import time
from lxml import etree
import os.path
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
           'Referer': 'https://www.mzitu.com/'}
def download(url):
    response = requests.get(url, headers=headers)
    # fetch the page HTML
    content = response.text
    ul = etree.HTML(content).xpath('//ul[@id="pins"]/li')
    print("ul type:", type(ul), "ul length:", len(ul))
    try:
        for li in ul:
            href = li.xpath('./a/@href')[0]
            print("href:", href)
            # e.g. https://www.mzitu.com/223872
            download2(href)
    except IndexError:
        print("Crawling finished!")
def download2(url):
    print("----", url)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
               'Referer': url}  # the Referer should point at the album page itself
    response = requests.get(url, headers=headers)
    content = response.text
    h2 = etree.HTML(content).xpath('//h2/text()')[0]
    title = h2.rsplit('(')[0]
    # print("title:", title)
    img_url = etree.HTML(content).xpath('//img[@class="blur"]/@src')[0]
    downloadFile(img_url, title)
    # time.sleep(1)
    page_list = etree.HTML(content).xpath('//div[@class="pagenavi"]/a')
    # the next-to-last link in the page navigation holds the highest page number
    max_page = page_list[-2]
    # print("max_page:", max_page)
    max_page_num = int(max_page.xpath('./span/text()')[0])
    print("max_page_num:", max_page_num)
    # page 1 is the cover fetched above; the remaining pages live at /2 .. /max
    for i in range(2, max_page_num + 1):
        # print("url:", url + r'/%d' % i)
        response = requests.get(url + r'/%d' % i, headers=headers)
        content = response.text
        h2 = etree.HTML(content).xpath('//h2/text()')[0]
        title = h2.rsplit('(')[0]
        # print("title:", title)
        img_url = etree.HTML(content).xpath('//img[@class="blur"]/@src')[0]
        # print(img_url)
        downloadFile(img_url, title)
    print(title, "finished crawling!")
def downloadFile(img_url, title):
    print(f"downloading {img_url}")
    # download the image
    image = requests.get(img_url, headers=headers)
    file = image.content
    # build the file name
    file_name = img_url.rsplit('/')[-2] + img_url.rsplit('/')[-1]
    # create the directory if it does not exist (makedirs also creates ./images itself)
    if not os.path.exists("./images/" + title + "/"):
        os.makedirs("./images/" + title + "/")
    # write the file
    with open('./images/' + title + "/" + file_name, 'wb') as wfile:
        wfile.write(file)
    print(f"{file_name} downloaded")
if __name__ == '__main__':
    url = 'https://www.mzitu.com/'
    download(url)
    # download2("https://www.mzitu.com/247153")
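Two defensive touches not in the original would harden downloadFile further: raise_for_status() turns the anti-scraping rejection described under "Bug" below into a visible error instead of a saved error page, and mkdir(parents=True, exist_ok=True) tolerates an existing directory (and, in the threaded first version, the race between workers). A sketch, assuming the same module-level headers:

from pathlib import Path
import requests

def downloadFile(img_url, title):
    image = requests.get(img_url, headers=headers)
    image.raise_for_status()                       # fail loudly on 403/404
    target_dir = Path('images') / title
    target_dir.mkdir(parents=True, exist_ok=True)  # create ./images/<title>, ok if it exists
    file_name = img_url.rsplit('/')[-2] + img_url.rsplit('/')[-1]
    (target_dir / file_name).write_bytes(image.content)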
Bug: anti-scraping
Solution:
Modify the request headers: send a real browser User-Agent and a Referer pointing at the same site.
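For example, reusing the browser User-Agent from the commented-out block above and a same-site Referer (the exact strings matter less than looking like a real browser):

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Referer': 'https://www.mzitu.com/',
}
response = requests.get('https://www.mzitu.com/', headers=headers)
print(response.status_code)  # expect 200 once the headers pass the check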