我今天发现一个网站很好爬,非常适合新手:我没有设置请求头之类的东西,爬了很多遍也没有被封ip,也没有给我返回403之类的错误,所以它对我们第一次玩爬虫的人来说很友好。这个网站就是(手动滑稽)。
废话不多说,直接上代码:
import requests
from lxml import etree
import re
import os
# Custom request headers: every request in this script presents a desktop
# Chrome User-Agent instead of the default one sent by `requests`.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/83.0.4103.106 Safari/537.36'
    ),
}
# Download a single image
def download(link, num, timeout=10):
    """Fetch one image and save it as ``<num + 1>.jpg`` in the current directory.

    Args:
        link: URL of the image to download.
        num: Zero-based index of the image; the file is named ``num + 1``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. A success or failure message is printed instead. Network, HTTP
        and file-system errors are reported and swallowed so that one bad
        image does not abort the rest of the crawl.
    """
    filename = '{}.jpg'.format(num + 1)
    try:
        r = requests.get(link, headers=headers, timeout=timeout)
        # Reject 4xx/5xx answers so an error page is never saved as a .jpg.
        r.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(r.content)
    except (requests.RequestException, OSError) as exc:
        print('下载失败!', exc)
        return
    print('{}.jpg-----下载成功'.format(num + 1))
def _image_url(prefix, index):
    """Build the URL of the ``index``-th image from the gallery's URL prefix."""
    # The first image is named "0.jpg"; the following ones use a zero-padded
    # three-digit index ("001.jpg", "010.jpg", ...).
    # NOTE(review): for index >= 100 this yields "100.jpg" rather than the
    # four-character "0100.jpg" the old branch produced - confirm on the site.
    if index == 0:
        return prefix + '0.jpg'
    return prefix + '{:03d}.jpg'.format(index)


def get_img(url, timeout=10):
    """Download every image of the gallery page at ``url``.

    The page is parsed for the advertised picture count and for the first
    image link; the remaining image URLs are derived from that first link
    and handed to ``download`` one by one.

    Args:
        url: Absolute URL of a gallery page.
        timeout: Seconds to wait for the gallery page before giving up.

    Returns:
        None. If the page cannot be fetched, or the picture count / first
        image link cannot be found, a message is printed and the gallery
        is skipped.
    """
    try:
        req = requests.get(url, headers=headers, timeout=timeout)
        req.raise_for_status()
    except requests.RequestException as exc:
        print('图集请求失败:', url, exc)
        return

    tree = etree.HTML(req.content.decode())
    # Text node holding the picture count, e.g. "...张照片"
    count_text = tree.xpath('//span[@style="color: #DB0909"]/text()')
    # Image links present in the gallery markup
    links = tree.xpath('//ul[@id="hgallery"]/img/@src')

    # Only accept digits in front of "张照片" so stray text cannot break int().
    match = re.search(r'(\d+)\s*张照片', count_text[0]) if count_text else None
    if match is None or not links:
        print('未找到图片信息:', url)
        return

    total = int(match.group(1))
    # Strip the last five characters (the "0.jpg" of the first image) to get
    # the prefix shared by all image URLs of this gallery.
    prefix = links[0][:-5]
    for num in range(total):
        download(_image_url(prefix, num), num)
def _safe_dir_name(title):
    """Return ``title`` made safe for use as a directory name on Windows."""
    # Replace the characters Windows forbids in file names, then drop the
    # trailing spaces/dots that Windows does not allow at the end of a name.
    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title.strip()).rstrip(' .')
    return cleaned or 'untitled'


def main(pages=100, base_dir='F:\\图片文件\\图片', timeout=10):
    """Crawl the gallery listing pages and download every gallery found.

    For each listing page, every gallery link is visited; a directory named
    after the gallery title is created under ``base_dir`` and the gallery's
    images are downloaded into it via ``get_img``.

    Args:
        pages: Number of listing pages to walk, starting at page 1.
        base_dir: Directory under which one sub-directory per gallery is
            created (change this to wherever you want the images stored).
        timeout: Seconds to wait for each listing page before giving up.

    Returns:
        None. Listing pages that cannot be fetched are reported and skipped.
    """
    # Resolve once: the working directory changes below, so a relative
    # base_dir would otherwise nest deeper with every gallery.
    base_dir = os.path.abspath(base_dir)
    anchor_xpath = ('//div[@id="listdiv"]/ul/li[@class="galleryli"]'
                    '/div[@class="galleryli_title"]/a')

    for page in range(1, pages + 1):
        # The first listing page has no page number in its URL.
        if page == 1:
            url = 'https://*****/gallery/'
        else:
            url = 'https://*****/gallery/{}.html'.format(page)

        try:
            req = requests.get(url, headers=headers, timeout=timeout)
            req.raise_for_status()
        except requests.RequestException as exc:
            print('页面请求失败:', url, exc)
            continue

        tree = etree.HTML(req.content.decode('utf-8'))
        # Read link and title from the same <a> element so the two can never
        # get out of step with each other.
        for anchor in tree.xpath(anchor_xpath):
            href = anchor.get('href')
            if not href:
                continue
            title = ''.join(anchor.xpath('text()'))

            # Create the directory that stores this gallery's images
            path = os.path.join(base_dir, _safe_dir_name(title))
            os.makedirs(path, exist_ok=True)
            # download() writes into the current working directory
            os.chdir(path)
            print('------------{}----------正在下载-----'.format(title))
            # Build the absolute gallery URL from the site-relative link
            get_img('https://*****' + href)
# Script entry point: start the crawl only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
因为写得匆忙,没有写多线程,爬取会有点慢,等有时间再补一个多线程爬虫。python批量爬取妹子图(小白也能看懂)