直接上代码,接下来有时间单独发一篇进行讲解,文章底部有效果图
仅用于学习,如有侵权立即删除
有两个地方需要改一下
1、header 改成自己浏览器的
2、localpath 改成自己本地任意路径
import requests
import re
import time
import os
"""
爬取目标:http://www.cosplay8.com/pic/
时间:2021/6/23
奥力给
"""
header = {
"User-Agent": "**改成自己的浏览器的header**"
}
# 生成待爬取的列表
def run(category,start,end):
# chinacos 国内cos
wait_url = [
f"http://www.cosplay8.com/pic/chinacos/list_{category}_{i}.html" for i in range(int(start),int(end) + 1)
]
print(wait_url)
url_list = []
for item in wait_url:
print("item:",item)
ret = get_list(item)
print(type(ret))
print(f"已经抓取{len(ret)}条数据")
url_list.extend(ret)
print("url_list:",url_list)
# 获取详情页图片素材,并对抓取到的图片进行保存
for url in url_list:
get_detail(f"http://www.cosplay8.com{url}")
#获取全部详情页链接
def get_list(url):
all_list = []
res = requests.get(url=url,headers=header)
# res.encoding('utf-8')
res = res.text
# print("res:",res)
pattern = re.compile('<li><a href="(.*?)">')
# re.findall()在字符串中找到正则表达式所匹配的所有子串,并返回一个列表
all_list = pattern.findall(res)
print("all_list:",all_list)
return all_list
def get_detail(url):
    """Scrape one detail page: save its first image, then walk every paginated
    sub-page (<stem>_2.html .. <stem>_N.html) and save those images too.

    Bug fix vs. original: sub-page save indices now start at 2. The original
    used ``enumerate(urls)`` starting at 0, so the page-3 image (index 1)
    overwrote the page-1 image that was saved above with index 1.

    :param url: absolute detail-page URL
    """
    res = requests.get(url=url, headers=header)
    # Force UTF-8 so the Chinese title decodes correctly.
    res.encoding = "utf-8"
    html = res.text
    # Total page count, e.g. <span>共5页: </span>
    size_pattern = re.compile('<span>共(.*?)页: </span>')
    # Page title; the site's title format changed at some point, hence this pattern.
    title_pattern = re.compile('<title>(.*?)-Cosplay中国</title>')
    # The big image on a page. Compiled once and reused for every sub-page
    # (the original recompiled it on each loop iteration).
    img_pattern = re.compile("<img src='(.*?)' id='bigimg'")
    try:
        page_size = size_pattern.search(html).group(1)
        title = title_pattern.search(html).group(1)
        first_img = img_pattern.search(html).group(1)
        print("url 对应的数据为:{} {}".format(title, first_img))
        # Save directory — **change to your own local path**
        localpath = '/Users/Bob/PycharmProjects/ModeTest/img'
        path = f'{localpath}/{title}'
        # Create the per-title directory if it does not exist yet.
        if not os.path.exists(path):
            os.makedirs(path)
        # Page 1's image is saved with index 1.
        save_img(path, title, first_img, 1)
        # Sub-page URLs: <stem>_2.html .. <stem>_N.html
        urls = [url[0:url.rindex('.')] + f"_{i}.html" for i in range(2, int(page_size) + 1)]
        print("urls:", urls)
        # start=2 keeps the saved index aligned with the page number.
        for index, child_url in enumerate(urls, start=2):
            try:
                print(child_url)
                child_res = requests.get(url=child_url, headers=header)
                # .text is the decoded str body; .content would be raw bytes —
                # text is right for HTML, content for binary downloads.
                child_html = child_res.text
                child_img = img_pattern.search(child_html).group(1)
                save_img(path, title, child_img, index)
            except Exception as er:
                # Best-effort: one broken sub-page must not abort the rest.
                print("抓取子页:", er)
    except Exception as er:
        print(url, er)
    else:
        print(page_size, title, first_img)
# 把图片保存
def save_img(path, title, first_img, index):
    """Download one image (site-relative URL) and write it to disk
    as <path>/<title>_<index>.png.

    :param path: local directory the image is written into
    :param title: detail-page title, used in the file name
    :param first_img: image URL relative to http://www.cosplay8.com
    :param index: sequence number appended to the file name
    """
    try:
        full_url = f"http://www.cosplay8.com{first_img}"
        response = requests.get(url=full_url, headers=header)
        # .content is the raw bytes of the image body.
        target = f"{path}/{title}_{index}.png"
        with open(target, 'wb') as out:
            out.write(response.content)
    except Exception as e:
        # Best-effort download: report and move on.
        print("save_img", e)
if __name__ == '__main__':
    # Single-page smoke test:
    # get_detail('http://www.cosplay8.com/pic/chinacos/2021/0531/61821.html')
    category = input("请输入分类编号:")  # category number, e.g. 22
    start = input("请输入起始页:")  # first listing page, e.g. 1
    end = input("请输入结束页:")  # last listing page — keep it small
    run(category, start, end)
效果图
参考地址:https://dream.blog.csdn.net/article/details/117918309
在此感谢 梦想橡皮擦 大佬的优秀文章