最新的某131网站精美套图爬取代码出炉,截至2020年4月13日有效,之后就不知道啦。
来,各位看客老爷们可以搞一下:
import requests
import json
import re
import time
import os
def find_img_src(html):
    """Collect the ``content="..."`` values of self-closing ``<meta>`` tags.

    The visible script never calls this helper (its only call site is
    commented out); it is kept for inspecting a page's meta information.

    Args:
        html: Page markup as a string.

    Returns:
        A list with one entry per matched ``<meta ... />`` / ``<META ... />``
        tag, in document order.  Each entry is itself a list holding every
        ``content`` value found in that tag; it is empty when the tag has no
        whitespace-free ``content="..."`` attribute.
    """
    # A group alternation is required here: the former character class
    # ``[meta|META]`` matched ANY tag whose name merely started with one of
    # those letters (``<a``, ``<table``, ``<em`` ...).
    meta_tag_pattern = r'<(?:meta|META)\b.*?/>'
    img_url_pattern = r'.+?content="(\S+)"'
    return [
        re.findall(img_url_pattern, tag)
        for tag in re.findall(meta_tag_pattern, html)
    ]
def find_set_span(html):
    """Return the page count advertised by the first paging block in *html*.

    The function looks for ``<div class="paging" ...>...</div>``, takes the
    first ``<span>`` inside it and reads the number that follows the slash in
    the span's text.
    NOTE(review): this assumes the span text has the form ``current/total``
    (for example ``1/45``) -- confirm against the live page markup.

    Args:
        html: Page markup as a string.

    Returns:
        The total page count as an ``int``, or ``None`` when the markup
        contains no paging block at all.

    Raises:
        ValueError: A paging block exists but no page total can be read
            from it.
    """
    paging_pattern = r'<div class="paging".*?</div>'
    span_pattern = r'<span.*?</span>'
    for paging_html in re.findall(paging_pattern, html):
        spans = re.findall(span_pattern, paging_html)
        if not spans:
            raise ValueError("paging block contains no <span>")
        # First text run between '>' and '<', i.e. the span's own text.
        text_match = re.search(r'>(.*?)<', spans[0])
        span_text = text_match.group(1) if text_match else ""
        # Parse the digits after the slash instead of slicing fixed
        # positions, so totals of any width (9, 45, 120, ...) work.
        total_match = re.search(r'/\s*(\d+)', span_text)
        if total_match is None:
            raise ValueError("no page total found in %r" % span_text)
        # Only the first paging block is consulted.
        return int(total_match.group(1))
    return None
# Request headers sent when fetching a gallery's HTML pages.
headers = {
    "User-Agent": "Mozilla/5.0",
    "referer": "https://m.mm131.net",
    "Host": "m.mm131.net",
}

# Walk a range of gallery IDs; the bounds can be adjusted by experiment.
for i in range(5300, 5350):
    time.sleep(2)  # pause between galleries
    main_url = "https://m.mm131.net/xinggan/" + str(i) + ".html"

    # A timeout keeps one stalled connection from hanging the whole run, and
    # a network failure is reported like any other failed page fetch.
    try:
        response = requests.get(main_url, headers=headers, timeout=10)
    except requests.RequestException:
        print(main_url + " 访问错误")
        continue
    if response.status_code != 200:
        print(main_url + " 访问错误")
        continue

    # Determine how many pages (images) the gallery has.  When that fails the
    # gallery is skipped; falling through would reuse an undefined name or
    # the page count left over from the previous gallery.
    try:
        span_number = find_set_span(response.text)
    except (IndexError, ValueError):
        span_number = None
    if span_number is None:
        print(main_url + " 网页解码异常")
        continue
    print(main_url, end="")
    print(" 此套图页数:" + str(span_number))

    # Create the target folder.  A folder that already exists means the
    # gallery is skipped entirely.
    pic_path = r'D:\******\WWW\photo\mm131\%d' % (i)
    if os.path.exists(pic_path):
        print(pic_path + " 目录已存在")
        continue
    os.makedirs(pic_path)
    print(pic_path + " 创建成功")

    # Download the images one by one.
    for sequence in range(1, span_number + 1):
        # Each image is requested with the URL of its own gallery page as
        # the Referer; page 1 has no "_<n>" suffix.
        if sequence == 1:
            referer_url = "https://m.mm131.net/xinggan/" + str(i) + ".html"
        else:
            referer_url = "https://m.mm131.net/xinggan/" + str(i) + "_" + str(sequence) + ".html"
        pic_headers = {
            "User-Agent": "Mozilla/5.0",
            "referer": referer_url,
            "host": "img1.mmmw.net",
        }
        img_url = "https://img1.mmmw.net/pic/" + str(i) + "/" + str(sequence) + ".jpg"

        try:
            pic_response = requests.get(img_url, headers=pic_headers, timeout=10)
        except requests.RequestException:
            pic_response = None
        time.sleep(0.5)  # pause between images

        # ``content`` is never None, so test the status code and for a
        # non-empty body; otherwise error pages would be saved as .jpg files.
        if pic_response is None or pic_response.status_code != 200 or not pic_response.content:
            print(img_url + " 访问错误")
            continue

        # Write the image bytes; ``with`` guarantees the file is closed.
        with open(r'D:\******\WWW\photo\mm131\%d\%d.jpg' % (i, sequence), 'wb') as img_file:
            img_file.write(pic_response.content)