一般来说,很多人在学会爬虫之后会去"妹子图"这个网站爬取一些套图作为练习。但最近该网站加入了反爬机制,套图无法下载。在经过很长一段时间的搜寻之后,我发现了一个新的网站:http://www.win4000.com/meinvtag4_1.html
首先导入必要的包:
import requests
from bs4 import BeautifulSoup
import re
import time
import os
获取套图的链接和标题
# Shared request headers: a desktop-browser User-Agent string so the
# site serves normal pages instead of rejecting the obvious bot default.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
}
def geturl():
    """Scrape list pages 1-5 of the gallery site.

    Returns:
        tuple[list[str], list[str]]: (detail-page URLs, album titles),
        index-aligned so ``title[i]`` names the album at ``html_url[i]``.
    """
    # Compile both patterns once, outside the page loop.
    link_pattern = re.compile(r'<a href="(.*?)" target="_blank">')
    # BUG FIX: original had a duplicated assignment
    # (``title_code = title_code = re.compile(...)``) — harmless but a typo.
    title_pattern = re.compile(r'<img alt="(.*?)" data-original')
    html_url = []
    title = []
    # One Session for all five requests so the TCP connection is reused
    # (the original built a fresh Session on every iteration).
    session = requests.Session()
    for i in range(1, 6):
        url = 'http://www.win4000.com/meinvtag4_%d.html' % i
        response = session.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        for item in soup.find_all('div', class_="Left_bar"):
            # Regex over the rendered tag text, as in the original approach.
            item = str(item)
            html_url += link_pattern.findall(item)
            title += title_pattern.findall(item)
    return html_url, title

html_url, title = geturl()
保存套图
# Save every album: fetch each detail page, extract the full-size image
# URLs, and write them under .\bizhi\<album title>\N.jpg.
savepath = '.\\bizhi'
pic_code = re.compile(r'data-original="(.*?)" src')
# One Session reused for every page and image request (the original
# created a new Session per album).
session = requests.Session()
for i, album_url in enumerate(html_url):
    pic_url = []
    response = session.get(album_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    for item in soup.find_all('div', class_="scroll-img-cont scroll-img-cont02"):
        pic_url += pic_code.findall(str(item))
    temppath = savepath + '\\%s' % title[i]
    # exist_ok=True: re-running the script no longer crashes with
    # FileExistsError (the original bare os.makedirs raised).
    os.makedirs(temppath, exist_ok=True)
    for j, thumb in enumerate(pic_url):
        path = temppath + '\\%d.jpg' % (j + 1)
        # Drop the 12-char thumbnail suffix, keep the ".jpg" extension,
        # to build the full-size image URL.
        full_url = thumb[0:-12] + thumb[-4:]
        response = session.get(full_url, headers=headers)
        # Context manager guarantees the file handle is closed even if
        # the write fails (the original open/close pair could leak).
        with open(path, 'wb') as f:
            f.write(response.content)
    print('已经处理好第%d组' % (i + 1))
    print('休息一下')
    time.sleep(0.3)
效果展示