爬虫爬取网址:http://www.gamersky.com/news/201804/1039678.shtml
此网页为静态网页,双层套图,无需登录
下面是套图页面对应的源码截图(每张图片有两层页面结构,截图略):
附上源码:
import requests
from bs4 import BeautifulSoup
import os
import re
def getHtmlurl(url):
    """Fetch a page and return its decoded HTML text.

    Parameters:
        url: the page URL to request.

    Returns:
        The response body as text, or "" on any network/HTTP failure
        (callers treat an empty string as "page unavailable").
    """
    try:
        # timeout prevents the crawler from hanging forever on a dead host
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        # apparent_encoding sniffs the real charset (gamersky pages are GBK-ish)
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # narrow catch: only network/HTTP errors map to the "" sentinel
        return ""
def getpic(html, root='/home/suwex/图片/'):
    """Extract image URLs from an article page and download each image.

    Parameters:
        html: HTML text of one article page (as returned by getHtmlurl).
        root: directory to save images into (default kept for backward
              compatibility with the original hard-coded path).

    Side effects:
        Creates `root` if missing; writes one .jpg per image; prints progress.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # gallery images sit in centered <p> tags inside the article body div
    all_p = soup.find('div', class_='Mid2L_con').find_all('p', attrs={'align': 'center'})
    for p in all_p:
        anchor = p.find('a')
        if anchor is None or not anchor.has_attr('href'):
            continue  # skip paragraphs without an image link
        # the real image URL is packed after '?' in the anchor's href
        img_url = anchor['href'].split('?')[-1]
        # derive a filename from the last URL path segment, capped at 22 chars
        title = img_url.split('/')[-1].split('.')[0]
        print(img_url)
        path = os.path.join(root, title[:22] + '.jpg')
        try:
            # makedirs + exist_ok also handles a missing parent directory
            os.makedirs(root, exist_ok=True)
            if not os.path.exists(path):
                r = requests.get(img_url, timeout=10)
                r.raise_for_status()  # don't save an HTML error page as a .jpg
                with open(path, 'wb') as f:
                    f.write(r.content)
                print("文件保存成功")
            else:
                print("文件已存在")
        except (requests.RequestException, OSError):
            # narrow catch: network failures and filesystem errors only
            print("爬取失败")
def main():
    """Crawl all 11 pages of the article and download every page's images."""
    for i in range(1, 12):
        # first page has no "_N" suffix; pages 2..11 do
        if i == 1:
            url = 'http://www.gamersky.com/news/201804/1039678.shtml'
        else:
            url = 'http://www.gamersky.com/news/201804/1039678_' + str(i) + '.shtml'
        html = getHtmlurl(url)
        print(str(i) + " : ")
        # getHtmlurl returns "" on failure; parsing "" would raise
        # AttributeError inside getpic, so skip failed pages.
        if html:
            getpic(html)  # returns None — don't print it (original printed "None")
# Entry-point guard: lets the module be imported without kicking off the crawl.
if __name__ == "__main__":
    main()