# 直接上代码吧,复制到 PyCharm 中,install 相关的库就可以直接运行!
# 已知问题:爬取约 1.2GB 资源后,后面的图片会全部重复,原因不明,也许是该网站的资源有限。
import requests
from bs4 import BeautifulSoup
import os
import urllib.request
############批量爬取手机图片资源###################
# 爬取页面资源
def getPage(url):
    """Fetch one wallpaper list page and download every theme linked on it.

    Parameters:
        url: URL of a list page such as http://sj.zol.com.cn/bizhi/new_1.html

    Side effects:
        Calls downImg() for each theme, which writes images under IMG_data/.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    try:
        r.raise_for_status()
    except requests.HTTPError:
        # status_code is an int — the original "..." + r.status_code raised TypeError
        print("访问异常:" + str(r.status_code))
        return  # an error page has no theme links worth parsing
    soup = BeautifulSoup(r.text, 'html.parser')
    # theme links carry class="pic"; the last three entries are skipped
    # (presumably non-theme/recommendation links — TODO confirm on the live site)
    items = soup.find_all(class_='pic')
    for item in items[:-3]:
        name = item.em.text
        print("图片主题:" + name)
        the_url = 'http://sj.zol.com.cn' + item.get('href')
        print(the_url)
        downImg(the_url, "IMG_data/" + name)
# 下载图片:
def downImg(url, path):
    """Download every wallpaper of one theme page into directory *path*.

    If *path* already exists the whole theme is skipped — a cheap resume
    mechanism so re-running the script does not re-download finished themes.

    Parameters:
        url:  URL of a theme detail page on sj.zol.com.cn.
        path: target directory, e.g. "IMG_data/<theme name>".
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # BUG FIX: the original checked/created the directory INSIDE the image
    # loop — iteration 1 created it, iteration 2 saw it exists and returned,
    # so at most one image per theme was ever downloaded. The skip check
    # must run once, before any downloading starts.
    if os.path.exists(path):
        return
    r = requests.get(url, headers=headers)
    try:
        r.raise_for_status()
    except requests.HTTPError:
        # str() needed: status_code is an int ("+" on str+int raised TypeError)
        print("访问异常:" + str(r.status_code))
        return  # nothing to parse on an error page
    soup = BeautifulSoup(r.text, 'html.parser')
    # first <span> reads like "(1/NN)" — NN is the number of images in the theme
    num = int(str(soup.span.text).split('/')[1].split(')')[0])
    # first resolution link of the theme; split it around the per-image id so
    # each thumbnail id can be substituted in to build the full-size page URL
    first_href = str(soup.dd.a.get('href'))   # renamed: "type" shadowed the builtin
    url_prefix = first_href.split('_')[0] + '_'
    url_suffix = '_' + first_href.split('_')[-1]
    # makedirs (not mkdir) also creates the missing "IMG_data" parent on first run
    os.makedirs(path)
    for idx in range(num):
        # each thumbnail lives under id="img1", "img2", ...
        thumb = soup.find(id='img' + str(idx + 1))
        thumb_href = thumb.a.get('href')
        img_id = str(thumb_href).split('_')[-1].split('.')[0]
        img_url = 'http://sj.zol.com.cn' + url_prefix + img_id + url_suffix
        print(img_url)
        try:
            page = requests.get(img_url, headers=headers)
            page.raise_for_status()
            # the full-size page's first <img> src is the actual JPEG
            src = BeautifulSoup(page.text, 'html.parser').img.get('src')
            target = path + '/' + str(idx) + '.jpg'
            if not os.path.exists(target):
                # download via requests so the UA header is really sent;
                # the original build_opener() trick configured a discarded
                # opener and never affected urlretrieve
                img = requests.get(src, headers=headers)
                img.raise_for_status()
                with open(target, 'wb') as f:
                    f.write(img.content)
        except (requests.RequestException, AttributeError):
            # best-effort per image: one broken entry must not abort the theme
            print('获取失败!')
    print(path + ",下载完成!")
if __name__ == '__main__':
    # Crawl list pages new_1.html through new_100.html in order.
    for page_no in range(1, 101):
        getPage("http://sj.zol.com.cn/bizhi/new_" + str(page_no) + ".html")