Press F12 to inspect the page and watch the Network panel: new AJAX requests appear as you scroll down.
Click the request URL shown in the screenshot above and look at the last section of the Headers tab; comparing several requests shows the URLs are identical except that the page parameter changes.
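To verify that observation, the paged URL can be rebuilt by hand with urllib.parse.urlencode. A minimal sketch, reusing the parameter names captured from the request (the search keyword '水滴' and per_page value come from the full script below):

from urllib.parse import urlencode

base_url = 'http://huaban.com/search/?'
for page in (1, 2, 3):
    params = {'q': '水滴', 'type': 'pins', 'page': page, 'per_page': '20'}
    # Only the page value differs between the captured requests
    print(base_url + urlencode(params))

Each printed URL differs only in page=..., matching what the Headers tab shows.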
The complete script follows:
import os
from hashlib import md5
from multiprocessing.pool import Pool
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
from selenium import webdriver

# Range of result pages to crawl (inclusive)
GROUP_START = 1
GROUP_END = 5

def get_image(page):
    base_url = 'http://huaban.com/search/?'
    params = {
        'q': '水滴',          # search keyword
        'type': 'pins',
        'jk3p8ngq': '',       # token copied from the captured request
        'page': page,
        'per_page': '20',
        'wfl': '1',
    }
    url = base_url + urlencode(params)
    # The results are rendered by JavaScript, so load the page in
    # PhantomJS instead of fetching it with requests
    browser = webdriver.PhantomJS()
    browser.get(url)
    html = browser.page_source
    browser.quit()
    soup = bs(html, 'lxml')
    print(soup.title.string)
    doc = soup.find(attrs={'id': 'waterfall'})
    for items in doc.children:
        # Skip whitespace text nodes; only tags can hold an <img>
        if isinstance(items, Tag) and items.img:
            yield {
                'title': items.a.string,
                'image': items.img['src']
            }

def save_image(item):
    if not os.path.exists(item['title']):  # create one folder per title
        os.mkdir(item['title'])
    try:
        image_url = item['image']
        rsp = requests.get('http:' + image_url)  # src is protocol-relative
        if rsp.status_code == 200:
            # Name the file after the MD5 of its bytes to skip duplicates
            file_path = '{0}/{1}.{2}'.format(
                item['title'], md5(rsp.content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(rsp.content)
            else:
                print('Already downloaded', file_path)
    except requests.ConnectionError:
        print('Failed to download image')

def main(page):
    for item in get_image(page):
        if item['title'] is not None:
            print(item)
            save_image(item)

if __name__ == '__main__':
    pool = Pool()
    # range() excludes its end point, so add 1 to cover GROUP_END
    groups = list(range(GROUP_START, GROUP_END + 1))
    pool.map(main, groups)
    pool.close()
    pool.join()
I originally fetched the page with requests, but it never returned the rendered source (the content is filled in by JavaScript), so I switched to PhantomJS.
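A side note: PhantomJS is no longer maintained, and newer Selenium releases have removed support for it. If webdriver.PhantomJS() fails, headless Chrome does the same job; a minimal sketch, assuming chromedriver is installed and on PATH:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # render the page without opening a window
browser = webdriver.Chrome(options=options)
browser.get('http://huaban.com/search/?q=%E6%B0%B4%E6%BB%B4&type=pins&page=1')
html = browser.page_source  # contains the JavaScript-rendered DOM
browser.quit()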
Reference: 《Python3网络爬虫开发实战》