# 本程序利用requests,bs4实现对http://meizitu.com大分类图片集中所有链接进行遍历,然后抓取图片:
# -*- coding:utf-8 -*-
# @Time : 2017-02-10 22:51
# @Author : Vincen_Shen
# @Site :
# @File : mm.py
# @Software : PyCharm
'''
该脚本实现www.meizitu.com自动爬取图片,需要注意的是实际图片存储在mm.howkuai.com。
两个网站都要求get请求时带上headers,否则会被过滤。
'''
from bs4 import BeautifulSoup
import requests
import time
def images_down(urls):
    """
    Download each image URL and save it named by the current Unix timestamp.

    The 1-second sleep before every download means the integer timestamp is
    strictly increasing between iterations, so filenames cannot collide.

    :param urls: list of direct image URLs (served by mm.howkuai.com)
    """
    # The image host filters requests lacking browser-like headers.
    headers = {'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
               'Connection': 'keep-alive',
               'Host': 'mm.howkuai.com'}
    for url in urls:
        time.sleep(1)  # throttle to stay polite and avoid being banned
        print(url)
        try:
            response = requests.get(url, headers=headers, stream=True, timeout=5)
            # Treat HTTP errors (403/404/...) as failures instead of saving
            # an error page with a .jpg extension.
            response.raise_for_status()
            image_name = str(int(time.time())) + '.jpg'
            with open(image_name, 'wb') as f:
                # With stream=True, iter_content writes the image in chunks
                # instead of holding the whole body in memory via .content.
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        except Exception as exc:
            # Report the actual failure (timeout, HTTP error, disk error)
            # rather than an opaque marker, so problems can be diagnosed.
            print('Error!!!', exc)
            continue
def images_urls(url):
    """
    Extract all image URLs from a single gallery page on www.meizitu.com.

    :param url: URL of one gallery page
    :return: list of direct ``src`` URLs of every <img> inside the
             ``id='picture'`` container(s) on that page
    """
    # The site filters requests lacking browser-like headers.
    headers = {'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
               'Connection': 'keep-alive',
               'Host': 'www.meizitu.com'}
    response = requests.get(url, headers=headers, timeout=5)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Traverse the parsed tree directly instead of the original
    # str(find_all(...))[1:-1] round trip through a second BeautifulSoup,
    # which was both fragile and needlessly slow.
    mm_links = []
    for container in soup.find_all(id='picture'):
        for img in container.find_all('img'):
            mm_links.append(img.get('src'))
    return mm_links
def index_urls(last_page=11):
    """
    Crawl the category index pages and download every gallery found on them.

    :param last_page: number of index pages to crawl (pages 1..last_page).
                      Defaults to 11, matching the original hard-coded range.
    """
    # The site filters requests lacking browser-like headers.
    headers = {'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
               'Connection': 'keep-alive',
               'Host': 'www.meizitu.com'}
    page_urls = ['http://www.meizitu.com/a/xinggan_2_%s.html' % i
                 for i in range(1, last_page + 1)]
    for page_url in page_urls:
        try:
            response = requests.get(page_url, headers=headers, timeout=5)
        except Exception as exc:
            # One failing index page should not abort the whole crawl.
            print('Error!!!', exc)
            continue
        response.encoding = 'gbk'  # the site serves GBK-encoded pages
        soup = BeautifulSoup(response.text, 'html.parser')
        # Each class="tit" element holds the link to one gallery page.
        for title in soup.find_all(class_="tit"):
            link = title.find('a')
            if link is None:  # skip malformed entries instead of crashing
                continue
            gallery_images = images_urls(link.get('href'))  # all image URLs of this gallery
            images_down(gallery_images)  # download them
if __name__ == '__main__':
    # Entry point: crawl all index pages and download every gallery image.
    index_urls()