爬取地址:
上源码:
#!/usr/bin/env python
#coding=utf-8
import requests
import os
import re
from bs4 import BeautifulSoup
import argparse
import hashlib
import base64
import gzip
import time
import io
class YirenSpider(object):
    """Crawl artist pages linked from a listing page and save their big picture.

    The workflow (see ``spider``) is: collect the artist-page links from a
    listing page, pull the ``.jpg`` URL out of each page's ``yiren_big_pic``
    div, and write every image to disk under a timestamp-based file name.
    """

    # Browser-like User-Agent sent with every request.
    headers = {"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"}
    # Only links starting with this prefix are treated as artist pages.
    yiren_str = 'http://www.zybus.com/yiren/'
    # Used only to compute the minimum plausible length of an artist link.
    html_suffix = '.html'
    # Value passed as the ``timeout`` argument of ``requests.get`` so that a
    # stalled server cannot block the crawl forever.
    timeout = 10

    def get_html_data(self, url):
        """Fetch *url* and return the raw response body as bytes.

        The request carries ``self.headers`` and ``self.timeout``; any
        exception raised by ``requests.get`` propagates to the caller.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        return response.content

    def get_soup(self, text):
        """Parse *text* with BeautifulSoup using the ``lxml`` parser."""
        return BeautifulSoup(text, 'lxml')

    def get_yiren_htmls(self, url):
        """Return the distinct artist-page links found on the page at *url*.

        Links keep the order in which they first appear on the page.
        """
        soup = self.get_soup(self.get_html_data(url))
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        return self._select_profile_links(hrefs)

    def _select_profile_links(self, hrefs):
        """Keep the artist-page links from *hrefs*, de-duplicated in order.

        A link qualifies when it starts with ``self.yiren_str`` and is at
        least as long as that prefix plus ``self.html_suffix``. ``None``
        entries (anchors without an ``href``) are ignored.
        """
        min_length = len(self.yiren_str) + len(self.html_suffix)
        seen = set()
        selected = []
        for href in hrefs:
            # <a> tags without an href attribute yield None; skip them
            # instead of crashing on len(None).
            if href is None:
                continue
            if len(href) < min_length or not href.startswith(self.yiren_str):
                continue
            if href in seen:
                continue
            seen.add(href)
            selected.append(href)
        return selected

    def get_timestr(self):
        """Return the current local time as ``YYYY-MM-DD_HH:MM:SS``."""
        return time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())

    def get_img_url(self, url):
        """Return the ``.jpg`` URLs found in the ``yiren_big_pic`` divs of *url*.

        Divs from which no ``.jpg`` URL can be extracted are skipped, so the
        returned list contains only usable strings.
        """
        soup = self.get_soup(self.get_html_data(url))
        img_urls = []
        for div in soup.find_all('div', class_='yiren_big_pic'):
            img_url = self._extract_jpg_url(str(div))
            if img_url is not None:
                img_urls.append(img_url)
        return img_urls

    @staticmethod
    def _extract_jpg_url(markup):
        """Return the text between the first ``src="`` and ``.jpg`` in *markup*.

        The result includes the ``.jpg`` suffix. Returns ``None`` when
        *markup* has no ``src`` or no ``.jpg`` after it.
        """
        start = markup.find('src')
        if start == -1:
            return None
        # Skip the three letters of ``src`` plus the ``=`` and the opening
        # quote that follow it in the serialized tag.
        start += 5
        # Search after ``src`` so an earlier ``.jpg`` elsewhere in the markup
        # cannot produce an empty or reversed slice.
        end = markup.find('.jpg', start)
        if end == -1:
            return None
        return markup[start:end + len('.jpg')]

    def download_images(self, urls, dirpath):
        """Download every URL in *urls* into *dirpath*, creating it if needed.

        Each file is named ``<timestamp>.jpg`` using ``get_timestr``.
        """
        os.makedirs(dirpath, exist_ok=True)
        for img_url in urls:
            img_data = self.get_html_data(img_url)
            # The pause throttles requests and, because file names have
            # one-second resolution, keeps consecutive names distinct.
            time.sleep(1)
            filename = self.get_timestr() + '.jpg'
            with open(os.path.join(dirpath, filename), 'wb') as f:
                f.write(img_data)

    def spider(self, url, dirpath):
        """Save the big picture of every artist linked from *url* to *dirpath*."""
        # Collect the artist pages linked from the listing page.
        for page_url in self.get_yiren_htmls(url):
            # Find the big picture on that artist's page, then save it.
            self.download_images(self.get_img_url(page_url), dirpath)
def main():
    """Crawl the hard-coded listing page and save the pictures it leads to."""
    url = 'http://www.zybus.com/dlyr/'
    # Named out_dir rather than dir so the builtin dir() is not shadowed.
    out_dir = '/home/menethis/work/Test/pythonTest/YiRen/dlyr'
    YirenSpider().spider(url, out_dir)


if __name__ == '__main__':
    main()