#_*_ coding:utf-8 _*_
import urllib.request
from bs4 import BeautifulSoup
import os
import re
from selenium import webdriver
class Spider:
    """Crawler for the Taobao model ("淘女郎") listing.

    Walks the paginated index, resolves each model's personal domain with a
    headless browser (the profile page is JavaScript-rendered), and downloads
    every photo hosted on img.alicdn.com into a per-model directory.
    """

    def __init__(self):
        # Index page listing top models; pagination is appended as ?page=N.
        self.siteURL = "http://mm.taobao.com/json/request_top_list.htm"

    def getPage(self, pageIndex):
        """Fetch index page *pageIndex* and return it as a BeautifulSoup tree."""
        url = self.siteURL + "?page=" + str(pageIndex)
        print("第%s页淘女郎网址:" % str(pageIndex) + url)
        print("分别为:")
        request = urllib.request.Request(url)
        # NOTE(review): the site appears to serve GBK-encoded pages — confirm.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("gbk")
        return BeautifulSoup(html, "html.parser")

    def getContents(self, pageIndex):
        """Return [name, age, address, link] for every model on page *pageIndex*.

        *link* is the protocol-relative profile URL taken from the anchor href.
        """
        page = self.getPage(pageIndex)
        contents = []
        for entry in page.findAll("p", {"class": "top"}):
            contents.append([
                entry.a.get_text(),      # name
                entry.em.get_text(),     # age
                entry.span.get_text(),   # home address
                entry.a.attrs["href"],   # protocol-relative profile URL
            ])
        return contents

    def getDetailPage(self, infoURL):
        """Fetch a model's detail page and return the decoded HTML."""
        with urllib.request.urlopen(infoURL) as response:
            return response.read().decode("gbk")

    @staticmethod
    def _extract_domain(pageSource):
        """Extract the personal domain ("//mm.taobao.com/...") from the
        rendered profile page HTML; raises IndexError when absent.

        Uses a non-greedy capture group instead of the old
        lstrip("<span>")/rstrip("</span>") pair: str.strip treats its
        argument as a character *set*, which chopped trailing letters such
        as 'a'/'n'/'s'/'p' off the domain itself (e.g. "titikatrina" ->
        "titikatri").
        """
        return re.findall(r'<span>(//mm\.taobao\.com/.*?)</span>', pageSource)[0]

    def get_Peronsal_address(self, detailURL):
        """Return the model's personal domain, or False when she has none.

        The profile page is dynamically loaded, so a plain HTTP fetch plus
        BeautifulSoup cannot see the domain; a headless browser renders it.
        """
        driver = webdriver.PhantomJS()
        try:
            driver.get(detailURL)
            pageSource = driver.page_source
        finally:
            driver.quit()  # always release the browser process
        try:
            return self._extract_domain(pageSource)
        except IndexError as e:
            print("她没有个人域名:" + str(e))
            return False

    def getAllImg(self, detail_Page_link):
        """Fetch the personal domain page and return every <img> tag whose
        src points at img.alicdn.com."""
        request = urllib.request.Request("https:" + detail_Page_link)
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("gbk")
        soup = BeautifulSoup(html, "html.parser")
        return soup.findAll("img", {"src": re.compile("//img.alicdn.com/.*")})

    def saveImgs(self, images, name):
        """Save every image tag in *images* under directory *name*,
        numbering the files 1.jpg, 2.jpg, ..."""
        print("发现", name, "共有", len(images), "张照片")
        for number, image in enumerate(images, start=1):
            imageURL = "http:" + image["src"]
            self.saveImg(imageURL, name + "/" + str(number) + ".jpg")

    def saveImg(self, imageURL, fileName):
        """Download *imageURL* to *fileName* unless the file already exists."""
        if not os.path.exists(fileName):
            try:
                with urllib.request.urlopen(imageURL) as u:
                    data = u.read()
                with open(fileName, 'wb') as f:
                    f.write(data)
                print("正在悄悄保存她的一张图片为", fileName)
            except urllib.error.HTTPError as reason:
                # Best-effort: report the failed image and keep going.
                print(reason)
        else:
            # File left over from a previous run — skip the download.
            print("名为", fileName, "的图片已经成功下载")

    def mkdir(self, path):
        """Create directory *path* if missing.

        Returns True when the directory was created, False when it already
        existed.
        """
        path = path.strip()
        if not os.path.exists(path):
            print("偷偷新建了名字叫做", path, "的文件夹")
            os.makedirs(path)
            return True
        print("名为", path, "的文件夹已经创建成功")
        return False

    def savePageInfo(self, pageIndex):
        """Download the info and photos of every model on index page
        *pageIndex*.

        NOTE(review): as in the original, the loop *breaks* at the first
        model without a personal domain, skipping the rest of the page.
        """
        for item in self.getContents(pageIndex):
            # item = [name, age, address, protocol-relative profile URL]
            print("发现一位模特,名字叫", item[0], "芳龄", item[1], ",她住在", item[2])
            print("正在偷偷地保存", item[0], "的信息")
            print("又意外地发现她的地址是", "https:" + item[3] + "&is_coment=false")
            detailURL = "https:" + item[3] + "&is_coment=false"
            detail_Page_link = self.get_Peronsal_address(detailURL)
            if detail_Page_link is False:
                break
            images_links = self.getAllImg(detail_Page_link)
            self.mkdir(item[0])
            self.saveImgs(images_links, item[0])

    def savePagesInfo(self, start, end):
        """Crawl index pages *start* through *end*, inclusive."""
        for i in range(start, end + 1):
            print("正在偷偷寻找第" + str(i) + "个地方,看看MM们在不在")
            self.savePageInfo(i)
if __name__ == "__main__":
    # Entry guard: crawl index pages 1-2 only when run as a script, so that
    # importing this module for the Spider class does not start a crawl.
    spider = Spider()
    spider.savePagesInfo(1, 2)