# Spider.py
#-*-coding:utf-8-*-
import requests
from bs4 import BeautifulSoup
import time
import json
import os
# Default HTTP headers sent with every request so zhihu.com treats the
# session like a normal browser (IE11 user-agent, AJAX marker, referer).
head = {
    'Accept': '*/*',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'http://www.zhihu.com',
    'Accept-Language': 'zh-CN',
    'Accept-Encoding': 'gzip, deflate',
    'User-Agent': 'Mozilla/5.0(Windows NT 6.1;WOW64;Trident/7.0;rv:11.0)like Gecko',
    'Host': 'www.zhihu.com',
}
class Spider(object):
    """Placeholder crawler class; currently exposes no real behaviour."""

    def haha(self):
        """No-op stub kept for interface compatibility; returns None."""
        pass
class Login(object):
def login_zhihu(self,email,password):
url = "http://www.zhihu.com"
s = requests.session()
html = s.get(url)
soup = BeautifulSoup(html.text,'html.parser')
xrsf = soup.find('input')
##验证码
yanzhen_url = 'https://www.zhihu.com/captcha.gif?r='+str(int(time.time()*1000))
haha = s.get(yanzhen_url,headers = head)
with open('code.jpg','wb') as f:
f.write(haha.content)
print '请输入验证码'
yanzhen = raw_input()
print '验证码:'+yanzhen
login_data = {'_xsrf':xrsf,
'password':password,
'captcha':yanzhen,
'email':email,
'remember_me':'true'}
r = s.post('https://www.zhihu.com/login/email',data=login_data,headers = head)
print r.text
jsonre = json.loads(r.text)
print jsonre["msg"]
return s
class parserHtml(object):
def getAnswerDiv_collection(self,collection,s):
div_list = []
collection_url = "http://www.zhihu.com/collection/"+collection
res = s.get(collection_url)
#print res.text
if not (res.status_code == 200):
return []
#print res.text
soup = BeautifulSoup(res.text,'html.parser')
#找到收藏夹中所有的页数,以获取所有问题答案
page_num = soup.find("div",class_="zm-invite-pager").find_all("span")[-2].find("a").text.encode('utf-8')
page_num = int(page_num)
div_list = []
page = 1
#获取所有的页面中所有的答案
while page <= page_num:
print "正在获取%d/%d页的答案" % (page,int(page_num))
more_url = collection_url + "?page=" + str(page)
res = s.get(more_url)
if not res.status_code == 200:
break
soup_temp = BeautifulSoup(res.text,'html.parser')
div_list_temp = soup.find_all('div',class_='zm-item')
div_list = div_list + div_list_temp
page = page + 1
print "目前获取"+str(len(div_list))+"份答案"
return div_list
def getImgURL_collection(self,div_list):
url_list = []
count = 0
for each in div_list:
count = count + 1
print "正在获取%d/%d个答案的图片" % (count,len(div_list))
#每个答案的全文在隐藏内容中
text_hidden = each.find('textarea',class_="content hidden")
if text_hidden == None:
continue
text_hidden = text_hidden.text.encode('utf-8')
#print text_hidden
soup_temp = BeautifulSoup(text_hidden,"lxml")
img_list_item = soup_temp.find_all("img")
img_list = []
#获取img标签中的src属性
for img_item in img_list_item:
img_list.append(img_item['src'])
print "获取到%d张图片" % len(img_list)
url_list = url_list + img_list
print "目前一共获取到%d张图片" % len(url_list)
return url_list
class Download(object):
def downloadImg(self,url_list,path):
ss = requests.session()
img_count = 0
url_list = url_list[9078:]
for img_url in url_list:
print "正在下载%d/%d张图" % (img_count,len(url_list))
img_content = ss.get(img_url)
name = img_url.split(".")
with open(os.path.join(path,str(img_count) +"."+name[-1]),'wb') as f:
f.write(img_content.content)
img_count = img_count + 1
# __init__.py
#-*-coding:utf-8-*-
import sys
import os
import Spider
# Python 2 hack: reload sys to re-expose setdefaultencoding (removed from
# the module namespace at interpreter startup) so implicit str<->unicode
# conversions of the Chinese text default to UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')
def downCollection(collection):
l = Spider.Login()
s = l.login_zhihu("xxxxxx@xxx.com","xxxxxxxxxx")
parser = Spider.parserHtml()
div_list = parser.getAnswerDiv_collection(collection,s)
print "div_list length %d" % len(div_list)
url_list = parser.getImgURL_collection(div_list)
print "url_list length %d" % len(url_list)
path = os.path.join(os.path.abspath("."),"sou"+collection)
if not os.path.isdir(path):
os.mkdir(path)
download = Spider.Download()
download.downloadImg(url_list,path)
# Script entry point: download collection 12345678 — replace with a real
# collection id (and real credentials in downCollection) before running.
if __name__ == '__main__':
    downCollection("12345678")