#coding:utf-8
import http.cookiejar as cookielib
import json
import os
import re
import socket
import time
import urllib.request
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
socket.setdefaulttimeout(60)
file_tieba=open('12535.txt','a+',encoding='utf-8')
key='python'
def login():
    """Build a requests session and a urllib opener that share one cookie jar.

    Cookies persist in the local ``cookies`` file (LWP format) so that an
    earlier Zhihu login can be reused across runs.

    Returns:
        tuple: ``(opener, session)`` — a ``urllib.request.OpenerDirector``
        and a ``requests.Session``, both backed by the same cookie jar.
    """
    agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
             '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    headers = {
        "Host": "www.zhihu.com",
        "Referer": "https://www.zhihu.com/",
        'User-Agent': agent
    }
    session = requests.session()
    session.headers = headers
    session.cookies = cookielib.LWPCookieJar(filename='cookies')
    try:
        session.cookies.load(ignore_discard=True)
    except (FileNotFoundError, cookielib.LoadError):
        # First run (no cookie file yet) or a corrupt file: continue
        # without a saved login.  Was a bare `except:`, which also hid
        # KeyboardInterrupt and genuine bugs.
        print("Cookie 未能加载")
    # Wrap the SAME jar in a urllib opener so both HTTP clients see the
    # same cookies.  (Original spelled the local `hander`.)
    handler = urllib.request.HTTPCookieProcessor(session.cookies)
    opener = urllib.request.build_opener(handler)
    return opener, session
# Log in ONCE and unpack the pair.  The original called login() twice
# (`login()[0]` then `login()[1]`), which built two independent sessions
# and loaded the cookie jar twice — the resulting opener and session did
# not even share a cookie jar.
opener, session = login()
def search(key):
    """Page through Zhihu content search for *key* and crawl each hit.

    Fetches offsets 0, 10, ..., 90 from the /r/search endpoint, parses
    every result fragment with parser() and scrapes each hit (answers +
    comments) via question_infor(), which writes to file_tieba.

    NOTE(review): `result` is declared but never appended to, so this
    always returns an empty list; the real output is the side effect of
    question_infor() writing to file_tieba.
    """
    result=[]
    offset=0
    while offset<100:
        # offset paging: the search endpoint returns 10 hits per page.
        wenti="https://www.zhihu.com/r/search?q={0}&type=content&offset={1}".format(quote(key),offset)
        print(wenti)
        data=opener.open(wenti).read()
        items=parser(data)
        for item in items:
            time.sleep(1)  # throttle between per-item crawls
            # NOTE(review): question_infor() returns None; `question` is unused.
            question=question_infor(item)
        offset+=10
        print(offset)
    session.close()
    return result
def parser(data):
    """Turn a raw /r/search JSON response into a list of answer dicts.

    *data* is the response body whose 'htmls' field holds one HTML
    fragment per search hit.  Each dict carries title, ids, url tokens,
    vote count, author, date and comment count.  Fragments missing any
    expected element are silently skipped (best-effort scraping).
    """
    # Compile once, outside the loop.
    question_pat = re.compile(r'question/([0-9]{1,})/answer')
    answer_pat = re.compile(r'/answer/([0-9]{1,})')
    parsed = []
    for fragment in json.loads(data)['htmls']:
        entry = {}
        try:
            doc = BeautifulSoup(fragment, 'html.parser')
            entry['title'] = doc.find('a').get_text()
            # All remaining fields live inside the 'content' div.
            content = doc.find('div', {'class': 'content'})
            entry['question-id'] = content.find('meta', {'itemprop': 'answer-id'}).get('content')
            entry['href'] = content.find('link', {'itemprop': "url"}).get('href')
            href_line = ('\r\n href: ' + entry['href'])
            # The href embeds both tokens: question/<qid>/answer/<aid>.
            entry['question-url-token'] = question_pat.findall(href_line)[0]
            entry['answer-url-token'] = answer_pat.findall(href_line)[0]
            entry['votecount'] = content.find('span', {'class': 'count'}).get_text()
            try:
                entry['author'] = content.find('a', {'class': 'author author-link'}).get_text()
            except:
                # No author link rendered for anonymous answers.
                entry['author'] = '匿名用户'
            body = content.find('script', {'type': 'text'}).get_text()
            # Strip latin chars, digits and punctuation from the excerpt.
            body = re.sub("[A-Za-z0-9\!\%\[\]\,\。\>\<\/\=\"\-\:\.]", "", body).strip()
            entry['date'] = content.find('a', {'class': 'time text-muted'}).get('data-tooltip')
            entry['answer-comment-count'] = content.find('span', {'class': 'label'}).get_text()
            if '添加评论' in entry['answer-comment-count']:
                # "Add a comment" label means zero comments so far.
                entry['answer-comment-count'] = 0
            parsed.append(entry)
        except:
            # Best effort: drop any fragment that doesn't match the layout.
            continue
    return parsed
def copyone(wenti2):
    """Fetch one v4 answers page and append each answer to file_tieba.

    *wenti2* is a /api/v4/questions/{id}/answers URL (limit=3).  For each
    answer this writes the author name, the upvote count and the cleaned
    answer text.
    """
    data = opener.open(wenti2).read()
    htmls = json.loads(data)
    data = htmls['data']
    Next = htmls['paging']['next']  # next-page URL; currently unused
    for i in data:
        file_tieba.write('\r\n答主:' + i['author']['name'] + ' ')
        # BUG FIX: the label "...人赞同了该回答" means "N people upvoted
        # this answer", and the request's include list asks for
        # voteup_count — the original wrote comment_count here by mistake.
        file_tieba.write(str(i['voteup_count']) + '人赞同了该回答')
        # Answer body arrives as HTML; strip tags and no-break spaces.
        soup = BeautifulSoup(i['content'], 'html.parser')
        file_tieba.write('\r\n ' + soup.text.replace('\xa0', ''))
    time.sleep(2)  # throttle between page fetches
def question_infor(item):
    """Write one search hit's metadata to file_tieba, then crawl its answers.

    *item* is a dict produced by parser().  Writes the hit's title, date,
    ids and author, pages through the question's answers three at a time
    via copyone(), and fetches comments for the hit's answer.

    NOTE(review): item['question-id'] is scraped from an
    itemprop='answer-id' meta tag, so it is presumably an answer id —
    which matches its use as get_comments()'s answer id.
    """
    # One copy of the v4 answers URL; the original duplicated this entire
    # string literal twice.
    api_template = ('https://www.zhihu.com/api/v4/questions/{0}/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={1}&limit=3&sort_by=default')
    file_tieba.write('\r\n 标题: ' + item['title'])
    try:
        file_tieba.write('\r\n date: ' + item['date'])
    except (KeyError, TypeError):
        # date may be missing or None (soup .get() miss); skip silently,
        # but no longer with a bare `except:`.
        pass
    file_tieba.write("\r\n question-id:" + item['question-id'])
    file_tieba.write('\r\n url: ' + item['href'])
    file_tieba.write('\r\n question-url-token: ' + str(item['question-url-token']))
    file_tieba.write('\r\n author: ' + item['author'])
    num = 0
    copyone(api_template.format(item['question-url-token'], num))
    get_comments(item['question-id'])
    # BUG FIX: the original guarded the second fetch with `if num <= 10`,
    # which is always true at num == 0 and therefore fetched exactly one
    # extra page (offset 3) despite the bound.  A while loop pages through
    # offsets 3, 6, 9, 12 as the `<= 10` bound implies was intended.
    while num <= 10:
        num += 3
        copyone(api_template.format(item['question-url-token'], num))
def get_comments(answerid):
    """Fetch all comments for *answerid*, writing each to file_tieba.

    Pages the /r/answers/{id}/comments endpoint until a page's payload
    repeats (the endpoint keeps returning the last page past the end) or
    a request/parse error occurs.

    Returns:
        list: the scraped comment dicts.  BUG FIX: the original declared
        this list but never appended to it, so it always returned [].
    """
    comments = []
    page = 0
    pre = []
    while True:
        try:
            html = opener.open('https://www.zhihu.com/r/answers/%s/comments?page=%s' % (answerid, page)).read()
            data = json.loads(html)['data']
            if data == pre:
                # Identical payload to the previous round: past the end.
                break
            pre = data
            for item in data:
                file_tieba.write('\r\n' + item['author']['name'])
                file_tieba.write('\r\n ' + item['content'])
                file_tieba.write('\r\n' + item['createdTime'])
                comments.append(item)
            print('Get comments', answerid, page, 'ok')
            # NOTE(review): stepping `page` by 30 looks like an offset
            # step; confirm the endpoint doesn't expect page + 1.
            page += 30
            print(page)
            time.sleep(1)  # throttle between comment pages
        except Exception:
            # Was a bare `except:` (also swallowed KeyboardInterrupt);
            # any network/JSON error simply ends paging (best effort).
            break
    return comments
# Run the crawl for the configured keyword and report where the output
# actually went.  BUG FIX: the original printed a hard-coded absolute
# path ("D:\Program Files (x86)\...") that need not match the real
# location of 12535.txt, which is opened relative to the working dir.
result = search(key)
print("文件保存在" + os.path.abspath(file_tieba.name))
file_tieba.close()
如何利用已有的 cookie 模拟登录知乎,并爬取相关问题下所有的答案
最新推荐文章于 2022-04-11 14:05:19 发布