I used the Scrapy framework to crawl Zhihu; the spider file code is below.
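For context, the spider imports two Item classes from ArticleSpider.items, which the question does not show. A minimal sketch of what they presumably contain, inferred only from the fields the spider fills in below (the real items.py likely also defines input/output processors):

import scrapy

class ZhihuquesitionItem(scrapy.Item):
    # fields loaded in parse_question
    zhihu_id = scrapy.Field()
    topics = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    answer_num = scrapy.Field()
    comments_num = scrapy.Field()
    watch_user_num = scrapy.Field()
    click_num = scrapy.Field()
    crawl_time = scrapy.Field()

class ZhihuanswerItem(scrapy.Item):
    # fields assigned in parse_answer ('prasise_num' keeps the spider's spelling)
    zhihu_id = scrapy.Field()
    url = scrapy.Field()
    question_id = scrapy.Field()
    author_id = scrapy.Field()
    content = scrapy.Field()
    prasise_num = scrapy.Field()
    comments_num = scrapy.Field()
    create_time = scrapy.Field()
    update_time = scrapy.Field()
    crawl_time = scrapy.Field()

The spider file itself: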
import datetime
import json
import re
import time
from urllib import parse

import scrapy
from scrapy.loader import ItemLoader

from ArticleSpider.items import ZhihuquesitionItem, ZhihuanswerItem


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/']

    # Zhihu v4 answer-list API, parameterised by question id, limit and offset
    start_answer_url = 'https://www.zhihu.com/api/v4/questions/{0}/answers?sort_by=default&include=data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,upvoted_followees;data[*].mark_infos[*].url;data[*].author.follower_count,badge[?(type=best_answerer)].topics&limit={1}&offset={2}'
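    # The three placeholders are filled in parse_question, e.g.
    #   self.start_answer_url.format(question_id, 20, 0)
    # requests the first 20 answers of that question; parse_answer then
    # follows the 'next' link the API itself returns.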
    headers = {
        'Host': 'www.zhihu.com',
        'Referer': 'https://www.zhihu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }

    login_url = 'https://www.zhihu.com/login/phone_num'
    def parse(self, response):
        # Collect all links on the page, make them absolute, keep only https ones
        all_urls = response.css("a::attr(href)").extract()
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        all_urls = filter(lambda x: x.startswith("https"), all_urls)
        for url in all_urls:
            # e.g. https://www.zhihu.com/question/52243879/answer/... matches with
            # group(1) = the question page url, group(2) = the question id
            match_obj = re.match(r"(.*zhihu\.com/question/(\d+))(/|$).*", url)
            if match_obj:
                question_url = match_obj.group(1)
                question_id = match_obj.group(2)
                yield scrapy.Request(question_url, headers=self.headers, callback=self.parse_question,
                                     meta={"question_id": question_id, 'cookiejar': response.meta['cookiejar']})
            # else:
            #     yield scrapy.Request(url, headers=self.headers, callback=self.parse,
            #                          meta={'cookiejar': response.meta['cookiejar']})
    def parse_question(self, response):
        question_id = response.meta.get("question_id", "")
        item_loader = ItemLoader(item=ZhihuquesitionItem(), response=response)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("topics", "div.Tag .Popover div::text")
        item_loader.add_value("url", response.url)
        item_loader.add_css("title", ".QuestionHeader-main .QuestionHeader-title::text")
        item_loader.add_xpath("content", "//span[@class='RichText']//text()")
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard.QuestionFollowStatus-counts .NumberBoard-item:nth-child(1) div:nth-child(2)::text")
        item_loader.add_css("click_num", ".NumberBoard.QuestionFollowStatus-counts .NumberBoard-item:nth-child(3) div:nth-child(2)::text")
        item_loader.add_value("crawl_time", datetime.datetime.now())
        question_item = item_loader.load_item()

        # kick off the answer API for this question (first page: limit=20, offset=0)
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                             callback=self.parse_answer, meta={"cookiejar": response.meta['cookiejar']})
        # hand the question item to the item pipeline
        yield question_item
    def parse_answer(self, response):
        answer_json = json.loads(response.text)
        is_end = answer_json['paging']['is_end']
        next_url = answer_json['paging']['next']

        for answer in answer_json['data']:
            answer_item = ZhihuanswerItem()
            answer_item['zhihu_id'] = answer['id']
            answer_item['url'] = answer['url']
            answer_item['question_id'] = answer['question']['id']
            answer_item['author_id'] = answer['author']['id'] if 'id' in answer['author'] else None
            answer_item['content'] = answer['content'] if 'content' in answer else None
            answer_item['prasise_num'] = answer['voteup_count']
            answer_item['comments_num'] = answer['comment_count']
            answer_item['create_time'] = datetime.datetime.fromtimestamp(answer['created_time']).strftime("%Y-%m-%d %H:%M:%S")
            answer_item['update_time'] = datetime.datetime.fromtimestamp(answer['updated_time']).strftime("%Y-%m-%d %H:%M:%S")
            answer_item['crawl_time'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            yield answer_item

        # follow the API's own pagination until it reports the last page
        if not is_end:
            yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer,
                                 meta={'cookiejar': response.meta['cookiejar']})
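    # For reference, parse_answer assumes the v4 payload looks roughly like this
    # (reconstructed from the keys accessed above, not an official schema):
    # {
    #   "paging": {"is_end": false,
    #              "next": "https://www.zhihu.com/api/v4/questions/<id>/answers?...&offset=20"},
    #   "data": [
    #     {"id": ..., "url": ..., "question": {"id": ...}, "author": {"id": ...},
    #      "content": "...", "voteup_count": 0, "comment_count": 0,
    #      "created_time": 1510714851, "updated_time": 1510714851},
    #     ...
    #   ]
    # }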
    def start_requests(self):
        # Entry point: fetch the homepage first so login() can extract _xsrf;
        # meta={'cookiejar': 1} opens a named cookie jar for this session.
        return [scrapy.Request('https://www.zhihu.com/', callback=self.login,
                               headers=self.headers, meta={'cookiejar': 1})]
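    # Note: 'cookiejar' is a meta key understood by Scrapy's built-in
    # CookiesMiddleware; every callback above copies response.meta['cookiejar']
    # into its next request, so the whole crawl shares the session that the
    # login flow below establishes.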
    def login(self, response):
        if response.css("div.verification.input-wrapper"):
            # not logged in yet: pull the _xsrf token out of the login form
            match_obj = re.match(r'.*name="_xsrf" value="(.*?)"', response.text, re.DOTALL)
            if match_obj:
                post_data = {
                    '_xsrf': match_obj.group(1),
                    'password': 'your_password',    # use your own credentials here
                    'phone_num': 'your_phone_num',
                    'captcha': ""
                }
                random_num = str(int(time.time() * 1000))
                # yield rather than return: login() is a generator, and Scrapy
                # schedules every request it yields; login continues in the callback
                yield scrapy.Request("https://www.zhihu.com/captcha.gif?r={}&type=login&lang=cn".format(random_num),
                                     headers=self.headers, callback=self.login_after_captcha,
                                     meta={'post_data': post_data, 'cookiejar': response.meta['cookiejar']})
        else:
            # already logged in: start the actual crawl
            for url in self.start_urls:
                yield scrapy.Request(url, headers=self.headers, dont_filter=True,
                                     meta={'cookiejar': response.meta['cookiejar']})
    def login_after_captcha(self, response):
        # save the captcha image so zheye can recognise the flipped characters
        with open("captcha.jpg", "wb") as f:
            f.write(response.body)

        from zheye import zheye
        z = zheye()
        positions = z.Recognize("captcha.jpg")
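        # zheye (https://github.com/muchrooms/zheye) returns one (row, col) pair
        # per flipped character, i.e. (y, x) -- hence the swapped indices below.
        # The /2 scaling assumes the downloaded gif is twice the size of the
        # 200x44 canvas the captcha endpoint expects coordinates for.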
        post_data = response.meta.get("post_data", {})
        if len(positions) == 2:
            # two flipped characters: emit both click points, ordered by x
            if positions[0][1] < positions[1][1]:
                post_data['captcha'] = '{"img_size":[200,44],"input_points":[[%.2f,%.2f],[%.2f,%.2f]]}' % (
                    positions[1][1] / 2, positions[1][0] / 2, positions[0][1] / 2, positions[0][0] / 2)
            else:
                post_data['captcha'] = '{"img_size":[200,44],"input_points":[[%.2f,%.2f],[%.2f,%.2f]]}' % (
                    positions[0][1] / 2, positions[0][0] / 2, positions[1][1] / 2, positions[1][0] / 2)
        else:
            # single flipped character: same payload shape with one point
            post_data['captcha'] = '{"img_size":[200,44],"input_points":[[%.2f,%.2f]]}' % (
                positions[0][1] / 2, positions[0][0] / 2)
        post_data['captcha_type'] = 'cn'
        post_data['source'] = 'index_nav'
        return [scrapy.FormRequest(
            url=self.login_url,
            formdata=post_data,
            headers=self.headers,
            callback=self.login_check,
            meta={'cookiejar': response.meta['cookiejar']}
        )]
    def login_check(self, response):
        # the login endpoint replies with JSON; msg == "登录成功" means "login successful"
        re_txt = json.loads(response.text)
        if "msg" in re_txt and re_txt['msg'] == "登录成功":
            # logged in: start the real crawl from the homepage
            for url in self.start_urls:
                yield scrapy.Request(url, headers=self.headers, dont_filter=True,
                                     meta={'cookiejar': response.meta['cookiejar']})
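One thing the spider file itself cannot show: the cookiejar handoff only works if Scrapy's cookie handling stays enabled, and zhihu.com blocks the default user agent and disallows crawlers in robots.txt. A minimal settings.py sketch under those assumptions (these are standard Scrapy settings; the values are illustrative, not taken from the question):

# settings.py -- relevant lines only
ROBOTSTXT_OBEY = False   # zhihu.com's robots.txt would otherwise stop the spider
COOKIES_ENABLED = True   # needed for the 'cookiejar' meta key / CookiesMiddleware
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
DOWNLOAD_DELAY = 1       # optional, but polite

With that in place the spider runs as usual:

scrapy crawl zhihu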