1 #-*- coding: utf-8 -*-
2 importbase643 importjson4 importurlparse5 importre6 from datetime importdatetime7 importscrapy8 from scrapy.loader importItemLoader9 from ..items importZhiHuQuestionItem, ZhiHuAnswerItem10
11
class ZhihuSpider(scrapy.Spider):
    """Spider that logs in to zhihu.com (v3 oauth API, with optional
    inverted-character captcha) and crawls questions and their answers."""
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com']
    # v4 answer-list API template: first {} is the question id, second {} is
    # the paging offset; the url-encoded `include` parameter enumerates the
    # answer fields the API should return. Page size is fixed at limit=20.
    start_answer_url = "https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=20&offset={}&sort_by=default"

    # Headers sent with every request. The Authorization value matches the
    # client_id used in the login form below — presumably the public web
    # client's oauth id required by the v3 API; verify against a live capture.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
        'Referer': 'https://www.zhihu.com',
        'HOST': 'www.zhihu.com',
        'Authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'
    }
    # [x, y] click coordinates of the 7 character slots on the 200x44 captcha
    # image; indexed by (position entered by the operator - 1) in shi_bie().
    points_list = [[20, 27], [42, 25], [65, 20], [90, 25], [115, 32], [140, 25], [160, 25]]
26 defstart_requests(self):27 """
28 重写父类的start_requests()函数,在这里设置爬虫的起始url为登录页面的url。29 :return:30 """
31 yieldscrapy.Request(32 url='https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',33 callback=self.captcha,34 headers=self.headers,35 )36
37 defcaptcha(self, response):38 show_captcha = json.loads(response.body)['show_captcha']39 ifshow_captcha:40 print u'有验证码'
41 yieldscrapy.Request(42 url='https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',43 method='PUT',44 headers=self.headers,45 callback=self.shi_bie46 )47 else:48 print u'没有验证码'
49 #直接进行登录的操作
50 post_url = 'https://www.zhihu.com/api/v3/oauth/sign_in'
51 post_data ={52 'client_id': 'c3cef7c66a1843f8b3a9e6a1e3160e20',53 'grant_type': 'password',54 'timestamp': '1515391742289',55 'source': 'com.zhihu.web',56 'signature': '6d1d179e50a06d1c17d6e8b5c89f77db34f406ac',57 'username': '',#账号58 'password': '',#密码59 'captcha': '',60 'lang': 'cn',61 'ref_source': 'homepage',62 'utm_source': ''
63 }64
65 yieldscrapy.FormRequest(66 url=post_url,67 headers=self.headers,68 formdata=post_data,69 callback=self.index_page70 )71
72 defshi_bie(self, response):73 try:74 img= json.loads(response.body)['img_base64']75 exceptException, e:76 print '获取img_base64的值失败,原因:%s'%e77 else:78 print '成功获取加密后的图片地址'
79 #将加密后的图片进行解密,同时保存到本地
80 img = img.encode('utf-8')81 img_data =base64.b64decode(img)82 with open('zhihu_captcha.GIF', 'wb') as f:83 f.write(img_data)84
85 captcha = raw_input('请输入倒立汉字的位置:')86 if len(captcha) == 2:87 #说明有两个倒立的汉字
88 pass
89 first_char = int(captcha[0]) - 1 #第一个汉字对应列表中的索引
90 second_char = int(captcha[1]) - 1 #第二个汉字对应列表中的索引
91 captcha = '{"img_size":[200,44],"input_points":[%s,%s]}' %(self.points_list[first_char], self.points_list[second_char])92 else:93 #说明只有一个倒立的汉字
94 pass
95 first_char = int(captcha[0]) - 1
96 captcha = '{"img_size":[200,44],"input_points":[%s]}' %(97 self.points_list[first_char])98
99 data ={100 'input_text': captcha101 }102 yieldscrapy.FormRequest(103 url='https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',104 headers=self.headers,105 formdata=data,106 callback=self.get_result107 )108
109 defget_result(self, response):110 try:111 yan_zheng_result = json.loads(response.body)['success']112 exceptException, e:113 print '关于验证码的POST请求响应失败,原因:{}'.format(e)114 else:115 ifyan_zheng_result:116 print u'验证成功'
117 post_url = 'https://www.zhihu.com/api/v3/oauth/sign_in'
118 post_data ={119 'client_id': 'c3cef7c66a1843f8b3a9e6a1e3160e20',120 'grant_type': 'password',121 'timestamp': '1515391742289',122 'source': 'com.zhihu.web',123 'signature': '6d1d179e50a06d1c17d6e8b5c89f77db34f406ac',124 'username': '',#账号125 'password': '',#密码126 'captcha': '',127 'lang': 'cn',128 'ref_source': 'homepage',129 'utm_source': ''
130 }
#以上数据需要在抓包中获取131
132 yieldscrapy.FormRequest(133 url=post_url,134 headers=self.headers,135 formdata=post_data,136 callback=self.index_page137 )138 else:139 print u'是错误的验证码!'
140
141 defindex_page(self, response):142 for url inself.start_urls:143 yieldscrapy.Request(144 url=url,145 headers=self.headers146 )147
148 defparse(self, response):149 """
150 提取首页中的所有问题的url,并对这些url进行进一步的追踪,爬取详情页的数据。151 :param response:152 :return:153 """
154 #/question/19618276/answer/267334062
155 all_urls = response.xpath('//a[@data-za-detail-view-element_name="Title"]/@href').extract()156 all_urls = [urlparse.urljoin(response.url, url) for url inall_urls]157 for url inall_urls:158 #https://www.zhihu.com/question/19618276/answer/267334062
159 #同时提取:详情的url;文章的ID;
160 result = re.search('(.*zhihu.com/question/(\d+))', url)161 ifresult:162 detail_url = result.group(1)163 question_id = result.group(2)164 #将详情url交由下载器去下载网页源码
165 yieldscrapy.Request(166 url=detail_url,167 headers=self.headers,168 callback=self.parse_detail_question,169 meta={170 'question_id': question_id,171 }172 )173
174 #在向详情url发送请求的同时,根据问题的ID,同时向问题的url发送请求。由于问题和答案是两个独立的url。而答案其实是一个JSON的API接口,直接请求即可,不需要和问题url产生联系。
175 yieldscrapy.Request(176 #参数:问题ID,偏移量。默认偏移量为0,从第一个答案开始请求
177 url=self.start_answer_url.format(question_id, 0),178 headers=self.headers,179 callback=self.parse_detail_answer,180 meta={181 'question_id': question_id182 }183 )184
185 break
186
187 defparse_detail_question(self, response):188 """
189 用于处理详情页面关于question问题的数据,比如:问题名称,简介,浏览数,关注者数等190 :param response:191 :return:192 """
193 item_loader = ItemLoader(item=ZhiHuQuestionItem(), response=response)194 item_loader.add_value('question_id', response.meta['question_id'])195 item_loader.add_xpath('question_title', '//div[@class="QuestionHeader"]//h1/text()')196 item_loader.add_xpath('question_topic', '//div[@class="QuestionHeader-topics"]//div[@class="Popover"]/div/text()')197 #获取的问题中,可能会不存在简介
198 item_loader.add_xpath('question_content', '//span[@class="RichText"]/text()')199 item_loader.add_xpath('question_watch_num', '//button[contains(@class, "NumberBoard-item")]//strong/text()')200 item_loader.add_xpath('question_click_num', '//div[@class="NumberBoard-item"]//strong/text()')201 item_loader.add_xpath('question_answer_num', '//h4[@class="List-headerText"]/span/text()')202 item_loader.add_xpath('question_comment_num', '//div[@class="QuestionHeader-Comment"]/button/text()')203 item_loader.add_value('question_url', response.url)204 item_loader.add_value('question_crawl_time', datetime.now())205
206 question_item =item_loader.load_item()207 yieldquestion_item208
209 defparse_detail_answer(self, response):210 """
211 用于解析某一个问题ID对应的所有答案。212 :param response:213 :return:214 """
215 answer_dict =json.loads(response.body)216 is_end = answer_dict['paging']['is_end']217 next_url = answer_dict['paging']['next']218
219 for answer in answer_dict['data']:220 answer_item =ZhiHuAnswerItem()221 answer_item['answer_id'] = answer['id']222 answer_item['answer_question_id'] = answer['question']['id']223 answer_item['answer_author_id'] = answer['author']['id']224 answer_item['answer_url'] = answer['url']225 answer_item['answer_comment_num'] = answer['comment_count']226 answer_item['answer_praise_num'] = answer['voteup_count']227 answer_item['answer_create_time'] = answer['created_time']228 answer_item['answer_content'] = answer['content']229 answer_item['answer_crawl_time'] =datetime.now()230 answer_item['answer_update_time'] = answer['updated_time']231
232 yieldanswer_item233
234 #判断is_end如果值为False,说明还有下一页
235 if notis_end:236 yieldscrapy.Request(237 url=next_url,238 headers=self.headers,239 callback=self.parse_detail_answer240 )