Crawling Zhihu questions and answers with Scrapy and writing them to MySQL asynchronously

This post shows how to crawl Zhihu questions and answers with the Scrapy framework and write the data to a MySQL database asynchronously. The spider first requests Zhihu's captcha endpoint and, when a captcha is required, asks the user to identify the inverted characters; it then logs in, collects question URLs from the home page, follows each one to extract the question details, pages through the answers via Zhihu's JSON API, and finally hands the items to a database pipeline. The crawl itself is concurrent through Scrapy's Twisted event loop, and the blocking MySQL calls are pushed onto a thread pool so they never stall it. Note that the code targets Python 2 (print statements, urlparse, raw_input).
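The listing below is the spider module itself (spiders/zhihu.py in a standard Scrapy project; the package name zhihu_project used in the sketches further down is a placeholder). The items module and the asynchronous MySQL pipeline that the spider and the title rely on are not part of the original listing, so hedged sketches of both follow after the spider code. A typical layout:

    zhihu_project/
        scrapy.cfg
        zhihu_project/
            settings.py       # pipeline registration and MySQL credentials
            items.py          # ZhiHuQuestionItem, ZhiHuAnswerItem
            pipelines.py      # asynchronous MySQL pipeline
            spiders/
                zhihu.py      # the spider shown below

With that in place, the crawl is started with scrapy crawl zhihu.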

# -*- coding: utf-8 -*-
import base64
import json
import re
import urlparse
from datetime import datetime

import scrapy
from scrapy.loader import ItemLoader

from ..items import ZhiHuQuestionItem, ZhiHuAnswerItem


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com']
    # JSON API for a question's answers; the two {} placeholders are the
    # question ID and the paging offset.
    start_answer_url = "https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=20&offset={}&sort_by=default"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
        'Referer': 'https://www.zhihu.com',
        'HOST': 'www.zhihu.com',
        'Authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'
    }
    # Click coordinates (x, y) of the seven character positions in the
    # inverted-character captcha image.
    points_list = [[20, 27], [42, 25], [65, 20], [90, 25], [115, 32], [140, 25], [160, 25]]

    def start_requests(self):
        """
        Override the parent class's start_requests() so that the spider
        starts from the login (captcha) URL.
        :return:
        """
        yield scrapy.Request(
            url='https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',
            callback=self.captcha,
            headers=self.headers,
        )

    def captcha(self, response):
        show_captcha = json.loads(response.body)['show_captcha']
        if show_captcha:
            print u'A captcha is required'
            yield scrapy.Request(
                url='https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',
                method='PUT',
                headers=self.headers,
                callback=self.shi_bie
            )
        else:
            print u'No captcha required'
            # Log in directly.
            post_url = 'https://www.zhihu.com/api/v3/oauth/sign_in'
            post_data = {
                'client_id': 'c3cef7c66a1843f8b3a9e6a1e3160e20',
                'grant_type': 'password',
                'timestamp': '1515391742289',
                'source': 'com.zhihu.web',
                'signature': '6d1d179e50a06d1c17d6e8b5c89f77db34f406ac',
                'username': '',  # account
                'password': '',  # password
                'captcha': '',
                'lang': 'cn',
                'ref_source': 'homepage',
                'utm_source': ''
            }
            yield scrapy.FormRequest(
                url=post_url,
                headers=self.headers,
                formdata=post_data,
                callback=self.index_page
            )

    def shi_bie(self, response):
        try:
            img = json.loads(response.body)['img_base64']
        except Exception, e:
            print 'Failed to read img_base64: %s' % e
        else:
            print 'Got the base64-encoded captcha image'
            # Decode the image and save it locally so the user can look at it.
            img = img.encode('utf-8')
            img_data = base64.b64decode(img)
            with open('zhihu_captcha.GIF', 'wb') as f:
                f.write(img_data)

            captcha = raw_input('Enter the position(s) of the inverted character(s): ')
            if len(captcha) == 2:
                # Two inverted characters.
                first_char = int(captcha[0]) - 1   # index of the first character in points_list
                second_char = int(captcha[1]) - 1  # index of the second character in points_list
                # e.g. input '25' selects positions 2 and 5 and produces
                # '{"img_size":[200,44],"input_points":[[42, 25],[115, 32]]}'
                captcha = '{"img_size":[200,44],"input_points":[%s,%s]}' % (
                    self.points_list[first_char], self.points_list[second_char])
            else:
                # Only one inverted character.
                first_char = int(captcha[0]) - 1
                captcha = '{"img_size":[200,44],"input_points":[%s]}' % (
                    self.points_list[first_char])

            data = {
                'input_text': captcha
            }
            yield scrapy.FormRequest(
                url='https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',
                headers=self.headers,
                formdata=data,
                callback=self.get_result
            )

    def get_result(self, response):
        try:
            yan_zheng_result = json.loads(response.body)['success']
        except Exception, e:
            print 'The captcha POST request failed: {}'.format(e)
        else:
            if yan_zheng_result:
                print u'Captcha accepted'
                post_url = 'https://www.zhihu.com/api/v3/oauth/sign_in'
                post_data = {
                    'client_id': 'c3cef7c66a1843f8b3a9e6a1e3160e20',
                    'grant_type': 'password',
                    'timestamp': '1515391742289',
                    'source': 'com.zhihu.web',
                    'signature': '6d1d179e50a06d1c17d6e8b5c89f77db34f406ac',
                    'username': '',  # account
                    'password': '',  # password
                    'captcha': '',
                    'lang': 'cn',
                    'ref_source': 'homepage',
                    'utm_source': ''
                }
                # The values above (timestamp, signature, ...) have to be
                # captured from the browser's login traffic.
                yield scrapy.FormRequest(
                    url=post_url,
                    headers=self.headers,
                    formdata=post_data,
                    callback=self.index_page
                )
            else:
                print u'Wrong captcha!'

    def index_page(self, response):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                headers=self.headers
            )

    def parse(self, response):
        """
        Extract the URLs of all questions on the home page and follow them
        to crawl the detail pages.
        :param response:
        :return:
        """
        # e.g. /question/19618276/answer/267334062
        all_urls = response.xpath('//a[@data-za-detail-view-element_name="Title"]/@href').extract()
        all_urls = [urlparse.urljoin(response.url, url) for url in all_urls]
        for url in all_urls:
            # https://www.zhihu.com/question/19618276/answer/267334062
            # Extract the detail URL and the question ID in one pass.
            result = re.search('(.*zhihu.com/question/(\d+))', url)
            if result:
                detail_url = result.group(1)
                question_id = result.group(2)
                # Hand the detail URL to the downloader.
                yield scrapy.Request(
                    url=detail_url,
                    headers=self.headers,
                    callback=self.parse_detail_question,
                    meta={
                        'question_id': question_id,
                    }
                )

                # While requesting the detail page, also request the question's
                # answers by ID. Questions and answers live at two independent
                # URLs, and the answers come from a JSON API that can be hit
                # directly, with no need to go through the question URL.
                yield scrapy.Request(
                    # Parameters: question ID and offset. The offset defaults
                    # to 0, i.e. start from the first answer.
                    url=self.start_answer_url.format(question_id, 0),
                    headers=self.headers,
                    callback=self.parse_detail_answer,
                    meta={
                        'question_id': question_id
                    }
                )

            break  # stops after the first question URL (presumably left in for testing)

    def parse_detail_question(self, response):
        """
        Parse the question data on the detail page: title, description,
        view count, follower count and so on.
        :param response:
        :return:
        """
        item_loader = ItemLoader(item=ZhiHuQuestionItem(), response=response)
        item_loader.add_value('question_id', response.meta['question_id'])
        item_loader.add_xpath('question_title', '//div[@class="QuestionHeader"]//h1/text()')
        item_loader.add_xpath('question_topic', '//div[@class="QuestionHeader-topics"]//div[@class="Popover"]/div/text()')
        # Some questions have no description.
        item_loader.add_xpath('question_content', '//span[@class="RichText"]/text()')
        item_loader.add_xpath('question_watch_num', '//button[contains(@class, "NumberBoard-item")]//strong/text()')
        item_loader.add_xpath('question_click_num', '//div[@class="NumberBoard-item"]//strong/text()')
        item_loader.add_xpath('question_answer_num', '//h4[@class="List-headerText"]/span/text()')
        item_loader.add_xpath('question_comment_num', '//div[@class="QuestionHeader-Comment"]/button/text()')
        item_loader.add_value('question_url', response.url)
        item_loader.add_value('question_crawl_time', datetime.now())

        question_item = item_loader.load_item()
        yield question_item

    def parse_detail_answer(self, response):
        """
        Parse all answers of one question ID.
        :param response:
        :return:
        """
        answer_dict = json.loads(response.body)
        is_end = answer_dict['paging']['is_end']
        next_url = answer_dict['paging']['next']

        for answer in answer_dict['data']:
            answer_item = ZhiHuAnswerItem()
            answer_item['answer_id'] = answer['id']
            answer_item['answer_question_id'] = answer['question']['id']
            answer_item['answer_author_id'] = answer['author']['id']
            answer_item['answer_url'] = answer['url']
            answer_item['answer_comment_num'] = answer['comment_count']
            answer_item['answer_praise_num'] = answer['voteup_count']
            answer_item['answer_create_time'] = answer['created_time']
            answer_item['answer_content'] = answer['content']
            answer_item['answer_crawl_time'] = datetime.now()
            answer_item['answer_update_time'] = answer['updated_time']

            yield answer_item

        # If is_end is False, there is another page of answers.
        if not is_end:
            yield scrapy.Request(
                url=next_url,
                headers=self.headers,
                callback=self.parse_detail_answer
            )
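The spider imports ZhiHuQuestionItem and ZhiHuAnswerItem from ..items, but the original post does not show that module. Below is a minimal sketch of items.py, assuming plain scrapy.Field() declarations; the field names are taken verbatim from the spider above, while any input/output processors the author may have used are unknown and omitted:

# items.py -- minimal sketch; field names match the spider, processors omitted.
import scrapy


class ZhiHuQuestionItem(scrapy.Item):
    question_id = scrapy.Field()
    question_title = scrapy.Field()
    question_topic = scrapy.Field()
    question_content = scrapy.Field()
    question_watch_num = scrapy.Field()
    question_click_num = scrapy.Field()
    question_answer_num = scrapy.Field()
    question_comment_num = scrapy.Field()
    question_url = scrapy.Field()
    question_crawl_time = scrapy.Field()


class ZhiHuAnswerItem(scrapy.Item):
    answer_id = scrapy.Field()
    answer_question_id = scrapy.Field()
    answer_author_id = scrapy.Field()
    answer_url = scrapy.Field()
    answer_comment_num = scrapy.Field()
    answer_praise_num = scrapy.Field()
    answer_create_time = scrapy.Field()
    answer_content = scrapy.Field()
    answer_crawl_time = scrapy.Field()
    answer_update_time = scrapy.Field()

One caveat with this bare version: an ItemLoader collects every value into a list, so question_title would arrive as ['...']. In practice each question Field would typically declare output_processor=TakeFirst() (from scrapy.loader.processors) so the pipeline receives scalars.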

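The asynchronous MySQL write promised by the title is likewise not in the listing. The conventional way to do this in Scrapy is Twisted's adbapi connection pool, which runs the blocking MySQLdb calls on a thread pool so the crawl's event loop never waits on the database. The sketch below assumes MySQLdb as the driver and a zhihu_question table; the class name, table name, and column choice are illustrative, not the author's actual code:

# pipelines.py -- asynchronous MySQL pipeline sketched with twisted.enterprise.adbapi.
# Table and column names are assumptions; adapt them to your schema.
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # adbapi runs every MySQLdb call on a Twisted thread pool,
        # so inserts never block the crawler's event loop.
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Fire off the insert asynchronously and attach an error handler.
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        spider.logger.error('MySQL insert failed: %s', failure)

    def do_insert(self, cursor, item):
        # Runs in a worker thread with its own cursor. A real pipeline would
        # branch on the item class (ZhiHuQuestionItem vs ZhiHuAnswerItem);
        # only the question insert is sketched here.
        insert_sql = (
            'INSERT INTO zhihu_question (question_id, question_title, question_url) '
            'VALUES (%s, %s, %s)'
        )
        cursor.execute(insert_sql, (
            item.get('question_id'),
            item.get('question_title'),
            item.get('question_url'),
        ))

runInteraction returns a Deferred, so process_item returns immediately; the insert completes (or fails into handle_error) in the background while the spider keeps crawling.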
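To activate the pipeline, settings.py needs the registration and the connection parameters read by from_settings above; the key names are part of the same sketch and the values are placeholders:

# settings.py -- wiring for the sketched pipeline; all values are placeholders.
ITEM_PIPELINES = {
    'zhihu_project.pipelines.MysqlTwistedPipeline': 300,
}

MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'zhihu'
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''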