Crawling Zhihu with Scrapy

I crawled Zhihu with the Scrapy framework. The spider logs in first (solving the Chinese click-captcha with zheye), extracts question links from the homepage, parses each question page, and then pages through the answers via Zhihu's v4 API. The spider file is below.

import datetime
import json
import re
import time
from urllib import parse

import scrapy
from scrapy.loader import ItemLoader

from ArticleSpider.items import ZhihuquesitionItem, ZhihuanswerItem


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    start_answer_url ='https://www.zhihu.com/api/v4/questions/{0}/answers?sort_by=default&include=data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,upvoted_followees;data[*].mark_infos[*].url;data[*].author.follower_count,badge[?(type=best_answerer)].topics&limit={1}&offset={2}'
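    # ^ Zhihu's v4 answers API endpoint. The three format placeholders are
    #   {0} = question id, {1} = page size (limit), {2} = offset;
    #   parse_question fills them in as start_answer_url.format(question_id, 20, 0).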
    headers = {
        'Host': 'www.zhihu.com',
        'Referer': 'https://www.zhihu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    }
    login_url = 'https://www.zhihu.com/login/phone_num'
    def parse(self, response):
        all_urls = response.css("a::attr(href)").extract()
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        all_urls = filter(lambda x: x.startswith("https"), all_urls)
        for url in all_urls:
            # e.g. "https://www.zhihu.com/question/<id>/answer/..." ->
            # group(1) is the question page URL, group(2) the question id
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", url)
            if match_obj:
                question_url = match_obj.group(1)
                question_id = match_obj.group(2)
                yield scrapy.Request(question_url, headers=self.headers, callback=self.parse_question,
                                     meta={"question_id": question_id, 'cookiejar': response.meta['cookiejar']})
            else:
                pass
                # to crawl non-question pages depth-first instead of dropping them:
                # yield scrapy.Request(url, headers=self.headers, callback=self.parse,
                #                      meta={'cookiejar': response.meta['cookiejar']})


    def parse_question(self, response):
        question_id = response.meta.get("question_id", "")
        item_loader = ItemLoader(item=ZhihuquesitionItem(), response=response)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("topics", "div.Tag .Popover div::text")
        item_loader.add_value("url", response.url)
        item_loader.add_css("title", ".QuestionHeader-main .QuestionHeader-title::text")
        item_loader.add_xpath("content", "//span[@class='RichText']//text()")
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard.QuestionFollowStatus-counts .NumberBoard-item:nth-child(1) div:nth-child(2)::text")
        item_loader.add_css("click_num", ".NumberBoard.QuestionFollowStatus-counts .NumberBoard-item:nth-child(3) div:nth-child(2)::text")
        item_loader.add_value("crawl_time", datetime.datetime.now())
        question_item = item_loader.load_item()

        # request the first page of answers for this question
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                             callback=self.parse_answer, meta={"cookiejar": response.meta['cookiejar']})
        # hand the question item off to the item pipeline
        yield question_item

    def parse_answer(self, response):
        answer_json = json.loads(response.text)
        is_end = answer_json['paging']['is_end']
        next_url = answer_json['paging']['next']

        for answer in answer_json['data']:
            answer_item = ZhihuanswerItem()
            answer_item['zhihu_id'] = answer['id']
            answer_item['url'] = answer['url']
            answer_item['question_id'] = answer['question']['id']
            answer_item['author_id'] = answer['author'].get('id')  # some answers carry no author id
            answer_item['content'] = answer.get('content')
            answer_item['prasise_num'] = answer['voteup_count']
            answer_item['comments_num'] = answer['comment_count']
            answer_item['create_time'] = datetime.datetime.fromtimestamp(answer['created_time']).strftime("%Y-%m-%d %H:%M:%S")
            answer_item['update_time'] = datetime.datetime.fromtimestamp(answer['updated_time']).strftime("%Y-%m-%d %H:%M:%S")
            answer_item['crawl_time'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            yield answer_item

        # keep paging until the API reports the last page
        if not is_end:
            yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer,
                                 meta={'cookiejar': response.meta['cookiejar']})
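
    # For reference, parse_answer relies only on this slice of the answers
    # API response (key names are exactly those accessed above; the real
    # payload carries many more fields):
    # {
    #   "paging": {"is_end": false, "next": "<url of the next page>"},
    #   "data": [
    #     {"id": ..., "url": ..., "question": {"id": ...},
    #      "author": {"id": ...}, "content": "<answer HTML>",
    #      "voteup_count": ..., "comment_count": ...,
    #      "created_time": <unix timestamp>, "updated_time": <unix timestamp>}
    #   ]
    # }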


    def start_requests(self):
        # session cookies are tracked per-request through the 'cookiejar' meta
        # key, so nothing needs to be hard-coded here; the crawl starts by
        # loading the homepage and then attempting to log in
        return [scrapy.Request('https://www.zhihu.com/', callback=self.login, headers=self.headers, meta={'cookiejar': 1})]


    def login(self, response):
        if response.css("div.verification.input-wrapper"):
            match_obj = re.match('.*name="_xsrf" value="?(.*)"/>', response.text, re.DOTALL)
            _xsrf = match_obj.group(1) if match_obj else None
            if _xsrf:
                post_data = {
                    '_xsrf': _xsrf,
                    'password': 'your_password',    # replace with real credentials
                    'phone_num': 'your_phone_num',
                    'captcha': ""
                }
                random_num = str(int(time.time() * 1000))  # cache-busting timestamp
                # yield keeps this callback a generator, which Scrapy iterates
                # to schedule the captcha download
                yield scrapy.Request("https://www.zhihu.com/captcha.gif?r={}&type=login&lang=cn".format(random_num),
                                     headers=self.headers, callback=self.login_after_captcha,
                                     meta={'post_data': post_data, 'cookiejar': response.meta['cookiejar']})
        else:
            # already logged in: start the real crawl
            for url in self.start_urls:
                yield scrapy.Request(url, headers=self.headers, dont_filter=True,
                                     meta={'cookiejar': response.meta['cookiejar']})

    def login_after_captcha(self, response):
        with open("captcha.jpg", "wb") as f:
            f.write(response.body)
        # zheye recognizes the positions of the upside-down characters in
        # Zhihu's Chinese click-captcha; imported here because it is only
        # needed on this code path
        from zheye import zheye
        z = zheye()
        positions = z.Recognize("captcha.jpg")
        post_data = response.meta.get("post_data", {})
        # zheye returns (y, x) pairs measured on the 2x-scaled image, so each
        # coordinate is halved and swapped into (x, y) order; with two flipped
        # characters the points are ordered by x coordinate
        if len(positions) == 2:
            if positions[0][1] < positions[1][1]:
                post_data['captcha'] = '{"img_size":[200,44],"input_points":[[%.2f,%.2f],[%.2f,%.2f]]}' % (
                    positions[1][1] / 2, positions[1][0] / 2, positions[0][1] / 2, positions[0][0] / 2)
            else:
                post_data['captcha'] = '{"img_size":[200,44],"input_points":[[%.2f,%.2f],[%.2f,%.2f]]}' % (
                    positions[0][1] / 2, positions[0][0] / 2, positions[1][1] / 2, positions[1][0] / 2)
        else:
            # single flipped character: keep the same nested-list shape
            post_data['captcha'] = '{"img_size":[200,44],"input_points":[[%.2f,%.2f]]}' % (
                positions[0][1] / 2, positions[0][0] / 2)
        post_data['captcha_type'] = 'cn'
        post_data['source'] = 'index_nav'
        return [scrapy.FormRequest(
            url=self.login_url,
            formdata=post_data,
            headers=self.headers,
            callback=self.login_check,
            meta={'cookiejar': response.meta['cookiejar']}
        )]

    def login_check(self, response):
        # the login endpoint answers with JSON; on success, request the start pages
        re_txt = json.loads(response.text)
        if "msg" in re_txt and re_txt['msg'] == "登录成功":  # "login successful"
            for url in self.start_urls:
                yield scrapy.Request(url, headers=self.headers, dont_filter=True,
                                     meta={'cookiejar': response.meta['cookiejar']})
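
The spider imports ZhihuquesitionItem and ZhihuanswerItem from ArticleSpider.items, which isn't shown above. As a minimal sketch, assuming plain Field declarations with no ItemLoader input/output processors (the real items module may well define some, e.g. to join the content text fragments):

import scrapy

class ZhihuquesitionItem(scrapy.Item):
    # question fields, filled in parse_question
    zhihu_id = scrapy.Field()
    topics = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    answer_num = scrapy.Field()
    comments_num = scrapy.Field()
    watch_user_num = scrapy.Field()
    click_num = scrapy.Field()
    crawl_time = scrapy.Field()

class ZhihuanswerItem(scrapy.Item):
    # answer fields, filled in parse_answer
    zhihu_id = scrapy.Field()
    url = scrapy.Field()
    question_id = scrapy.Field()
    author_id = scrapy.Field()
    content = scrapy.Field()
    prasise_num = scrapy.Field()  # name kept as the spider spells it
    comments_num = scrapy.Field()
    create_time = scrapy.Field()
    update_time = scrapy.Field()
    crawl_time = scrapy.Field()

With the spider saved in the project's spiders directory, the crawl is started from the project root with:

scrapy crawl zhihu

Note that the 'cookiejar' meta key only works while Scrapy's cookies middleware is enabled, i.e. COOKIES_ENABLED = True in settings.py (which is the default).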