Weibo, mobile site: login not successful

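This is a Scrapy spider for the mobile Weibo site: parse tries to log in with FormRequest.from_response, parse1/parse2 query the m.weibo.cn container search API for posts matching a keyword, and parse_comment1/parse_comment2 (currently disabled) walk each post's hot comments with max_id cursor pagination. The login step is the part that never succeeds, hence the title.
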
# -*- coding: utf-8 -*-

import scrapy
from one.items import OneItem
import json
import urllib.parse  # used for urllib.parse.quote below; the original imported urllib.request by mistake
from pyquery import PyQuery

class PublicSentimentSpider(scrapy.Spider):

    name = "publicSentiment"
    # Every URL below lives on m.weibo.cn / passport.weibo.cn, not m.weibo.com.
    # (The misspelt domain went unnoticed only because every request sets
    # dont_filter=True, which also bypasses Scrapy's offsite filter.)
    allowed_domains = ["m.weibo.cn", "passport.weibo.cn"]
    # start_urls holds the links the framework starts from; it must be a list.
    start_urls = ["https://passport.weibo.cn/signin/login?"]

    Referer = {"Referer": "https://m.weibo.cn/p/searchall?containerid=100103type%3D1%26q%3D" + urllib.parse.quote("迪丽热巴")}

    def parse(self, response):
        # NOTE: the signin page builds its form with JavaScript, so
        # FormRequest.from_response may find no <form> in the raw HTML --
        # the most likely reason the login never succeeds (see the title).
        yield scrapy.FormRequest.from_response(
                response,
                formdata={'username': '17649969048', 'password': 'gfwqf31248', 'savestate': '1',
                          'r': 'https://m.weibo.cn/detail/4366060574884436', 'ec': '0',
                          'pagerefer': 'https://passport.weibo.cn/signin/welcome?entry=mweibo&r=https%3A%2F%2Fm.weibo.cn%2Fdetail%2F4366060574884436',
                          'entry': 'mweibo', 'wentry': '', 'loginfrom': '', 'client_id': '', 'code': '',
                          'qq': '', 'mainpageflag': '1', 'hff': ''},
                callback=self.parse1, headers=self.Referer, dont_filter=True)

    def parse1(self, response):
        # Search API for the keyword; the containerid parameter is the
        # percent-encoded string "100103type=1&q=迪丽热巴".
        yield scrapy.Request(url='https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4&page_type=searchall',
                             callback=self.parse2, meta={"page": 1, "keyword": "迪丽热巴"}, dont_filter=True)

    def parse2(self, response):
        base_url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4&page_type=searchall&page="
        # Parse the JSON body (json.loads dropped its encoding argument in Python 3.9).
        results = json.loads(response.text)
        page = response.meta.get("page")
        keyword = response.meta.get("keyword")
        # Next page number (for pagination) and the cards that hold the posts.
        next_page = results.get("data").get("cardlistInfo").get("page")
        result = results.get("data").get("cards")
        # Crawl the posts themselves.
        for j in result:
            card_type = j.get("card_type")
            show_type = j.get("show_type")
            if show_type == 1 and card_type == 11:
                for i in j.get("card_group"):  # each entry is one post
                    # A fresh item per post, so already-yielded items are not mutated.
                    item = OneItem()
                    item["text"] = PyQuery(i.get("mblog").get("text")).text()
                    user = i.get("mblog").get("user").get("screen_name")
                    comments_count = i.get("mblog").get("comments_count")
                    self.logger.debug("post by %s has %s comments", user, comments_count)

                    # Comment crawling is disabled; see parse_comment1 below.
                    # if comments_count:
                    #     id = i.get("mblog").get("id")  # needed to fetch the post's comments
                    #     yield scrapy.Request(
                    #         url="https://m.weibo.cn/comments/hotflow?id=%s&mid=%s&max_id_type=0" % (id, id),
                    #         callback=self.parse_comment1,
                    #         meta={"keyword": keyword, "id": id, "text": item["text"]}, dont_filter=True)
                    # else:
                    yield item

        # Paginate through the post pages (disabled).
        # if page != next_page:
        #     yield scrapy.Request(url=base_url + str(next_page), headers=self.Referer,
        #                          meta={"page": next_page, "keyword": keyword},
        #                          callback=self.parse2, dont_filter=True)


    def parse_comment1(self, response):
        item = OneItem()
        # item['text'] = response.meta['text']
        base_url = "https://m.weibo.cn/comments/hotflow?id=%s&mid=%s&max_id=%s&max_id_type=%s"
        id = response.meta.get("id")
        keyword = response.meta.get("keyword")
        # Parse the JSON body.
        results = json.loads(response.text)

        if results.get("ok"):
            # max_id is the cursor for the next page of comments.
            max_id = results.get("data").get("max_id")
            max_id_type = results.get("data").get("max_id_type")
            list1 = []

            datas = results.get("data").get("data")
            for data in datas:  # collect this page's comments
                comment = PyQuery(data.get("text")).text()
                list1.append(comment)
            item['comment'] = list1
            if max_id:  # more pages: hand the partially filled item on
                # The original called back into parse_comment1 here, which
                # discards the item passed in meta; parse_comment2 is the
                # method that actually accumulates it.
                yield scrapy.Request(url=base_url % (id, id, str(max_id), str(max_id_type)),
                                     callback=self.parse_comment2,
                                     meta={"keyword": keyword, "id": id, "item": item}, dont_filter=True)
            else:
                yield item

    def parse_comment2(self, response):
        # Continue from where parse_comment1 (or a previous page) left off.
        item = response.meta['item']
        list1 = list(item['comment'])  # the original appended the whole list, nesting it
        base_url = "https://m.weibo.cn/comments/hotflow?id=%s&mid=%s&max_id=%s&max_id_type=%s"
        id = response.meta.get("id")
        keyword = response.meta.get("keyword")

        # Parse the JSON body.
        results = json.loads(response.text)

        if results.get("ok"):
            max_id = results.get("data").get("max_id")
            max_id_type = results.get("data").get("max_id_type")

            datas = results.get("data").get("data")
            for data in datas:  # collect this page's comments
                comment = PyQuery(data.get("text")).text()
                list1.append(comment)
            item['comment'] = list1
            if max_id:  # more pages to fetch
                # The original meta omitted "item" here, so the next call's
                # response.meta['item'] would raise KeyError.
                yield scrapy.Request(url=base_url % (id, id, str(max_id), str(max_id_type)),
                                     callback=self.parse_comment2,
                                     meta={"keyword": keyword, "id": id, "item": item}, dont_filter=True)
            else:
                yield item
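
For completeness: the spider imports OneItem from one.items, but the item class itself never appears in the post. From the two fields used above (item["text"] and item["comment"]) it would have to look roughly like this minimal sketch; the field set is inferred, not copied from the original project:

import scrapy

class OneItem(scrapy.Item):
    # Fields inferred from the spider: the post text and its list of comments.
    text = scrapy.Field()
    comment = scrapy.Field()

With that in place, and the usual project layout implied by "from one.items import OneItem" (a Scrapy project named one), the spider runs with scrapy crawl publicSentiment.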
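On the login failure itself: since the signin page renders its form with JavaScript, from_response has nothing to fill in. A hedged sketch of a drop-in replacement for parse that POSTs the login fields directly instead. Both the endpoint URL (https://passport.weibo.cn/sso/login) and the reduced field set are assumptions about how the mobile login worked at the time, not something stated in this post; verify them in the browser's network panel first.

    # ASSUMPTION: the mobile form submitted by plain POST to
    # https://passport.weibo.cn/sso/login; check before relying on it.
    def parse(self, response):
        yield scrapy.FormRequest(
            url="https://passport.weibo.cn/sso/login",
            formdata={'username': '17649969048', 'password': 'gfwqf31248',
                      'savestate': '1', 'entry': 'mweibo', 'mainpageflag': '1'},
            headers={"Referer": "https://passport.weibo.cn/signin/login"},
            callback=self.parse1, dont_filter=True)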
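One more note: the long percent-encoded URLs in parse1 and parse2 both decode to containerid=100103type=1&q=迪丽热巴. A small sketch of building them from the keyword with urllib.parse, so the keyword is not hard-coded three times; the helper name build_search_url is mine, not from the original post:

from urllib.parse import quote

def build_search_url(keyword, page=None):
    # The containerid decodes to: 100103type=1&q=<keyword>.
    # safe="" makes quote() escape '=' and '&' as well.
    containerid = quote("100103type=1&q=" + keyword, safe="")
    url = ("https://m.weibo.cn/api/container/getIndex?"
           "containerid=%s&page_type=searchall" % containerid)
    if page is not None:
        url += "&page=" + str(page)
    return url

build_search_url("迪丽热巴") reproduces the URL in parse1, and build_search_url("迪丽热巴", 2) reproduces base_url + "2" in parse2.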