# Weibo, mobile site — login not yet succeeding
# -*- coding: utf-8 -*-
import scrapy
from one.items import OneItem
from urllib.parse import urlencode
import json
import urllib.request
from pyquery import PyQuery
class PublicSentimentSpider(scrapy.Spider):
    """Spider for Weibo's mobile site: log in, search a keyword, and collect
    matching posts (and, via the commented-out hooks, their hot comments).

    Pipeline:
      parse          -> submit the mobile login form
      parse1         -> request the search API for the keyword
      parse2         -> extract post text / author / comment count per card
      parse_comment1 -> first page of a post's hot comments
      parse_comment2 -> subsequent comment pages (accumulates into one item)
    """

    name = "publicSentiment"
    # Every request this spider makes targets *.weibo.cn (m.weibo.cn and
    # passport.weibo.cn); the previous value "m.weibo.com" matched neither host.
    allowed_domains = ["weibo.cn"]
    # Scrapy's entry point; must be a list of URLs.
    start_urls = ["https://passport.weibo.cn/signin/login?"]
    # Referer header the search/comment APIs require; the keyword is URL-quoted.
    Referer = {"Referer": "https://m.weibo.cn/p/searchall?containerid=100103type%3D1%26q%3D"
                          + urllib.parse.quote("迪丽热巴")}

    def parse(self, response):
        """Fill in and submit the mobile-site login form.

        NOTE(security): credentials are hard-coded in source; move them to
        Scrapy settings or environment variables before sharing this code.
        """
        print("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC")
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'username': '17649969048', 'password': 'gfwqf31248', 'savestate': '1',
                      'r': 'https://m.weibo.cn/detail/4366060574884436',
                      'ec': '0',
                      'pagerefer': 'https://passport.weibo.cn/signin/welcome?entry=mweibo&r=https%3A%2F%2Fm.weibo.cn%2Fdetail%2F4366060574884436',
                      'entry': 'mweibo', 'wentry': '', 'loginfrom': '', 'client_id': '',
                      'code': '', 'qq': '', 'mainpageflag': '1', 'hff': ''},
            callback=self.parse1, headers=self.Referer, dont_filter=True)

    def parse1(self, response):
        """After login, query the search API for the hard-coded keyword."""
        yield scrapy.Request(
            url='https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4&page_type=searchall',
            callback=self.parse2,
            meta={"page": 1, "keyword": "迪丽热巴"}, dont_filter=True)

    def parse2(self, response):
        """Parse one page of search results into OneItem objects."""
        item = OneItem()
        base_url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4&page_type=searchall&page="
        # BUGFIX: json.loads() lost its `encoding` argument in Python 3.9;
        # response.text is already a decoded str, so plain loads() is correct.
        results = json.loads(response.text)
        page = response.meta.get("page")
        keyword = response.meta.get("keyword")
        # Next page number (for pagination) and the list of result cards.
        next_page = results.get("data").get("cardlistInfo").get("page")
        result = results.get("data").get("cards")
        for j in result:
            card_type = j.get("card_type")
            show_type = j.get("show_type")
            if show_type == 1 and card_type == 11:
                for i in j.get("card_group"):  # each entry is one Weibo post
                    item["text"] = PyQuery(i.get("mblog").get("text")).text()
                    user = i.get("mblog").get("user").get("screen_name")
                    comments_count = i.get("mblog").get("comments_count")
                    print("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC")
                    print(comments_count)
                    print(user)
                    print("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC")
                    # Disabled: follow each post's hot-comment feed.
                    # if comments_count:
                    #     id = i.get("mblog").get("id")  # needed for the comment feed URL
                    #     yield scrapy.Request(
                    #         url="https://m.weibo.cn/comments/hotflow?id=%s&mid=%s&max_id_type=0" % (id, id),
                    #         callback=self.parse_comment1,
                    #         meta={"keyword": keyword, "id": id, 'item["text"]': item["text"]},
                    #         dont_filter=True)
                    # else:
                    yield item
        # Disabled: pagination over the search results.
        # if page != next_page:
        #     yield scrapy.Request(url=base_url + str(next_page), headers=self.Referer,
        #                          meta={"page": next_page, "keyword": keyword},
        #                          callback=self.parse2, dont_filter=True)

    def parse_comment1(self, response):
        """Parse the first page of a post's hot comments."""
        item = OneItem()
        # item['text'] = response.meta['item["text"]']
        base_url = "https://m.weibo.cn/comments/hotflow?id=%s&mid=%s&max_id=%s&max_id_type=%s"
        id = response.meta.get("id")
        keyword = response.meta.get("keyword")
        results = json.loads(response.text)  # BUGFIX: `encoding=` removed in Py3.9
        if results.get("ok"):
            max_id = results.get("data").get("max_id")
            max_id_type = results.get("data").get("max_id_type")
            list1 = []
            datas = results.get("data").get("data")
            for data in datas:  # strip HTML from each comment body
                list1.append(PyQuery(data.get("text")).text())
            item['comment'] = list1
            # print(item)
            if max_id:  # more pages: hand the partially-filled item onward
                # BUGFIX: follow-up pages must go to parse_comment2, which reads
                # meta["item"]; looping back to parse_comment1 created a fresh
                # item and discarded the comments collected so far.
                yield scrapy.Request(url=base_url % (id, id, str(max_id), str(max_id_type)),
                                     callback=self.parse_comment2,
                                     meta={"keyword": keyword, "id": id, "item": item},
                                     dont_filter=True)
            else:
                yield item

    def parse_comment2(self, response):
        """Parse a follow-up comment page, accumulating onto the item from meta."""
        item = response.meta['item']
        # BUGFIX: start from a flat copy of the comments gathered so far; the
        # original appended the previous list as a single nested element,
        # producing ever-deeper nesting on every page.
        list1 = list(item['comment'])
        base_url = "https://m.weibo.cn/comments/hotflow?id=%s&mid=%s&max_id=%s&max_id_type=%s"
        id = response.meta.get("id")
        keyword = response.meta.get("keyword")
        print(response.text)
        results = json.loads(response.text)  # BUGFIX: `encoding=` removed in Py3.9
        print(results)
        if results.get("ok"):
            max_id = results.get("data").get("max_id")
            max_id_type = results.get("data").get("max_id_type")
            datas = results.get("data").get("data")
            for data in datas:  # strip HTML from each comment body
                list1.append(PyQuery(data.get("text")).text())
            item['comment'] = list1
            print(item)
            if max_id:  # more pages
                # BUGFIX: propagate the accumulating item; the original meta
                # omitted it, so the next callback crashed on meta['item'].
                yield scrapy.Request(url=base_url % (id, id, str(max_id), str(max_id_type)),
                                     callback=self.parse_comment2,
                                     meta={"keyword": keyword, "id": id, "item": item},
                                     dont_filter=True)
            else:
                yield item