Scrapy实现微博关键词爬虫(爬虫结果写入mongodb)

爬取字段信息有:

  1. 关键词
  2. 微博ID
  3. 微博内容信息
  4. 微博赞的个数
  5. 微博转发个数
  6. 微博评论个数
  7. 转发微博的转发原因
  8. 微博日期
  9. 转发源ID
  10. 原微博的赞个数
  11. 原微博的评论个数
  12. 原微博的转发个数
  13. 存入数据库的ID值(可忽略)

spiders文件夹下的microBlogSpider.py里这样写:

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Spider, Request, FormRequest
from scrapy.selector import Selector
import datetime
import random
from blogSpider.items import microBlogItem, keyWordItem
from blogSpider.items import keyWordItem
import json

class MicroblogspiderSpider(scrapy.Spider):
    """Crawl weibo.cn keyword-search results one day at a time.

    For every day in [startTime, endTime] a search request is issued; each
    result page is parsed into ``microBlogItem`` objects covering original
    posts, forwarded posts, and forwarded posts with pictures.

    NOTE(review): ``myCookie`` must be replaced with the cookie string copied
    from a logged-in browser session before running.
    """
    name = 'microBlogSpider'
    allowed_domains = ['weibo.cn']
    search_url = 'https://weibo.cn/search/mblog'
    # Default cap of 100 result pages per day.
    max_page = 100
    # Raw "k1=v1; k2=v2; ..." cookie string from the browser.
    myCookie = 'xxxxxxxxxxx'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        # The original values carried stray leading spaces (' max-age=0'); trimmed.
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'weibo.cn',
        'Origin': 'https://weibo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    }

    def _cookie_dict(self):
        """Parse ``myCookie`` ("k1=v1; k2=v2; ...") into a dict.

        Uses ``partition`` so values containing '=' survive intact, and strips
        the leading space left by splitting on ';' (the original
        ``split('=')[1]`` truncated such values and kept the space in keys).
        """
        cookie = {}
        for pair in self.myCookie.split(';')[:-1]:
            key, _, value = pair.partition('=')
            cookie[key.strip()] = value
        return cookie

    @staticmethod
    def _joined(parts):
        """Concatenate text fragments with all whitespace removed."""
        return ''.join(''.join(part.split()) for part in parts)

    @staticmethod
    def _strip_colon(text):
        """Drop the leading ':' that weibo.cn prefixes to post bodies.

        ``startswith`` is safe on an empty string (the original ``text[0]``
        raised IndexError for posts with no text content).
        """
        return text[1:] if text.startswith(':') else text

    @staticmethod
    def _bracket_num(text):
        """Extract the number between '[' and ']' of e.g. '赞[8322]'."""
        return text[text.index('[') + 1:text.index(']')]

    def _set_interaction(self, item, texts):
        """Fill like/comment/forward counters from texts like '赞[12]'."""
        for raw in texts:
            text = ''.join(raw.split())
            if text.startswith('赞['):
                item['numOfSupport'] = self._bracket_num(text)
            elif text.startswith('评论['):
                item['numOfComments'] = self._bracket_num(text)
            elif text.startswith('转发['):
                item['numOfForwarding'] = self._bracket_num(text)

    def _parse_forwarded(self, div, item, content_div, stats_div, reason_div,
                         stats_at_end):
        """Parse a forwarded weibo (with or without picture) into *item*.

        :param content_div: 1-based child-div index holding the original text.
        :param stats_div: child-div index holding the original post's counters.
        :param reason_div: child-div index holding the forwarding reason.
        :param stats_at_end: take the like/forward counters from the end of the
            counter list (no-picture layout) instead of the start.
        :return: False when the source author link is missing and the item
            should be discarded, True otherwise.
        """
        try:
            item['fromBlogId'] = div.xpath(
                'div[%d]/span[@class="cmt"]/a/text()' % content_div
            ).extract()[0]
        except IndexError:  # narrowed from a bare ``except:``
            return False
        item['contentInfo'] = self._strip_colon(self._joined(
            div.xpath('div[%d]/span[@class="ctt"]/text()' % content_div).extract()))
        # Counter texts look like ['赞[8322]', '原文转发[2927]'].
        stats = div.xpath('div[%d]/span[@class="cmt"]/text()' % stats_div).extract()
        if stats_at_end:
            support_text, forward_text = stats[-2], stats[-1]
        else:
            support_text, forward_text = stats[0], stats[1]
        item['originalBlogSupportNum'] = self._bracket_num(support_text)
        item['originalBlogForwardingNum'] = self._bracket_num(forward_text)
        # The last <a> holds e.g. '原文评论[333]'.
        comment_texts = div.xpath('div[%d]/a[last()]/text()' % stats_div).extract()
        item['originalBlogCommentsNum'] = self._bracket_num(comment_texts[0])
        # Forwarding reason is the bare text of the last div; the counters for
        # the forwarding post itself sit in its child elements.
        item['forwardingReason'] = self._joined(
            div.xpath('div[%d]/text()' % reason_div).extract())
        self._set_interaction(
            item, div.xpath('div[%d]/*/text()' % reason_div).extract())
        return True

    def start_requests(self):
        # BUG FIX: the original line read ``keyword = 罗志祥'`` — the opening
        # quote was missing, which is a SyntaxError.
        keyword = '罗志祥'
        startTime = '2020-05-01'
        endTime = '2020-05-02'

        start_time = datetime.datetime.strptime(startTime, '%Y-%m-%d')
        end_time = datetime.datetime.strptime(endTime, '%Y-%m-%d')
        cookie = self._cookie_dict()
        # One search request per day in the inclusive date range.
        while start_time <= end_time:
            day = start_time.strftime('%Y%m%d')
            url = ('{url}?hideSearchFrame=&keyword={keyword}'
                   '&starttime={st}&endtime={et}&sort=hot').format(
                       url=self.search_url, keyword=keyword, st=day, et=day)
            yield scrapy.FormRequest(
                url,
                callback=self.parse_index,
                cookies=cookie,
                headers=self.headers,
                meta={
                    'time': start_time,
                    'keyWord': keyword,
                },
            )
            start_time += datetime.timedelta(days=1)

    def parse_index(self, response):
        """Re-request the day's search results as page 1 for item parsing."""
        if not response.body:
            return
        url = '{url}&page={page}'.format(url=response.url, page=1)
        yield FormRequest(
            url,
            headers=self.headers,
            callback=self.getInfo,
            cookies=self._cookie_dict(),
            meta={
                'time': response.meta['time'],
                'keyWord': response.meta['keyWord'],
            },
        )

    def getInfo(self, response):
        """Parse one result page and yield a ``microBlogItem`` per weibo.

        Every weibo on the page lives in a div whose id starts with "M_";
        the number of child divs distinguishes the post layout:
          3 divs  -> forwarded post with picture
          2 divs  -> forwarded post without picture (first span class 'cmt')
                     or original post with picture (first span class 'ctt')
          1 div   -> original post without picture
        """
        pageInfo = Selector(response)
        for div in pageInfo.xpath('//div[starts-with(@id,"M_")]'):
            microBlogInfo = microBlogItem()
            microBlogInfo['keyWord'] = response.meta['keyWord']
            microBlogInfo['blogDate'] = str(response.meta['time']).split(' ')[0]
            microBlogInfo['microBlogId'] = div.css('a.nk ::text').extract()[0]
            microBlogInfo['originalBlogSupportNum'] = ''
            microBlogInfo['originalBlogCommentsNum'] = ''
            microBlogInfo['originalBlogForwardingNum'] = ''
            microBlogInfo['forwardingReason'] = ''
            microBlogInfo['fromBlogId'] = ''
            divNum = len(div.xpath('div'))
            keep = True

            if divNum == 3:
                # Forwarded weibo with picture: content in div[1], original
                # post counters in div[2], forwarding reason in div[3].
                keep = self._parse_forwarded(
                    div, microBlogInfo,
                    content_div=1, stats_div=2, reason_div=3,
                    stats_at_end=False)
            elif divNum == 2:
                firstSpan = div.xpath('div[1]/span[1]/@class').extract()[0]
                if firstSpan == 'cmt':
                    # Forwarded weibo without picture: content and original
                    # counters share div[1]; reason lives in div[2].
                    keep = self._parse_forwarded(
                        div, microBlogInfo,
                        content_div=1, stats_div=1, reason_div=2,
                        stats_at_end=True)
                elif firstSpan == 'ctt':
                    # Original weibo with picture.
                    microBlogInfo['contentInfo'] = self._strip_colon(
                        self._joined(
                            div.xpath('div[1]/span[@class="ctt"]/text()').extract()))
                    self._set_interaction(
                        microBlogInfo, div.xpath('div[2]/a/text()').extract())
            elif divNum == 1:
                # Original weibo without picture.
                microBlogInfo['contentInfo'] = self._strip_colon(
                    self._joined(
                        div.xpath('div/span[@class="ctt"]/text()').extract()))
                self._set_interaction(
                    microBlogInfo, div.xpath('div/a/text()').extract())

            if keep:
                yield microBlogInfo

注意:mycookie填写你登录之后的cookie信息就行,具体操作步骤如下:
爬虫网站登录之后,F12打开浏览器网页详细界面,按照下面的点击,然后复制cookie粘贴到爬虫文件里即可
在这里插入图片描述
items.py里这样写:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class microBlogItem(scrapy.Item):
    '''
    A single weibo (micro-blog) record scraped from a keyword search.

    Fields mirror both the post itself and, for forwarded posts, the
    counters of the original post being forwarded.
    '''
    # MongoDB collection name the pipeline writes this item into.
    collection = 'microBlogData'
    # Search keyword that produced this post.
    keyWord = scrapy.Field()
    # Display name of the posting account.
    microBlogId = scrapy.Field()
    # Post text with all whitespace stripped.
    contentInfo = scrapy.Field()
    # Like / comment / forward counts of this post.
    numOfSupport = scrapy.Field()
    numOfComments = scrapy.Field()
    numOfForwarding = scrapy.Field()
    # Text the forwarding user added (forwarded posts only).
    forwardingReason = scrapy.Field()
    # Date (YYYY-MM-DD) the search window covered.
    blogDate = scrapy.Field()
    # Author of the original post (forwarded posts only).
    fromBlogId = scrapy.Field()
    # Like / comment / forward counts of the original post (forwarded only).
    originalBlogSupportNum = scrapy.Field()
    originalBlogCommentsNum = scrapy.Field()
    originalBlogForwardingNum = scrapy.Field()
    # Sequential id assigned by the pipeline on insert (optional).
    id = scrapy.Field()

pipelines.py里这样写:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo

class BlogspiderPipeline(object):
    """Persist scraped items into MongoDB, skipping exact duplicates."""

    def __init__(self):
        '''
        Initialize the MongoDB connection parameters.
        '''
        self.mongoUrl = '127.0.0.1'
        self.mongoPort = 27017
        self.mongoDB = 'SpiderTest'

    def open_spider(self, spider):
        '''
        Connect to the database when the spider opens.
        :param spider:
        :return:
        '''
        self.client = pymongo.MongoClient(self.mongoUrl, self.mongoPort)
        self.db = self.client[self.mongoDB]

    def process_item(self, item, spider):
        '''
        Write the item to the database unless a record with the same
        microBlogId / contentInfo / blogDate already exists.

        BUG FIX: the original body had broken indentation (the ``if`` block
        was indented one level deeper than the ``exist = ...`` line), which
        was a SyntaxError. Also replaced the deprecated ``Cursor.count()`` /
        ``Collection.count()`` (removed in PyMongo 4) with
        ``count_documents``.
        :param item:
        :param spider:
        :return: the item, for downstream pipelines
        '''
        collection = self.db[item.collection]
        query = {
            'microBlogId': item['microBlogId'],
            'contentInfo': item['contentInfo'],
            'blogDate': item['blogDate'],
        }
        if collection.count_documents(query) == 0:
            # NOTE(review): deriving a sequential id from the current count is
            # racy with concurrent writers and leaves gaps after deletes; an
            # ObjectId or a counter document would be safer.
            item['id'] = collection.count_documents({}) + 1
            collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        '''
        Close the database connection when the spider closes.
        :param spider:
        :return:
        '''
        self.client.close()

接下来,执行爬虫就可以啦!

©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页