# -*- coding: utf-8 -*-
'''
@author Mrpora
'''
import json
import os

from ipindex_spider_py.utils.MongoHandler import MongoHandler
from ipindex_spider_py.items.Ip.baseCaAuthorItem import BaseCaAuthorItem
from ipindex_spider_py.items.Ip.baseCaCommentItem import BaseCaCommentItem
from ipindex_spider_py.items.Ip.baseCaDataItem import BaseCaDataItem
from ipindex_spider_py.items.Ip.baseCaTopItem import BaseCaTopItem
from ipindex_spider_py.utils import dateUtil

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class FilePipeline(object):
    """Writes each item type to its own month-stamped JSON-lines file."""

    def __init__(self):
        # table: fileName (one output file per item type, stamped with YYYYMM)
        fileNames = {
            'caAuthor': 'data/ip_top2/caAuthor' + str(dateUtil.getStrFormatTime('%Y%m')) + '.json',
            'caComment': 'data/ip_top2/caComment' + str(dateUtil.getStrFormatTime('%Y%m')) + '.json',
            'caData': 'data/ip_top2/caData' + str(dateUtil.getStrFormatTime('%Y%m')) + '.json',
            'caTop': 'data/ip_top2/caTop' + str(dateUtil.getStrFormatTime('%Y%m')) + '.json',
        }
        # Make sure the output directory exists before opening the files.
        for fileName in fileNames:
            parent_path = os.path.abspath(os.path.dirname(fileNames[fileName]))
            if not os.path.exists(parent_path):
                os.makedirs(parent_path)
        # Open in text mode with an explicit encoding instead of the old
        # reload(sys)/sys.setdefaultencoding('utf8') hack, which only worked
        # on Python 2.
        self.file_caAuthor = open(fileNames['caAuthor'], 'w', encoding='utf-8')
        self.file_caComment = open(fileNames['caComment'], 'w', encoding='utf-8')
        self.file_caData = open(fileNames['caData'], 'w', encoding='utf-8')
        self.file_caTop = open(fileNames['caTop'], 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Serialize the item as one JSON object per line and route it to the
        # file that matches its type.
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        if isinstance(item, BaseCaAuthorItem):
            self.file_caAuthor.write(line)
        elif isinstance(item, BaseCaCommentItem):
            self.file_caComment.write(line)
        elif isinstance(item, BaseCaDataItem):
            self.file_caData.write(line)
        elif isinstance(item, BaseCaTopItem):
            self.file_caTop.write(line)
        return item

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        self.file_caAuthor.close()
        self.file_caComment.close()
        self.file_caData.close()
        self.file_caTop.close()


class MongoDBPipeline(object):
    """Saves each item type to its own MongoDB collection."""

    def __init__(self, host, port, user, password, db):
        self.mongo_host = host
        self.mongo_port = port
        self.mongo_user = user
        self.mongo_password = password
        self.mongo_db = db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection details once instead of re-fetching the
        # settings dict for every field.
        conf = crawler.settings.get('MONGODB_CONFIGURATION')
        return cls(
            host=conf['mongo_host'],
            port=conf['mongo_port'],
            user=conf['mongo_user'],
            password=conf['mongo_password'],
            db=conf['mongo_db']
        )

    def open_spider(self, spider):
        self.mongoHandler = MongoHandler(host=self.mongo_host, port=self.mongo_port,
                                         db=self.mongo_db, user=self.mongo_user,
                                         password=self.mongo_password)

    def close_spider(self, spider):
        self.mongoHandler.close()

    def process_item(self, item, spider):
        if isinstance(item, BaseCaTopItem):
            self.mongoHandler.save_item('tmp_comicTop', dict(item))
        elif isinstance(item, BaseCaAuthorItem):
            self.mongoHandler.save_item('tmp_comicAuthor', dict(item))
        elif isinstance(item, BaseCaDataItem):
            self.mongoHandler.save_item('tmp_comicData', dict(item))
        elif isinstance(item, BaseCaCommentItem):
            self.mongoHandler.save_item('tmp_comicComment', dict(item))
        return item
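

# The pipelines above rely on ipindex_spider_py.utils.MongoHandler, whose
# source is not part of this file. Below is a minimal sketch of a compatible
# handler built on pymongo, matching only the constructor and the
# save_item()/close() call sites used above; the class name and internals are
# assumptions, not the project's actual implementation.

import pymongo


class MongoHandlerSketch(object):
    def __init__(self, host, port, db, user=None, password=None):
        # Credentials are optional; pymongo skips authentication when
        # username/password are left as None.
        self.client = pymongo.MongoClient(host=host, port=int(port),
                                          username=user, password=password)
        self.db = self.client[db]

    def save_item(self, table, item_dict):
        # One MongoDB document per scraped item, one collection per table.
        self.db[table].insert_one(item_dict)

    def close(self):
        self.client.close()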
# -*- coding: utf-8 -*-
import os

if __name__ == '__main__':
    # os.system("scrapy crawl acqqTopSpider")
    # os.system("scrapy crawl youyaoqiTopSpider")
    # os.system("scrapy crawl youyaoqiDataSpider")
    # os.system("scrapy crawl youyaoqiAuthorSpider")
    os.system("scrapy crawl youyaoqiCommentSpider")
    # os.system("scrapy crawl dajiaochongTopSpider")
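
# The launcher above shells out to the scrapy CLI. An equivalent in-process
# alternative uses Scrapy's public CrawlerProcess API; a minimal sketch,
# assuming it is run from the project root so get_project_settings() can
# locate scrapy.cfg:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('youyaoqiCommentSpider')
    process.start()  # blocks here until the crawl is finished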
# -*- coding: utf-8 -*-
import datetime
import re

import scrapy

from ipindex_spider_py.items.Ip.youyaoqi.youyaoqiCommentItem import YouyaoqiCommentItem
from ipindex_spider_py.utils import dateUtil, commonUtil


class YouyaoqiCommentSpider(scrapy.Spider):
    name = "youyaoqiCommentSpider"

    # Rank pages for both groups, first two pages each.
    start_urls = [
        'http://comic.u17.com/rank/t2.html?group=' + str(i) + '&page=' + str(j)
        for i in (1, 2) for j in (1, 2)
    ]

    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
            'Referer': 'http://comic.u17.com/'
        },
        'ITEM_PIPELINES': {'ipindex_spider_py.pipelines.FilePipeline': 300},
        'FILE_NAME': 'data/ip_top2/youyaoqiTop' + str(dateUtil.getStrFormatTime('%Y%m')) + '.json'
    }

    def parse(self, response):
        # One <li> per comic on the rank page; only the top 30 are kept.
        papers = response.xpath("//div[@class='rank_more_box']/ul/li")
        for paper in papers:
            rank = paper.xpath(".//i/text()").extract()[0]
            if int(rank) > 30:
                continue
            item = YouyaoqiCommentItem()
            item['platform'] = "youyaoqi"
            item['spiderTime'] = str(datetime.datetime.now())
            link = paper.xpath(".//div[1]/div[@class='categray']/a/@href").extract()[0]
            item['oid'] = commonUtil.MD5Util(link)
            authorlink = paper.xpath(".//div[2]/a/@href").extract()[0]
            item['aid'] = commonUtil.MD5Util(authorlink)
            item['dataVersion'] = 'V' + str(datetime.datetime.now().strftime("%Y%m"))
            yield scrapy.Request(link, callback=self.parse_authorInfo, meta={'item': item})

    def parse_authorInfo(self, response):
        # Pull the comic and comment-thread ids out of the detail page, then
        # request the first page of the comment AJAX endpoint.
        item = response.meta['item']
        comic = re.findall(r"comic_id=(\d*)", response.text, re.I)
        thread = re.findall(r"thread_id:(\d*)", response.text, re.I)
        url = ("http://www.u17.com/comment/ajax.php?mod=thread&act=get_comment_php_v4"
               "&_=1525857397287&sort=create_time&thread_id=" + ''.join(thread) +
               "&object_type=custom&object_id=0&page=1&page_size=20&face=small"
               "&comic_id=" + ''.join(comic))
        yield scrapy.Request(url, callback=self.parse_nextauthorInfo, meta={'item': item})

    def parse_nextauthorInfo(self, response):
        papers = response.xpath("//body/div")
        flag = True
        for paper in papers:
            # Copy the shared base item so each yielded comment carries its
            # own datetime/content instead of mutating one shared object.
            item = response.meta['item'].copy()
            d1 = datetime.datetime.now() - datetime.timedelta(hours=48)
            dt = paper.xpath(".//dt/i/@title").extract()[0]
            dd = datetime.datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
            if d1 <= dd:
                # Comment is newer than 48 hours: keep it.
                flag = True
                item['datetime'] = dt
                item['content'] = paper.xpath(".//div[@class='ncc_content_right_text']/text()").extract()[0]
                yield item
            else:
                flag = False
        if flag:
            # The last comment on this page was still recent, so fetch the
            # next page (until total_page is reached).
            i = int(''.join(re.findall(r"page=(\d*)", response.url, re.I))) + 1
            url = re.sub(r"page=(\d*)", "page=" + str(i), response.url)
            last = int(''.join(re.findall(r'"total_page":(\d*)', response.text, re.I)))
            if i <= last:
                yield scrapy.Request(url, callback=self.parse_nextauthorInfo,
                                     meta={'item': response.meta['item']})
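

# parse_nextauthorInfo above advances through the comment AJAX endpoint by
# rewriting the page=N query parameter with a regex. A standalone sketch of
# that rewrite, using an illustrative URL (the thread/comic ids below are
# placeholders, not real values):

import re


def next_page_url(url):
    # Read the current page number, increment it, and substitute it back.
    page = int(''.join(re.findall(r"page=(\d+)", url))) + 1
    return re.sub(r"page=\d+", "page=" + str(page), url)


if __name__ == '__main__':
    sample = ("http://www.u17.com/comment/ajax.php?mod=thread"
              "&act=get_comment_php_v4&sort=create_time&thread_id=111"
              "&object_type=custom&object_id=0&page=1&page_size=20"
              "&face=small&comic_id=222")
    # page=1 becomes page=2; page_size is untouched because the pattern
    # requires a literal "page=" immediately followed by digits.
    print(next_page_url(sample))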