Python tkinter GUI: launching Scrapy with multiprocessing to crawl Baidu Tieba replies, show crawl progress, search for repliers, generate word clouds for a chosen time range, and package it all into an exe with PyInstaller (Part 5)

The individual modules of the Scrapy project:

 

settings.py

BOT_NAME = 'tieba'

SPIDER_MODULES = ['tieba.spiders']
NEWSPIDER_MODULE = 'tieba.spiders'
RETRY_ENABLED = False
DOWNLOAD_DELAY = 0.5
COOKIES_ENABLED = True

DEFAULT_REQUEST_HEADERS ={
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Referer':'https://tieba.baidu.com/'
}

#Set up logging so errors can be checked; note the log file is deleted and recreated each time the spider starts
#print('setting up the log')
LOG_FILE='scrapy日志.log'
LOG_LEVEL='WARNING'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'tieba.pipelines.TiebaPipeline': 300,
}
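
These settings are loaded automatically whenever the project's spiders are started. For reference, below is a minimal sketch of how a child process could launch one of the spiders with these settings so the tkinter GUI stays responsive; it uses Scrapy's standard CrawlerProcess and is only an illustration, not the project's actual launcher (which the GUI parts of this series cover).

from multiprocessing import Process

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def run_spider(spider_name):
    '''Run one spider in the current process, picking up settings.py above.'''
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider_name)
    process.start()  # blocks until the crawl finishes


if __name__ == '__main__':
    # 'one_tieba' is one of the spider names used later in pipelines.py
    p = Process(target=run_spider, args=('one_tieba',))
    p.start()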

 

items.py

import scrapy


class TiebaItem(scrapy.Item):

    title=scrapy.Field()
    author = scrapy.Field()
    tid = scrapy.Field()
    pages = scrapy.Field()
    reply_num = scrapy.Field()
    last_reply_author = scrapy.Field()
    last_reply_time = scrapy.Field()
    post_list = scrapy.Field()
    #page_range = scrapy.Field()

class TieziItem(scrapy.Item):

    title=scrapy.Field()
    author = scrapy.Field()
    tid = scrapy.Field()
    pages = scrapy.Field()
    reply_num = scrapy.Field()
    post_list = scrapy.Field()
    file_name = scrapy.Field()
    #last_reply_time = scrapy.Field()
    #page_range = scrapy.Field()

 

pipelines.py (the key part)

'''
Workflow:
》open_spider sets a number of parameters:
    》reads the user's input from "config" and sets the tieba name and the save path
    》depending on which spider is running, sets the page range / tid, then instantiates the Log class

》whenever an item is returned, saves it as JSON according to which spider produced it
》when the crawl ends, close_spider is called to log this run's counts, time, etc.
(sketches of the config file and of the helper classes used here follow after the code)'''

import codecs
import json
import os

# Open_json, Log_one_TieBa and Log_one_TieZi are the project's own helper
# classes, imported from another module of the project (not shown here).

class TiebaPipeline(object):

    def process_item(self, item, spider):
        '''Save each returned item, handled differently depending on the spider'''
        #When crawling a whole tieba, returned items are appended to per-thread JSON files named after their tid
        if spider.name == 'one_tieba':
            tid = item['tid']
            path=spider.path + os.sep + str(tid) + '.json'
            with codecs.open(path, 'a', encoding='utf-8') as f:
                line = json.dumps(item, ensure_ascii=False) + "\n"
                f.write(line)

        #When crawling a single thread, save according to the file_name carried in the item (one file per 100 pages; see the sketch after this class)
        elif spider.name == 'one_tiezi':
            the_tiezi = dict(item)
            path=the_tiezi.pop('file_name')
            with codecs.open(path, 'a', encoding='utf-8') as f:#file_name is set in the spider
                line = json.dumps(the_tiezi, ensure_ascii=False) + "\n"
                f.write(line)

        return item

    def open_spider(self,spider):
        '''Set the tieba name, save path, page range and other data on the spider before crawling;
            instantiate the log class, which is called when the spider closes'''

        print('The spider now running is:', spider.name)
        config_file=Open_json('config')
        config_info=config_file.read()[0]

        spider.kw=config_info['tieba_name']
        spider.dir_path=config_info['save_path']

        if spider.name=='one_tieba':
            spider.start_kw_page = config_info['the_pages'][0]
            spider.end_kw_page = config_info['the_pages'][1]
            self.Log_one_TieBa=Log_one_TieBa()
        elif spider.name == 'one_tiezi':
            spider.tid = config_info['tid']
            spider.start_tiezi_page = config_info['the_pages'][0]
            spider.end_tiezi_page = config_info['the_pages'][1]
            self.Log_one_TieZi=Log_one_TieZi()


    def close_spider(self, spider):
        '''When the crawl finishes, log its processing record'''

        #When crawling a tieba's threads, record how many were actually crawled, how many were deleted, etc.
        if spider.name == 'one_tieba':
            tiezi_count = spider.tiezi_count  # total number of threads planned to crawl
            del_count = spider.del_count  # threads whose first page showed they had been deleted
            unchanged_count = spider.unchanged_count  # threads with no new activity
            actual_count = tiezi_count - del_count - unchanged_count  # threads actually crawled

            items_count = spider.return_count  # total items actually returned after splitting the pages into batches (ten pages each)
            print('Deleted threads: %d, unchanged threads: %d' % (del_count, unchanged_count))

            page_range = 'pages %d~%d' % (spider.start_kw_page, spider.end_kw_page)
            self.Log_one_TieBa.log(spider.kw,page_range,tiezi_count,actual_count,items_count)

        #When crawling a single thread, record its information
        elif spider.name=='one_tiezi':
            page_range = 'pages %d~%d' % (spider.start_tiezi_page, spider.end_tiezi_page)
            self.Log_one_TieZi.log(spider.kw,spider.tiezi_info,page_range,spider.item_counts,spider.return_items_count)
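
open_spider only relies on a handful of keys from the config file. The real Open_json helper lives elsewhere in the project; the sketch below is only an assumption of how it might look, based on the read()[0] usage above, together with an example of what the tkinter side could write into it before launching a spider (the values are made up, the keys match those used in open_spider).

import codecs
import json

class Open_json(object):
    '''Hypothetical stand-in for the project's own Open_json helper.'''
    def __init__(self, name):
        self.path = name + '.json'  # assumed naming convention

    def read(self):
        '''Return a list of dicts, one per JSON line in the file.'''
        with codecs.open(self.path, 'r', encoding='utf-8') as f:
            return [json.loads(line) for line in f if line.strip()]

    def write(self, data):
        '''Overwrite the file with a single JSON line.'''
        with codecs.open(self.path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(data, ensure_ascii=False) + '\n')

# Example of what the GUI might write before starting a crawl:
Open_json('config').write({
    'tieba_name': 'python',
    'save_path': 'D:/tieba_data',
    'the_pages': [1, 10],     # start page / end page
    'tid': 123456789,         # only used by the one_tiezi spider
})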

 
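For the one_tiezi branch, process_item simply pops file_name off the item; the spider decides how pages are grouped into files. Below is one possible way to build that name, assuming the "one file per 100 pages" rule mentioned in the comment (the helper name and the file naming pattern are made up for illustration).

import os

def build_file_name(dir_path, tid, page):
    '''Group pages into blocks of 100: pages 1-100 -> one file, 101-200 -> the next, ...'''
    block = (page - 1) // 100
    start, end = block * 100 + 1, (block + 1) * 100
    return dir_path + os.sep + '%s_p%d-%d.json' % (tid, start, end)

# e.g. build_file_name('D:/tieba_data', 123456789, 137)
#   -> 'D:/tieba_data\\123456789_p101-200.json' on Windows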

 
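Finally, close_spider hands the counts to Log_one_TieBa / Log_one_TieZi, which are also defined elsewhere in the project. Here is a minimal sketch of what such a logger could look like (the log file name and record format are assumptions), just to show the kind of record close_spider leaves behind.

import codecs
import time

class Log_one_TieBa(object):
    '''Hypothetical stand-in for the project's own crawl logger.'''
    def __init__(self, path='one_tieba_log.txt'):  # assumed log file name
        self.path = path

    def log(self, kw, page_range, tiezi_count, actual_count, items_count):
        '''Append one summary line per finished crawl.'''
        line = '%s | %s | %s | planned:%d actual:%d items:%d\n' % (
            time.strftime('%Y-%m-%d %H:%M:%S'), kw, page_range,
            tiezi_count, actual_count, items_count)
        with codecs.open(self.path, 'a', encoding='utf-8') as f:
            f.write(line)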

OK, next comes the most painful part of all: the spiders...
