The individual modules of the Scrapy project:
settings.py
BOT_NAME = 'tieba'
SPIDER_MODULES = ['tieba.spiders']
NEWSPIDER_MODULE = 'tieba.spiders'
RETRY_ENABLED = False
DOWNLOAD_DELAY = 0.5
COOKIES_ENABLED = True
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Referer': 'https://tieba.baidu.com/'
}
# Set up a log file so errors can be checked; note that the file gets
# deleted and rebuilt each time the crawler starts.
LOG_FILE = 'scrapy日志.log'
LOG_LEVEL = 'WARNING'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'tieba.pipelines.TiebaPipeline': 300,
}
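With LOG_LEVEL set to WARNING, anything the spiders themselves log at WARNING or above also ends up in that file. A minimal sketch (not part of the original project) using the standard logging module:

import logging

logger = logging.getLogger(__name__)

# With LOG_FILE and LOG_LEVEL configured as above, this message is
# routed into 'scrapy日志.log' by Scrapy's log handler.
logger.warning('unexpected empty thread list page')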
items.py
import scrapy

class TiebaItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    tid = scrapy.Field()
    pages = scrapy.Field()
    reply_num = scrapy.Field()
    last_reply_author = scrapy.Field()
    last_reply_time = scrapy.Field()
    post_list = scrapy.Field()
    #page_range = scrapy.Field()

class TieziItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    tid = scrapy.Field()
    pages = scrapy.Field()
    reply_num = scrapy.Field()
    post_list = scrapy.Field()
    file_name = scrapy.Field()
    #last_reply_time = scrapy.Field()
    #page_range = scrapy.Field()
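For illustration, here is a rough sketch (not from the original project) of how a spider callback might fill in TiebaItem; the CSS selectors and attribute names are hypothetical:

import scrapy
from tieba.items import TiebaItem

class OneTiebaSpider(scrapy.Spider):
    name = 'one_tieba'

    def parse(self, response):
        # Hypothetical selectors; the project's real parsing code is not shown.
        for row in response.css('li.j_thread_list'):
            item = TiebaItem()
            item['title'] = row.css('a.j_th_tit::text').extract_first()
            item['author'] = row.css('span.tb_icon_author::attr(title)').extract_first()
            item['tid'] = row.xpath('./@data-tid').extract_first()
            yield item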
pipelines.py (the key module)
'''
Workflow:
- open_spider sets up a number of parameters
- Read the user's input from "config": the target tieba and the save path
- Depending on which spider is running, set the page range and tid, then instantiate the Log helper
- When items come back, save them as JSON according to which spider produced them
- At the end, close_spider logs the counts, timing, etc. for this crawl
'''
import codecs
import json
import os

# Open_json, Log_one_TieBa and Log_one_TieZi are the author's own helper
# classes, defined elsewhere in the project.

class TiebaPipeline(object):

    def process_item(self, item, spider):
        '''Save each returned item, handled differently per spider.'''
        # Crawling a whole tieba: every item goes into a JSON file named
        # after its thread's tid.
        if spider.name == 'one_tieba':
            tid = item['tid']
            path = spider.path + os.sep + str(tid) + '.json'
            with codecs.open(path, 'a', encoding='utf-8') as f:
                # dict(item): a scrapy.Item is not directly JSON-serializable
                line = json.dumps(dict(item), ensure_ascii=False) + "\n"
                f.write(line)
        # Crawling a single thread: save to the file_name carried in the
        # item (one file per 100 pages).
        elif spider.name == 'one_tiezi':
            the_tiezi = dict(item)
            path = the_tiezi.pop('file_name')  # file_name is set in the spider
            with codecs.open(path, 'a', encoding='utf-8') as f:
                line = json.dumps(the_tiezi, ensure_ascii=False) + "\n"
                f.write(line)
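    # A side note (not in the original code): since process_item writes one
    # json.dumps(...) per line, each output file is in JSON Lines format.
    # A sketch of reading one back:
    #     with codecs.open(path, encoding='utf-8') as f:
    #         posts = [json.loads(line) for line in f]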
    def open_spider(self, spider):
        '''Read the tieba name, save path, page range and so on from the
        config file and attach them to the spider; also instantiate the
        Log helper that close_spider will use.'''
        print('The spider now running is:', spider.name)
        config_file = Open_json('config')
        config_info = config_file.read()[0]
        spider.kw = config_info['tieba_name']
        spider.dir_path = config_info['save_path']
        if spider.name == 'one_tieba':
            spider.start_kw_page = config_info['the_pages'][0]
            spider.end_kw_page = config_info['the_pages'][1]
            self.Log_one_TieBa = Log_one_TieBa()
        elif spider.name == 'one_tiezi':
            spider.tid = config_info['tid']
            spider.start_tiezi_page = config_info['the_pages'][0]
            spider.end_tiezi_page = config_info['the_pages'][1]
            self.Log_one_TieZi = Log_one_TieZi()
    def close_spider(self, spider):
        '''Log a summary of the crawl when the spider finishes.'''
        # Crawling a whole tieba: record how many threads were actually
        # crawled, how many had been deleted, and so on.
        if spider.name == 'one_tieba':
            tiezi_count = spider.tiezi_count          # threads expected to be crawled
            del_count = spider.del_count              # threads found deleted on their first page
            unchanged_count = spider.unchanged_count  # threads with no changes
            actual_count = tiezi_count - del_count - unchanged_count  # threads actually crawled
            items_count = spider.return_count  # items actually returned after splitting pages into chunks of ten
            print('Deleted threads: %d, unchanged threads: %d' % (del_count, unchanged_count))
            page_range = 'pages %d~%d' % (spider.start_kw_page, spider.end_kw_page)
            self.Log_one_TieBa.log(spider.kw, page_range, tiezi_count, actual_count, items_count)
        # Crawling a single thread: record its info.
        elif spider.name == 'one_tiezi':
            page_range = 'pages %d~%d' % (spider.start_tiezi_page, spider.end_tiezi_page)
            self.Log_one_TieZi.log(spider.kw, spider.tiezi_info, page_range, spider.item_counts, spider.return_items_count)
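Open_json is the author's helper for reading the config file. Its implementation isn't shown here, but judging from the keys open_spider uses, what config_file.read() returns is presumably shaped like this (a sketch with made-up values):

# Inferred from the keys read in open_spider; all values are placeholders.
config = [
    {
        'tieba_name': 'python',        # which tieba to crawl
        'save_path': 'D:/tieba_data',  # directory for the JSON output
        'the_pages': [1, 10],          # [start_page, end_page]
        'tid': 1234567890,             # thread id, used by the one_tiezi spider
    }
]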