python tkinter界面多进程启动scrapy爬取百度贴吧的回复，显示爬取进度，并可以搜索回帖人，指定时间生成词云图，用pyinstaller打包成exe(四)

最新推荐文章于 2022-09-03 23:25:20 发布

行者刘6

最新推荐文章于 2022-09-03 23:25:20 发布

阅读量376

点赞数

分类专栏： python 爬取百度贴吧的帖子

本文链接：https://blog.csdn.net/qq_38282706/article/details/99229697

版权

python 爬取百度贴吧的帖子专栏收录该内容

9 篇文章 4 订阅

订阅专栏

接着直接进入主题，scrapy的启动文件begin.py：

'''
本scrapy的运行顺序：
》初始的begin.py ，打开tk界面，输入各个参数，保存在config文件，点击运行
》先进入pipelines.py，调用open_spider，获取config里的数据，设定spider各个参数
》回到spider.py，正式开始运行parse
》程序结束时，再调用pipelines.py的close_spider，log此次爬取日志'''

# Configure the root logger: WARNING and above are appended to a log file,
# one multi-line record per event.
logging.basicConfig(
    # The relative file name is resolved against the current working directory.
    filename='日志.log',
    # 'a' appends, so records from earlier runs are kept.
    filemode='a',
    level=logging.WARNING,
    datefmt='%a, %d %b %Y %H:%M:%S',
    format=(
        'asctime:        %(asctime)s \n'      # timestamp
        'bug_line:       line:%(lineno)d \n'  # line number of the logging call
        'level:          %(levelname)s \n'    # severity name
        'message:        %(message)s \n'      # the logged message
    ),
)


#用以启动爬虫
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from tieba.spiders.one_tieba_spider import One_tiebaSpider
from tieba.spiders.one_tiezi_spider import One_tieziSpider



#继承my_tk，完善run函数
class strat_scrapy(My_tk):
    """Tk front-end that validates the crawl form and launches scrapy.

    Extends ``My_tk`` (defined elsewhere) with the behaviour behind the crawl
    button: validate the inputs, persist them to the ``config`` file, start the
    chosen spider in a child process and open the progress window.
    """

    def __init__(self, tk):
        """Build the inherited tkinter interface on top of *tk*."""
        super(strat_scrapy, self).__init__(tk)
        # Placeholder pid so the attribute exists before any spider is started.
        self.spider_pid = 11111

    def run(self):
        """Handle a click on the crawl button.

        Does nothing further when validation fails (``to_assert`` has already
        shown an error dialog in that case).
        """
        # Validate exactly once: to_assert() also rewrites the config file, so
        # calling it a second time would repeat that write for no benefit.
        the_args = self.to_assert()
        if not the_args:
            return

        print('设定了爬取条件：', the_args)
        # Remove the previous crawl log, if any, before starting a new crawl.
        if os.path.exists('爬虫日志'):
            os.remove('爬虫日志')
        super().run()  # behaviour inherited from My_tk
        # Launch the spider in its own process.
        self.start()

        self.display_text()  # per the original note: pops up the progress window
        self.root.update()   # refresh the Tk window
        self.start_time = time.time()  # timestamp taken right after launch
        # NOTE(review): presumably gives the spider process time to come up
        # before the progress window starts polling -- confirm. This blocks
        # the Tk event loop for the duration of the sleep.
        time.sleep(4)
        self.crawling_window.show_it()  # progress-detail window

    def to_assert(self):
        """Validate the crawl parameters and persist them to the config file.

        Collects, in order: the sorted ``[begin, end]`` page pair, the tieba
        name, the save path and the tid. An invalid value is simply not
        collected, so fewer than four collected values means something is
        wrong.

        Returns:
            ``[the_pages, tieba_name, save_path, tid]`` when all four values
            are valid (they are also written to the ``config`` file);
            ``None`` after showing an error dialog otherwise.
        """
        # Function-scope import keeps the block self-contained. tkinter
        # variables raise TclError from get() when the content cannot be
        # converted (e.g. an empty IntVar field).
        from tkinter import TclError

        the_args = []

        # Page range: absolute values, ordered low -> high. Non-numeric input
        # used to raise out of the Tk callback with no feedback for the user.
        try:
            the_pages = sorted([abs(int(self.beginvar.get())),
                                abs(int(self.endvar.get()))])
        except (ValueError, TclError):
            messagebox.showerror("爬取条件有问题", "请正确输入起始页/结束页！！")
            return None
        the_args.append(the_pages)

        # The tieba name must not be empty.
        tieba_name = self.tiebavar.get()
        if tieba_name != '':
            the_args.append(tieba_name)

        # The save path must already exist. There is no fallback to a default
        # here: a missing path is left out, which fails the check below.
        save_path = self.pathvar.get()
        if os.path.exists(save_path):
            the_args.append(save_path)

        # Single-thread mode (self.Tid truthy): the tid must be a non-zero int
        # with more than 8 digits, e.g. 762788222. In whole-tieba mode the
        # string 'None' is stored instead.
        if self.Tid:
            try:
                tid = self.tidvar.get()  # read once instead of up to four times
            except TclError:
                tid = None
            if isinstance(tid, int) and tid != 0 and len(str(abs(tid))) > 8:
                the_args.append(tid)
        else:
            the_args.append('None')

        # All four values present: save them to the config file.
        if len(the_args) > 3:
            kw = ['the_pages', 'tieba_name', 'save_path', 'tid']
            config = dict(zip(kw, the_args))
            config_file = Open_json('config')
            config_file.rewrite(config)
            return the_args

        messagebox.showerror("爬取条件有问题", "请正确输入贴吧名/保存路径/tid！！")
        return None

    def start(self):
        """Start the selected spider in a separate process."""
        if self.Tid:
            spider = One_tieziSpider
        else:
            spider = One_tiebaSpider
        self.the_scrapy = multiprocessing.Process(target=start_crawl, args=(spider,))
        self.the_scrapy.start()
        # Kept so the progress window can tell whether the crawl is still running.
        self.spider_pid = self.the_scrapy.pid



def start_crawl(spider):
    """Run the given spider class in a scrapy ``CrawlerProcess``.

    Used as the target of the child process, so whichever spider (whole tieba
    or single thread) was selected runs outside the Tk process.
    """
    project_settings = get_project_settings()
    crawler_process = CrawlerProcess(project_settings)
    crawler_process.crawl(spider)
    crawler_process.start()

def begin():
    """Create the Tk root window, attach the crawler UI and run the event loop."""
    window = Tk()
    strat_scrapy(window)
    window.mainloop()

if __name__ == '__main__':
    # Per the original note: a pyinstaller-packaged program that uses
    # multiprocessing needs this call; it is made before the UI is started.
    multiprocessing.freeze_support()
    begin()

所输入的参数，保存到文件config：

工具py文件，里面有多个scrapy运行时调用的函数：

'''存放各种所需工具的py文件
    creat_dir：创建所需文件夹
    Log_one_TieBa：爬取【贴吧】结束时记录
    Log_one_TieZi：爬取【帖子】结束时记录
    Log_Large_TieZi：爬取[贴吧]时，当前帖子页数>100，记录下该帖子
    Open_csv：      csv文件写入
    Open_json：     json文件读取、写入等
    Crawling_item_counts：记录理论要爬取item数目
    Record_Crawl：        记录返回item的信息(标题、发帖人等)
'''

###=========输入初始时间，返回格式化后(时分秒)的初始、结束、耗时=======#
def log_time(start_time):
    """Describe the interval from *start_time* (epoch seconds) until now.

    Returns:
        A ``(start, end, elapsed)`` tuple of strings: the two instants are
        formatted as local ``YYYY-mm-dd HH:MM:SS`` and the elapsed time as
        hours / minutes / seconds.
    """
    stamp_format = "%Y-%m-%d %H:%M:%S"
    finished_at = time.time()

    # Break the elapsed seconds down into hours, minutes and seconds.
    minutes, seconds = divmod(finished_at - start_time, 60)
    hours, minutes = divmod(minutes, 60)
    elapsed_time = '%d时:%02d分:%02d秒' % (hours, minutes, seconds)

    # Render both instants in local time.
    started_text, finished_text = (
        time.strftime(stamp_format, time.localtime(moment))
        for moment in (start_time, finished_at)
    )
    return started_text, finished_text, elapsed_time
###=========输入初始时间，返回格式化后(时分秒)的初始、结束、耗时=======#


###=========需要用上的文件夹，提前创建=======#
def creat_dir():
    """Create the working directories used by the crawl records, if missing.

    The names are relative, so the directories are created under the current
    working directory. Names that already exist are left untouched.
    """
    for name in ('爬取某个贴吧记录', '爬取单个帖子记录', '爬取进度详情'):
        # Create first and tolerate "already exists": unlike a separate
        # exists() check, this leaves no window in which another process can
        # create the directory between the check and the mkdir.
        try:
            os.mkdir(name)
        except FileExistsError:
            pass


# Runs at import time so the directories exist before anything writes to them.
creat_dir()
###=========需要用上的文件夹，提前创建========#


###=========爬取【贴吧】时用的log函数========#
class Log_one_TieBa:
    """Run log for a whole-tieba crawl.

    Instantiated when the spider opens: makes sure the CSV log exists (writing
    the header row if it does not) and records the start time. ``log`` is
    called when the spider closes; it appends one row describing the run to
    the CSV and adds a summary sentence to ``the_spider_counts.json`` for the
    progress window.
    """

    def __init__(self):
        # Fixed location of the per-run record file.
        self.log_path = r'爬取某个贴吧记录/spider_TieBa.csv'
        csv_exists = os.path.isfile(self.log_path)
        if not csv_exists:
            # New file: create it with the header row (written in 'w' mode).
            Open_csv(self.log_path).rewrite(
                ['贴吧名字', '页数范围', '理论爬取帖子数', '实际爬取帖子数',
                 '返回item数量', '开始时间', '结束时间', '总花费时间'])

        self.start_time = time.time()

    def log(self, tieba_name, page_range, tiezi_count, actual_count, items_count):
        """Append this run to the CSV log and publish the closing summary."""
        started, finished, elapsed = log_time(self.start_time)

        row = [tieba_name, page_range, tiezi_count, actual_count, items_count,
               started, finished, elapsed]
        Open_csv(self.log_path).add(row)  # appended in 'a' mode

        # Summary sentence written when the spider finishes.
        summary_values = (tieba_name, page_range, actual_count, elapsed)
        summary = '此次爬取的贴吧是:【%s】，页数:%s，实际爬取帖子数:%s，耗时:%s' % summary_values
        Open_json(r'爬取进度详情/the_spider_counts.json').add(summary)
###=========爬取【贴吧】时用的log函数========#


###=========爬取【帖子】时用的log函数========#
class Log_one_TieZi:
    """Run log for a single-thread (tiezi) crawl.

    Instantiated when the spider opens: makes sure the CSV log exists (writing
    the header row if it does not) and records the start time. ``log`` is
    called when the spider closes; it appends one row describing the run to
    the CSV and adds a summary sentence to ``the_spider_counts.json`` for the
    progress window.
    """

    def __init__(self):
        # Fixed location of the per-run record file.
        self.log_path = r'爬取单个帖子记录/spider_TieZi.csv'
        csv_exists = os.path.isfile(self.log_path)
        if not csv_exists:
            # New file: create it with the header row (written in 'w' mode).
            Open_csv(self.log_path).rewrite(
                ['贴吧名字', '标题', '发帖人', '帖子编号', '页数', '爬取页数范围',
                 '理论item数', '实际item数', '开始时间', '结束时间', '总花费时间'])

        self.start_time = time.time()

    def log(self, kw, tiezi_info, page_range, item_counts, return_items_count):
        """Append this run to the CSV log and publish the closing summary.

        *tiezi_info* is indexed with the keys ``title``, ``author``, ``tid``
        and ``pages``.
        """
        started, finished, elapsed = log_time(self.start_time)

        row = (
            kw,
            tiezi_info['title'],
            tiezi_info['author'],
            tiezi_info['tid'],
            tiezi_info['pages'],
            page_range,
            item_counts,
            return_items_count,
            started,
            finished,
            elapsed,
        )
        Open_csv(self.log_path).add(row)  # appended in 'a' mode

        # Summary sentence written when the spider finishes.
        summary_values = (tiezi_info['tid'], tiezi_info['title'], kw,
                          page_range, return_items_count, elapsed)
        summary = '此次爬取的帖子tid为:【%s】，标题:%s，所在贴吧:%s，页数:%s，item数量:%s，耗时:%s' % summary_values
        Open_json(r'爬取进度详情/the_spider_counts.json').add(summary)
###=========爬取【帖子】时用的log函数========#


###=========爬取【贴吧】发现当前帖子页数>100时调用，记录========#
class Log_Large_TieZi:
    """CSV record of large threads met during a whole-tieba crawl.

    Per the surrounding notes, ``log`` is called when a thread has more than
    100 pages.
    """

    def __init__(self):
        self.log_path = r'爬取某个贴吧记录/Large_TieZi.csv'

    def log(self, kw, the_tiezi):
        """Append one row for *the_tiezi*, creating the CSV with its header first if needed."""
        if not os.path.isfile(self.log_path):
            # First record ever: start the file with the header row.
            Open_csv(self.log_path).rewrite(
                ['贴吧名字', '标题', '发帖人', '帖子编号', '回复数量', '页数',
                 '最后回复时间', '最后回复人'])

        # Column order after the tieba name follows the header above.
        columns = ('title', 'author', 'tid', 'reply_num', 'pages',
                   'last_reply_time', 'last_reply_author')
        row = (kw, *(the_tiezi[column] for column in columns))
        Open_csv(self.log_path).add(row)
###=========爬取【贴吧】发现当前帖子页数>100时调用，记录========#


###=======================自写的csv包，====================#
#方法有: rewrite创建文件重写内容；add添加新的一行
class Open_csv():
    """Minimal CSV writer for one file; rows are written in the ``excel`` dialect.

    ``rewrite`` replaces the file with a single row, ``add`` appends a row.
    """

    def __init__(self, file):
        self.file = file

    def _write_row(self, mode, row):
        # Shared implementation: open in the given mode and emit one row.
        with open(self.file, mode, encoding='utf-8', newline='') as handle:
            csv.writer(handle, dialect="excel").writerow(row)

    def rewrite(self, data):
        """Truncate (or create) the file and write *data* as its only row."""
        self._write_row('w', data)

    def add(self, data):
        """Append *data* as a new row at the end of the file."""
        self._write_row('a', data)
###=================自写的csv包================#

###=======================自写的json包=====================#
'''方法有：
        read 返回文件的所有内容，为list
        rewrite 创建文件重写内容
        add     添加新的一行
        clear 清空所有内容'''
class Open_json():
    """Helper for a UTF-8 file holding one JSON document per line.

    Methods:
        read     -- return every stored document, in file order, as a list
        rewrite  -- replace the file content with a single document
        add      -- append one document as a new line
        clear    -- empty the file
    """

    def __init__(self, file):
        self.file = file

    def _write_line(self, mode, data):
        # newline='\n' writes the terminator untranslated on every platform,
        # matching the bytes codecs.open() used to produce.
        with open(self.file, mode, encoding='utf-8', newline='\n') as f:
            f.write(json.dumps(data, ensure_ascii=False) + "\n")

    def read(self):
        """Return the decoded documents as a list (empty for an empty file).

        Raises:
            json.JSONDecodeError: if a non-blank line is not valid JSON.
        """
        # Records are separated by "\n" only. The codecs stream reader splits
        # on every Unicode line boundary (U+2028, U+2029, U+0085, ...), and
        # json.dumps(ensure_ascii=False) leaves those characters unescaped, so
        # a string containing one was cut in half and failed to decode.
        with open(self.file, 'r', encoding='utf-8', newline='\n') as f:
            # Blank lines carry no document; skip them instead of failing.
            return [json.loads(line) for line in f if line.strip()]

    def rewrite(self, data):
        """Overwrite (or create) the file with *data* as its only document."""
        self._write_line('w', data)

    def add(self, data):
        """Append *data* as a new line at the end of the file."""
        self._write_line('a', data)

    def clear(self):
        """Truncate the file to zero length, creating it if necessary."""
        with open(self.file, 'w'):
            pass
###=======================自写的json包=====================#


###===============记录当前 理论爬取item的数量，用在进度条上================#
class Crawling_item_counts():
    """Running total of the items expected to be crawled, stored in a JSON file.

    The total is seeded with 50 on creation. Per the original author's note,
    the first ``update_items`` call should subtract that 50 again.
    """

    def __init__(self, file_path):
        self.file = Open_json(file_path)
        self.file.rewrite(50)

    def update_items(self, count):
        """Add *count* to the stored total and write the new total back."""
        current_total = self.file.read()[0]
        self.file.rewrite(current_total + count)
###===============记录当前 理论爬取item的数量，用在进度条上================#


###===============记录爬取完毕 返回的item信息，================#
#写入到文件，【进度详情】tree再调用显示
class Record_Crawl():
    """Write one record per handled thread to a JSON-lines file.

    Per the surrounding notes, the progress-detail tree view reads this file
    back for display. The file is emptied when the recorder is created.
    """

    def __init__(self, file_path):
        self.file = Open_json(file_path)
        self.file.clear()

    def tiezi_info(self, the_tiezi, situation=None):
        """Record the outcome for one thread.

        Three outcomes are recorded:
            1. deleted -- either before its first page could be entered, or
               found deleted when the first page was entered again;
            2. already crawled and unchanged;
            3. crawled, with items returned.

        For outcomes 1 and 2 the caller passes a *situation* text. For
        outcome 3 *situation* is left as ``None`` and the text is built from
        the ``page`` of the first and last entries of
        ``the_tiezi['post_list']``.

        Note: *the_tiezi* is modified in place -- ``post_list`` is removed
        (crawled case only) and a ``situation`` key is added.
        """
        # Identity test: None is a singleton, so `is` is the reliable check.
        if situation is None:
            post_list = the_tiezi['post_list']
            start_page = post_list[0]['page']
            end_page = post_list[-1]['page']
            situation = '此次爬取的页数是：% s~ %s' % (start_page, end_page)
            # Deleted / already-crawled records have no post_list; drop it
            # here too so every stored record has the same shape.
            the_tiezi.pop('post_list')

        # Complete the record, then append it to the file.
        the_tiezi.update({'situation': situation})
        self.file.add(the_tiezi)
###===============记录爬取完毕 返回的item信息，================#