# --- getCommentInfo.py ---
from bs4 import BeautifulSoup
import requests
from mylog import MyLog as mylog
# 《Python 网络爬虫实战》胡松涛著 P196
class Item:
    """Plain data holder for one Tieba thread entry.

    All fields default to None and are filled in by GetTiebaInfo.spider.
    """

    def __init__(self):
        # Per-instance defaults (the original used shared class attributes,
        # which is un-idiomatic for mutable record-style objects).
        self.title = None        # thread title
        self.firstAuthor = None  # thread creator's display name
        self.firstTime = None    # thread creation time
        self.reNum = None        # reply count
        self.content = None      # one-line abstract of the thread body
        self.lastAuthor = None   # author of the latest reply
        self.lastTime = None     # time of the latest reply
class GetTiebaInfo:
    """Scrape thread summaries from a Baidu Tieba board and dump them to a text file.

    Workflow (all driven from __init__): build page URLs, fetch and parse each
    page into Item records, then write the records to a UTF-8 text file.
    """

    def __init__(self, url):
        # Seed URL; must end with a "pn=<offset>" query field (see getUrls).
        self.url = url
        self.log = mylog()
        self.pageSum = 1  # number of listing pages to crawl (50 threads each)
        self.urls = self.getUrls(self.pageSum)
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    # Example page URLs:
    #   http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0
    #   http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50
    #   http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100
    def getUrls(self, pageSum):
        """Build one URL per page by rewriting the trailing pn= offset (0, 50, ...).

        Assumes the pn offset is the last '='-separated field of self.url.
        Returns the list of page URLs.
        """
        urls = []
        pns = [str(i * 50) for i in range(pageSum)]
        ul = self.url.split("=")
        for pn in pns:
            ul[-1] = pn
            url = "=".join(ul)
            print(url)
            urls.append(url)
        return urls

    def spider(self, urls):
        """Fetch every page URL and extract one Item per thread; return the list."""
        items = []
        for url in urls:
            htmlContent = self.getResponseContent(url)
            if htmlContent is None:
                # Download failed (see getResponseContent); skip this page
                # instead of crashing BeautifulSoup on a None input.
                continue
            soup = BeautifulSoup(htmlContent, 'lxml')
            # NOTE: the leading space in the class value is significant on Tieba pages.
            tagsli = soup.find_all('li', attrs={'class': ' j_thread_list clearfix'})
            for tag in tagsli:
                item = Item()
                item.title = tag.find('a', attrs={'class': 'j_th_tit'}).get_text().strip()
                # The author name sits in a nested <a>, hence the extra .a step.
                item.firstAuthor = tag.find('span', attrs={'class': 'frs-author-name-wrap'}).a.get_text().strip()
                item.firstTime = tag.find('span', attrs={'title': '创建时间'}).get_text().strip()
                item.reNum = tag.find('span', attrs={'title': '回复'}).get_text().strip()
                # NOTE: the trailing space in the class value is significant.
                item.content = tag.find('div',
                                        attrs={'class': 'threadlist_abs threadlist_abs_onlyline '}).get_text().strip()
                item.lastAuthor = tag.find('span', attrs={'class': 'tb_icon_author_rely j_replyer'}).get_text().strip()
                item.lastTime = tag.find('span', attrs={'title': '最后回复时间'}).get_text().strip()
                items.append(item)
                self.log.info('获取标题为《%s》的项成功 ...' % item.title)
        return items

    def pipelines(self, items):
        """Write all scraped items to a UTF-8 text file, one multi-line record each."""
        fileName = '百度贴吧_python.txt'
        with open(fileName, 'w', encoding='utf-8') as fp:
            for item in items:
                fp.write(
                    'title:{} \t author:{} \t firstTime:{} \ncontent:{} \n return:{} \n lastAuthor:{} \t lastTime:{} \n\n\n\n'
                    .format(item.title, item.firstAuthor, item.firstTime, item.content, item.reNum, item.lastAuthor,
                            item.lastTime))

    def getResponseContent(self, url):
        """GET *url* and return the response body as text, or None on failure.

        The original used a bare ``except:`` (which also swallows
        KeyboardInterrupt/SystemExit) and fell through to an implicit None;
        now only network-level requests errors are caught and the None
        return is explicit so spider() can skip the page.
        """
        try:
            response = requests.get(url)
        except requests.RequestException:
            self.log.error('Python 返回 URL:%s 数据失败' % url)
            return None
        self.log.info('Python 返回 URL:%s 数据成功' % url)
        return response.text
if __name__ == '__main__':
    # Crawl the Python board; the trailing pn value is rewritten per page
    # by GetTiebaInfo.getUrls, and __init__ runs the whole pipeline.
    start_url = 'http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50'
    GTI = GetTiebaInfo(start_url)
# --- mylog.py ---
import logging
import getpass
import sys
class MyLog:
    """Thin wrapper around ``logging`` that writes to both a file and the console.

    The log file is named after the running script: ``sys.argv[0]`` with its
    last three characters (assumed to be '.py') replaced by '.log'.
    """

    def __init__(self):
        self.user = getpass.getuser()
        # getLogger returns the SAME logger object for the same name, so
        # handlers must only be attached once (see guard below).
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)
        # NOTE(review): assumes sys.argv[0] ends in '.py' — fragile for
        # frozen executables or interactive sessions.
        self.logFile = sys.argv[0][0:-3] + '.log'
        self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')
        # Fix: the original re-added both handlers on every MyLog()
        # instantiation, duplicating each message and re-opening the file.
        if not self.logger.handlers:
            self.logHand = logging.FileHandler(self.logFile, encoding='utf8')
            self.logHand.setFormatter(self.formatter)
            self.logHand.setLevel(logging.DEBUG)
            self.logHandst = logging.StreamHandler()
            self.logHandst.setFormatter(self.formatter)
            self.logHandst.setLevel(logging.DEBUG)
            self.logger.addHandler(self.logHand)
            self.logger.addHandler(self.logHandst)

    # One convenience method per standard logging level.
    def debug(self, msg):
        self.logger.debug(msg)

    def info(self, msg):
        self.logger.info(msg)

    def warn(self, msg):
        # Method name kept for existing callers; Logger.warn is deprecated,
        # so delegate to the supported Logger.warning.
        self.logger.warning(msg)

    def error(self, msg):
        self.logger.error(msg)

    def critical(self, msg):
        self.logger.critical(msg)
if __name__ == '__main__':
    # Smoke-test every log level, including non-ASCII payloads.
    mylog = MyLog()
    for emit, message in (
        (mylog.debug, u"I'm debug 测试中文"),
        (mylog.info, "I'm info"),
        (mylog.warn, "I'm warm"),
        (mylog.error, u"I'm error 测试中文"),
        (mylog.critical, "I'm critical"),
    ):
        emit(message)