python爬取贴吧网页信息

最新推荐文章于 2023-11-21 21:38:15 发布

·慕晴·

最新推荐文章于 2023-11-21 21:38:15 发布

阅读量1k

点赞数

分类专栏：Python爬虫 | 文章标签：python

本文链接：https://blog.csdn.net/chao_qing/article/details/78016097

版权

Python爬虫专栏收录该内容

5 篇文章 0 订阅

订阅专栏

第一步，分析网页，代码如下：

'''
author：superWang
date：2017-09-15
python：3.5
requests模块：2.18.4
bs4模块：4.6.0
json模块：2.0.9
time模块：无
爬取贴吧网页http://c.tieba.baidu.com/p/4994831746?pn=1 中的信息 ，url中pn=1表示第一页
'''

#!/usr/bin/env python
#-*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup
import json
import time

# Exploratory script: fetch page 1 of the thread and print the fields of a
# single floor, then the first reply attached to that floor.
url = 'http://c.tieba.baidu.com/p/4994831746?pn=1'
res = requests.get(url)
#print(res.text)
soup = BeautifulSoup(res.text,'html5lib')
# One element per floor (post) on the page.
l_posts = soup.select("#j_p_postlist .l_post")
#print(len(l_posts))
# l_posts[1] (the second floor) is used instead of l_posts[0] because, per the
# author, the second floor has reply comments, which makes it a better sample.
print(l_posts[1].select(".d_author .d_name .p_author_name")[0].text)  # author name
#print(l_posts[1].select(".d_author .l_badge .user_badge")[0]['title'])  # level (from the title attribute)
print(l_posts[1].select(".d_author .l_badge .user_badge")[0].text)  # level
print(l_posts[1].select(".d_post_content_main .p_content cc")[0].text)  # post content
print(l_posts[1].select(".d_post_content_main .core_reply .post-tail-wrap .tail-info")[1].text)  # floor number
print(l_posts[1].select(".d_post_content_main .core_reply .post-tail-wrap .tail-info")[2].text)  # post time


# Fetch the reply comments that belong to one floor.
#print(l_posts[1]["data-field"])  # raw JSON carrying the ids needed for the reply query
jd = json.loads(l_posts[1]["data-field"])  # decode the attribute's JSON text into Python objects
#print(type(jd))
print(jd)
tid = jd["content"]["thread_id"]  # thread id, query parameter of the reply endpoint
fid = jd["content"]["forum_id"]  # forum id, query parameter of the reply endpoint
post_id = jd["content"]["post_id"]  # used below as the key into comment_list
url2 = "http://c.tieba.baidu.com/p/totalComment?pn=1&see_lz=0&tid="+str(tid)+"&fid="+str(fid)
#print(url2)
res2 = requests.get(url2)
print(res2.text)  # all replies on this page, as raw JSON text
jd2 = json.loads(res2.text)  # decode the response JSON into Python objects
print(jd2["data"]["comment_list"][str(post_id)]["comment_info"])  # list of replies for this floor
#print(type(jd2["data"]["comment_list"][str(post_id)]["comment_info"]))
comment_info = jd2["data"]["comment_list"][str(post_id)]["comment_info"]
print(comment_info[0]["now_time"])
now_time = comment_info[0]["now_time"]  # timestamp of the first reply
username = comment_info[0]["username"]  # name of the replier
content = comment_info[0]["content"]  # reply text

time_now=time.localtime(now_time)  # convert the timestamp to a local struct_time
print (time.strftime("%Y-%m-%d %H:%M:%S", time_now))  # render it as a readable date string

# Detect whether a floor was posted by the thread starter: the number of
# ".louzhubiaoshi_wrap" elements under ".d_author" is printed for two floors.
print(len(l_posts[1].select(".d_author .louzhubiaoshi_wrap")))  # per the author: 1 means thread starter
print(len(l_posts[4].select(".d_author .louzhubiaoshi_wrap")))  # per the author: 0 means not the thread starter

第二步，整理分析的代码如下：

'''
author：superWang
date：2017-09-15
requests模块：2.18.4
bs4模块：4.6.0
json模块：2.0.9
time模块：无
爬取贴吧网页http://c.tieba.baidu.com/p/4994831746?pn=1 中的信息 ，url中pn=1表示第一页
'''

#!/usr/bin/env python
#-*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup
import json
import time

class GetTieBaInfo():
    """Scrape one Baidu Tieba thread and print its floors with their replies.

    All results are written to stdout; the public methods return ``None``
    except :meth:`getBackWithPage`, which returns the decoded reply data.
    """

    # Page URL of the scraped thread; the page number is appended to it.
    THREAD_URL = 'http://c.tieba.baidu.com/p/4994831746?pn='
    # Seconds to wait for the server, so a stalled connection cannot hang forever.
    REQUEST_TIMEOUT = 10

    def getInfoWithPage(self, page):
        """Print every floor on ``page`` together with its replies.

        Args:
            page: Page number of the thread (1 is the first page).
        """
        self._print_page(page, only_author=False)

    def getAuthorInfoWithPage(self, page):
        """Print only the thread starter's floors on ``page`` with their replies.

        Args:
            page: Page number of the thread (1 is the first page).
        """
        self._print_page(page, only_author=True)

    def getBackWithPage(self, page, tid, fid):
        """Fetch the reply data of every floor on one page.

        Args:
            page: Page number of the thread.
            tid: Thread id, taken from a floor's ``data-field`` attribute.
            fid: Forum id, taken from a floor's ``data-field`` attribute.

        Returns:
            The value stored under ``data -> comment_list`` in the response,
            returned unchanged.
        """
        url = "http://c.tieba.baidu.com/p/totalComment?pn=" + str(page) + "&see_lz=0&tid=" + str(tid) + "&fid=" + str(fid)
        res2 = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        jd2 = json.loads(res2.text)
        return jd2["data"]["comment_list"]

    def _print_page(self, page, only_author):
        """Download ``page`` and print its floors, optionally filtered to the thread starter."""
        res = requests.get(self.THREAD_URL + str(page), timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(res.text, 'html5lib')
        l_posts = soup.select("#j_p_postlist .l_post")
        if not l_posts:
            # No floors were found (e.g. page past the end); there is nothing
            # to print and no floor to read the thread/forum ids from.
            return

        # The thread and forum ids needed by the reply endpoint are read from
        # the first floor's data-field JSON.
        jd = json.loads(l_posts[0]["data-field"])
        tid = jd["content"]["thread_id"]
        fid = jd["content"]["forum_id"]
        comment_list = self.getBackWithPage(page, tid, fid)

        for l_post in l_posts:
            # A floor counts as the thread starter's when exactly one
            # ".louzhubiaoshi_wrap" marker sits under its author block.
            if only_author and len(l_post.select(".d_author .louzhubiaoshi_wrap")) != 1:
                continue
            self._print_post(l_post, comment_list)

    def _print_post(self, l_post, comment_list):
        """Print one floor's summary line followed by its replies, if any."""
        p_author_name = l_post.select(".d_author .d_name .p_author_name")[0].text
        user_badge = l_post.select(".d_author .l_badge .user_badge")[0].text
        p_content = l_post.select(".d_post_content_main .p_content cc")[0].text.strip()
        # Floor number and post time are the 2nd and 3rd ".tail-info" entries.
        tail_infos = l_post.select(".d_post_content_main .core_reply .post-tail-wrap .tail-info")
        tail_info = tail_infos[1].text
        tail_time = tail_infos[2].text
        print("名字:" + p_author_name + "\t等级:" + user_badge + "\t内容:" + p_content + "\t楼数:" + tail_info + "\t时间:" + tail_time)

        jd = json.loads(l_post["data-field"])
        post_id = jd["content"]["post_id"]
        comment_infos = self._comments_for_post(comment_list, post_id)
        if comment_infos is None:
            return

        print("\t此楼层回复：")
        for comment_info in comment_infos:
            print(self._format_reply(comment_info))

    @staticmethod
    def _comments_for_post(comment_list, post_id):
        """Return the reply list for ``post_id``, or ``None`` when the floor has none.

        ``comment_list`` is looked up by ``str(post_id)``. A non-dict value is
        treated as "no replies": indexing a list with a string key would raise
        TypeError rather than KeyError.
        NOTE(review): the endpoint reportedly sends an empty list when a page
        has no replies at all - confirm against a live response.
        """
        if not isinstance(comment_list, dict):
            return None
        floor = comment_list.get(str(post_id))
        if not isinstance(floor, dict):
            return None
        return floor.get("comment_info")

    @staticmethod
    def _format_reply(comment_info):
        """Build the printed line for one reply.

        Raises:
            KeyError: if the reply lacks ``now_time``, ``username`` or ``content``.
        """
        now_time = comment_info["now_time"]
        username = comment_info["username"]
        content = comment_info["content"]
        # Render the reply timestamp in the local time zone.
        back_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(now_time))
        return "\t\t姓名:" + username + "\t\t信息:" + content + "\t\t时间:" + back_time

if __name__ == "__main__":
    # Script entry point: print the thread starter's floors from page 1.
    scraper = GetTieBaInfo()
    scraper.getAuthorInfoWithPage(1)