import requests
from lxml import etree
import re
import json
import os
import time
class TieBaSpider():
    """Scrape post titles, contents and images from the 天津工业大学 board on Baidu Tieba.

    Post details are appended to ./data/tieba.json (one JSON object per line);
    post images are downloaded into ./data/tieba_imgs/.
    """

    def __init__(self):
        # Desktop Chrome UA so Tieba serves the full desktop markup.
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }

    def get_html_str(self, url):
        """GET *url* and return the decoded HTML string, or None on a non-200 reply."""
        # Bug fix: timeout added so a stalled connection cannot hang the crawl forever.
        response = requests.get(url, headers=self.headers, timeout=10)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None

    def parse_list_page(self, html_str):
        """Extract {"title", "url"} dicts from one board list page.

        Each post <li> is cut out with a regex first, then parsed
        individually with lxml.
        """
        posts_li = re.findall(r'<li class=" j_thread_list clearfix"(.*?)</li>', html_str, re.S)
        # Re-wrap each fragment so lxml sees a complete <li> element.
        posts_li = map(lambda x: '<li class=" j_thread_list clearfix"' + x + "</li>", posts_li)
        posts_data = []
        for li in posts_li:
            li = etree.HTML(li)
            titles = li.xpath(".//a[@class='j_th_tit ']/text()")
            hrefs = li.xpath(".//a[@class='j_th_tit ']/@href")
            # Bug fix: the original indexed [0] unconditionally and raised
            # IndexError on a malformed entry; skip such entries instead.
            if not titles or not hrefs:
                continue
            posts_data.append({
                "title": titles[0],
                "url": "https://tieba.baidu.com" + hrefs[0],
            })
        return posts_data

    def parse_detail_page(self, html_str):
        """Return (text fragments, image src list) from one post detail page."""
        html = etree.HTML(html_str)
        post_contents = html.xpath("//div[@class='d_post_content j_d_post_content clearfix']/text()")
        post_contents = [text.strip() for text in post_contents]
        post_imgs_src = html.xpath("//div[@class='d_post_content j_d_post_content clearfix']/img/@src")
        return post_contents, post_imgs_src

    def save_post_detail(self, item):
        """Append *item* as one JSON line to ./data/tieba.json."""
        # Bug fix: create the output directory on first use.
        os.makedirs("./data", exist_ok=True)
        # Bug fix: with-block replaces the bare open/close pair so the handle
        # is released even if json.dump raises.
        with open("./data/tieba.json", "a", encoding="utf-8") as fp:
            json.dump(item, fp, ensure_ascii=False)
            fp.write("\n")
        print(item["title"] + "\t详情写入成功...")

    def save_post_img(self, src_list, title):
        """Download every image in *src_list*, naming files after *title*."""
        if not src_list:
            print("当前帖子无图片内容...")
            return None
        # Bug fix: create the image directory on first use.
        os.makedirs("./data/tieba_imgs", exist_ok=True)
        # Strip characters that are illegal in (Windows) file names.
        title = re.sub(r'[\\|/|:|*|?|<|>\|\n]', "", title)
        for index, src in enumerate(src_list):
            # Keep at most 4 chars of the extension (e.g. ".jpg") — some srcs
            # carry extra text after the real extension.
            end_name = os.path.splitext(src)[1][:4]
            file_name = title + str(index) + end_name
            with open("./data/tieba_imgs/" + file_name, "wb") as fp:
                fp.write(requests.get(src, timeout=10).content)
            time.sleep(1)  # be polite to the image servers
            print(file_name + "写入成功...")

    def run(self):
        """Crawl 20 list pages (Tieba paginates by a 50-post offset) and persist every post."""
        for i in range(20):
            pn = i * 50
            list_url = "https://tieba.baidu.com/f?kw=天津工业大学&ie=utf-8&pn={}".format(pn)
            html_str = self.get_html_str(list_url)
            print("当前爬取第{}页...".format(i + 1))
            # Bug fix: the original passed None into the parser when the
            # request failed (non-200) and crashed; skip the page instead.
            if html_str is None:
                continue
            for post in self.parse_list_page(html_str):
                detail_html = self.get_html_str(post["url"])
                if detail_html is None:
                    continue
                post_contents, post_imgs_src = self.parse_detail_page(detail_html)
                item = {
                    "title": post["title"],
                    "url": post["url"],
                    "content": post_contents,
                    "imgs_src": post_imgs_src,
                }
                self.save_post_detail(item)
                self.save_post_img(item["imgs_src"], item["title"])
if __name__ == '__main__':
    # Script entry point: build the spider and start the full crawl.
    TieBaSpider().run()
# 爬取详情结果如下: (crawl detail results shown below)
# 爬取图片结果如下: (crawled image results shown below)