selenium + xpath + requests贴吧爬虫

代码运行需要selenium与mongodb环境,爬取母校贴吧10页的帖子内容与图片url

import requests
from lxml import etree
from selenium import webdriver
import re
import pymongo


class TieBa(object):
    """Crawl the first 10 list pages of the Jinzhong University Baidu Tieba
    forum, extract each thread's title, first-floor text/image URLs and the
    replies, and store one document per thread in MongoDB.

    List pages are rendered with Selenium (they need JS); detail pages are
    fetched with plain ``requests``. Requires a local chromedriver binary
    and a MongoDB server on localhost:27017.
    """

    def __init__(self):
        # Pre-build the 10 list-page URLs: 50 threads per page -> pn = 0, 50, ...
        self.list_urls = [
            "https://tieba.baidu.com/f?kw=%E6%99%8B%E4%B8%AD%E5%AD%A6%E9%99%A2&ie=utf-8&pn={}".format(i * 50)
            for i in range(10)
        ]
        # Browser-like headers; also used for the requests fetches in run()
        # (the original code built these but never passed them to requests).
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
            "Referer": "https://tieba.baidu.com/index.html"
        }
        self.driver_path = r"D:/chromedriver_win32/chromedriver.exe"
        # NOTE(review): executable_path= is deprecated in Selenium 4; switch to
        # webdriver.Chrome(service=Service(self.driver_path)) when upgrading.
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        # One shared Mongo client for the whole crawl (pymongo connects
        # lazily) instead of opening/closing a connection per document.
        self.mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")

    def get_html(self, url):
        """Load *url* in the Selenium-driven browser and return the rendered
        page source (list pages need JS to populate the thread list)."""
        self.driver.get(url)
        return self.driver.page_source

    def parse_list_page(self, text):
        """Extract the thread detail-page URLs from one list page's HTML.

        The hrefs in the page already start with "/p/...", so the base URL
        must not end with a slash (the original produced "…com//p/…").
        """
        html = etree.HTML(text)
        hrefs = html.xpath("//li[@class=' j_thread_list clearfix']//div[@class='threadlist_title pull_left j_th_tit ']/a[@rel='noreferrer']/@href")
        return ["https://tieba.baidu.com" + href for href in hrefs]

    def parse_detail_page(self, text):
        """Parse one thread page into a dict with the title, the first
        floor's text fragments and image URLs, and a list of
        (reply_text, reply_smiley_urls) tuples for the other floors.

        All ``[0]`` lookups are guarded so a thread with a missing title or
        first floor yields empty fields instead of raising IndexError.
        """
        html = etree.HTML(text)

        titles = html.xpath("//h1[@class='core_title_txt  ']/text()")
        # Strip Tieba's "@leaf" artifact from the title, as before.
        title = re.sub(r"@leaf", "", titles[0]) if titles else ""

        first_layers = html.xpath("//div[@class='l_post j_l_post l_post_bright noborder ']")
        if first_layers:
            first_layer = first_layers[0]
            first_layer_content = first_layer.xpath(".//div[@class='d_post_content j_d_post_content  clearfix']//text()")
            first_layer_imgs_url = first_layer.xpath(".//img[@class='BDE_Image']/@src")
        else:
            first_layer_content = []
            first_layer_imgs_url = []

        other_review_data = []
        for o_l in html.xpath("//div[@class='l_post j_l_post l_post_bright  ']"):
            texts = o_l.xpath(".//div[@class='d_post_content j_d_post_content  clearfix']//text()")
            other_review = texts[0].strip() if texts else ""
            other_img_urls = o_l.xpath(".//img[@class='BDE_Smiley']/@src")
            other_review_data.append((other_review, other_img_urls))

        return {"标题": title, "一楼发言": first_layer_content, "一楼发图": first_layer_imgs_url, "吧友发言与发图": other_review_data}

    def save(self, data):
        """Insert one thread document into tieba_spider.tjgydx, reusing the
        client created in __init__."""
        mydb = self.mongo_client["tieba_spider"]
        mydb["tjgydx"].insert_one(data)

    def run(self):
        """Crawl every list page, then every thread on it, saving each
        parsed thread to MongoDB. Always releases the browser and the DB
        connection, even if the crawl fails partway."""
        try:
            for list_index, list_url in enumerate(self.list_urls):
                list_text = self.get_html(list_url)
                detail_urls = self.parse_list_page(list_text)
                for detail_index, detail_url in enumerate(detail_urls):
                    # Detail pages render without JS, so a plain HTTP GET
                    # (now with headers and a timeout) is sufficient.
                    response = requests.get(detail_url, headers=self.headers, timeout=10)
                    data = self.parse_detail_page(response.text)
                    self.save(data)
                    print("第{}页第{}贴:  {}写入成功...".format(list_index, detail_index, data["标题"]))
        finally:
            self.driver.quit()
            self.mongo_client.close()


    def run(self):
        for list_index, list_url in enumerate(self.list_urls):
            list_text = self.get_html(list_url)
            detail_urls = self.parse_list_page(list_text)
            for detail_index, detail_url in enumerate(detail_urls):
                detail_text = requests.get(detail_url).text
                data = self.parse_detail_page(detail_text)
                self.save(data)
                print("第{}页第{}贴:  {}写入成功...".format(list_index, detail_index, data["标题"]))


# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    tb = TieBa()
    tb.run()

爬取结果保存在mongo数据库如下图:
在这里插入图片描述

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值