selenium + xpath + requests贴吧爬虫

最新推荐文章于 2021-09-17 15:57:28 发布

zjLOVEcyj

最新推荐文章于 2021-09-17 15:57:28 发布

阅读量268

点赞数

分类专栏：爬虫框架文章标签： xpath selenium mongodb python html

本文链接：https://blog.csdn.net/cyj5201314/article/details/105009796

版权

爬虫框架专栏收录该内容

33 篇文章 0 订阅

订阅专栏

代码运行需要selenium与mongodb环境，爬取母校贴吧10页的帖子内容与图片url

import requests
from lxml import etree
from selenium import webdriver
import re
import pymongo


class TieBa(object):
    """Crawl the listing pages of one Baidu Tieba forum and store threads in MongoDB.

    Listing pages are rendered through a Selenium-driven Chrome instance,
    thread pages are fetched with ``requests``, both are parsed with lxml
    XPath expressions, and every parsed thread is inserted as one document
    into the ``tieba_spider.tjgydx`` collection of a local MongoDB server.
    """

    # Prefix used to turn the hrefs found on a listing page into absolute URLs.
    BASE_URL = "https://tieba.baidu.com/"
    # Listing URL of the target forum; ``{}`` receives the ``pn`` offset.
    LIST_URL_TEMPLATE = "https://tieba.baidu.com/f?kw=%E6%99%8B%E4%B8%AD%E5%AD%A6%E9%99%A2&ie=utf-8&pn={}"
    # The ``pn`` offset grows by this amount from one listing page to the next.
    PAGE_STEP = 50
    # Seconds to wait for a thread page before giving up on it.
    REQUEST_TIMEOUT = 10

    def __init__(self, pages=10, driver_path=r"D:/chromedriver_win32/chromedriver.exe"):
        """Prepare the listing URLs and HTTP headers, and start Chrome.

        Args:
            pages: Number of listing pages to crawl (defaults to 10).
            driver_path: Filesystem path of the chromedriver executable.
        """
        self.list_urls = self._build_list_urls(pages)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
            "Referer": "https://tieba.baidu.com/index.html"
        }
        self.driver_path = driver_path
        # NOTE(review): recent Selenium 4 releases are reported to no longer
        # accept ``executable_path`` (a ``Service`` object is expected
        # instead) -- confirm against the installed Selenium version.
        self.driver = webdriver.Chrome(executable_path=self.driver_path)

    @classmethod
    def _build_list_urls(cls, pages):
        """Return the listing-page URLs for the first ``pages`` pages."""
        return [cls.LIST_URL_TEMPLATE.format(i * cls.PAGE_STEP) for i in range(pages)]

    @classmethod
    def _absolute_url(cls, href):
        """Resolve a listing-page href against ``BASE_URL``."""
        # Local import so the block does not depend on a new file-level import.
        from urllib.parse import urljoin

        # urljoin avoids the "//" that plain concatenation produces when the
        # href already starts with a slash.
        return urljoin(cls.BASE_URL, href)

    def get_html(self, url):
        """Load ``url`` in the Selenium browser and return the page source."""
        self.driver.get(url)
        return self.driver.page_source

    def parse_list_page(self, text):
        """Extract the absolute thread URLs from the HTML of a listing page.

        Args:
            text: HTML source of a listing page.

        Returns:
            A list of absolute thread URLs; empty when nothing matches.
        """
        html = etree.HTML(text)
        if html is None:
            # Guard in case the parser yields no root element (e.g. empty input).
            return []
        hrefs = html.xpath("//li[@class=' j_thread_list clearfix']//div[@class='threadlist_title pull_left j_th_tit ']/a[@rel='noreferrer']/@href")
        return [self._absolute_url(href) for href in hrefs]

    def parse_detail_page(self, text):
        """Parse the HTML of a thread page into a storable dict.

        Args:
            text: HTML source of a thread page.

        Returns:
            A dict with the keys "标题" (title), "一楼发言" (text nodes of the
            first floor), "一楼发图" (image URLs of the first floor) and
            "吧友发言与发图" (a list of ``(text, image_urls)`` tuples, one per
            reply floor), or ``None`` when the page contains no title or no
            first floor and therefore cannot be parsed.
        """
        html = etree.HTML(text)
        if html is None:
            # Guard in case the parser yields no root element (e.g. empty input).
            return None

        titles = html.xpath("//h1[@class='core_title_txt  ']/text()")
        first_layers = html.xpath("//div[@class='l_post j_l_post l_post_bright noborder ']")
        if not titles or not first_layers:
            # Indexing [0] here used to raise IndexError and abort the crawl.
            return None

        title = titles[0].replace("@leaf", "")
        first_layer = first_layers[0]
        first_layer_content = first_layer.xpath(".//div[@class='d_post_content j_d_post_content  clearfix']//text()")
        first_layer_imgs_url = first_layer.xpath(".//img[@class='BDE_Image']/@src")

        other_review_data = []
        for o_l in html.xpath("//div[@class='l_post j_l_post l_post_bright  ']"):
            texts = o_l.xpath(".//div[@class='d_post_content j_d_post_content  clearfix']//text()")
            # A reply without any text node has no first node to take.
            other_review = texts[0].strip() if texts else ""
            # NOTE(review): the first floor collects 'BDE_Image' but replies
            # collect 'BDE_Smiley' -- confirm this difference is intended.
            other_img_urls = o_l.xpath(".//img[@class='BDE_Smiley']/@src")
            other_review_data.append((other_review, other_img_urls))

        return {"标题": title, "一楼发言": first_layer_content, "一楼发图": first_layer_imgs_url, "吧友发言与发图": other_review_data}

    def save(self, data):
        """Insert ``data`` as one document into ``tieba_spider.tjgydx``.

        Args:
            data: The dict produced by ``parse_detail_page``.
        """
        myclient = pymongo.MongoClient("mongodb://localhost:27017/")
        try:
            myclient["tieba_spider"]["tjgydx"].insert_one(data)
        finally:
            # Release the connection even when the insert fails.
            myclient.close()

    def close(self):
        """Quit the Selenium browser started by ``__init__``."""
        self.driver.quit()

    def run(self):
        """Crawl every listing page and store each thread that can be parsed.

        Threads whose page cannot be fetched or parsed are reported and
        skipped so one bad thread does not stop the whole crawl.
        """
        for list_index, list_url in enumerate(self.list_urls):
            list_text = self.get_html(list_url)
            detail_urls = self.parse_list_page(list_text)
            for detail_index, detail_url in enumerate(detail_urls):
                try:
                    # Send the prepared headers and bound the wait time.
                    response = requests.get(detail_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                    response.raise_for_status()
                except requests.RequestException as exc:
                    print("第{}页第{}贴:  请求失败, 已跳过 ({})".format(list_index, detail_index, exc))
                    continue
                data = self.parse_detail_page(response.text)
                if data is None:
                    print("第{}页第{}贴:  页面无法解析, 已跳过 ({})".format(list_index, detail_index, detail_url))
                    continue
                self.save(data)
                print("第{}页第{}贴:  {}写入成功...".format(list_index, detail_index, data["标题"]))


# Run the crawl only when executed as a script, so importing this module
# does not launch a browser or start crawling.
if __name__ == "__main__":
    tb = TieBa()
    try:
        tb.run()
    finally:
        # Shut the Selenium browser down even when the crawl fails.
        tb.driver.quit()

爬取结果保存在mongo数据库如下图:
在这里插入图片描述

zjLOVEcyj

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
selenium + xpath + requests贴吧爬虫

代码运行需要selenium与mongodb环境，爬取母校贴吧10页的帖子内容与图片rulimport requestsfrom lxml import etreefrom selenium import webdriverimport reimport pymongoclass TieBa(object): def __init__(self): sel...
复制链接

扫一扫