代码运行需要 selenium 与 MongoDB 环境,用于爬取母校贴吧前10页的帖子内容与图片URL
import requests
from lxml import etree
from selenium import webdriver
import re
import pymongo
class TieBa(object):
    """Crawl threads of one Baidu Tieba forum and store them in MongoDB.

    List pages are loaded through a Selenium-driven Chrome instance, thread
    detail pages are fetched with ``requests``, and every parsed thread is
    inserted as one document into the ``tjgydx`` collection of the
    ``tieba_spider`` database on a local MongoDB server.
    """

    # Prefix that thread links taken from a list page are joined onto.
    BASE_URL = "https://tieba.baidu.com/"
    # List-page URL template; ``pn`` is filled with page_index * THREADS_PER_PAGE.
    LIST_URL_TEMPLATE = "https://tieba.baidu.com/f?kw=%E6%99%8B%E4%B8%AD%E5%AD%A6%E9%99%A2&ie=utf-8&pn={}"
    # Step of the ``pn`` query parameter between consecutive list pages.
    THREADS_PER_PAGE = 50
    # Seconds to wait for a detail page before skipping it.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Prepare the ten list-page URLs, the HTTP headers and the Chrome driver."""
        self.list_urls = self._build_list_urls(10)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
            "Referer": "https://tieba.baidu.com/index.html"
        }
        self.driver_path = r"D:/chromedriver_win32/chromedriver.exe"
        # NOTE(review): ``executable_path`` is the Selenium 3 calling style;
        # newer Selenium releases expect a ``Service`` object instead --
        # confirm against the installed Selenium version.
        self.driver = webdriver.Chrome(executable_path=self.driver_path)

    @classmethod
    def _build_list_urls(cls, page_count):
        """Return the list-page URLs for the first ``page_count`` pages."""
        return [
            cls.LIST_URL_TEMPLATE.format(page_index * cls.THREADS_PER_PAGE)
            for page_index in range(page_count)
        ]

    @classmethod
    def _to_absolute_url(cls, href):
        """Turn a thread ``href`` into an absolute URL without a doubled slash."""
        if href.startswith(("http://", "https://")):
            return href
        # Drop leading slashes so "/p/1" becomes ".../p/1", not "...//p/1".
        return cls.BASE_URL + href.lstrip("/")

    @staticmethod
    def _first_text(texts):
        """Return the first string of ``texts`` stripped, or "" when it is empty."""
        return texts[0].strip() if texts else ""

    def close(self):
        """Quit the Chrome driver so the browser process does not linger."""
        self.driver.quit()

    def get_html(self, url):
        """Load ``url`` in the browser and return the resulting page source."""
        self.driver.get(url)
        return self.driver.page_source

    def parse_list_page(self, text):
        """Extract absolute thread URLs from the HTML of one list page.

        Returns an empty list when the page cannot be parsed or no thread
        link matches.
        """
        html = etree.HTML(text)
        # etree.HTML can hand back None for an empty document.
        if html is None:
            return []
        hrefs = html.xpath("//li[@class=' j_thread_list clearfix']//div[@class='threadlist_title pull_left j_th_tit ']/a[@rel='noreferrer']/@href")
        return [self._to_absolute_url(href) for href in hrefs]

    def parse_detail_page(self, text):
        """Parse one thread page into a dict ready for storage.

        Returns a dict with the keys "标题", "一楼发言", "一楼发图" and
        "吧友发言与发图", or ``None`` when the page has no thread title
        (previously such a page raised ``IndexError``).
        """
        html = etree.HTML(text)
        if html is None:
            return None

        titles = html.xpath("//h1[@class='core_title_txt ']/text()")
        if not titles:
            return None
        title = re.sub(r"@leaf", "", titles[0])

        # The first floor may be missing from the markup; fall back to empty
        # lists instead of failing on [0].
        first_layer_content = []
        first_layer_imgs_url = []
        first_layers = html.xpath("//div[@class='l_post j_l_post l_post_bright noborder ']")
        if first_layers:
            first_layer = first_layers[0]
            first_layer_content = first_layer.xpath(".//div[@class='d_post_content j_d_post_content clearfix']//text()")
            first_layer_imgs_url = first_layer.xpath(".//img[@class='BDE_Image']/@src")

        other_review_data = []
        for o_l in html.xpath("//div[@class='l_post j_l_post l_post_bright ']"):
            # A reply may have no text node at all; store "" for it.
            other_review = self._first_text(
                o_l.xpath(".//div[@class='d_post_content j_d_post_content clearfix']//text()")
            )
            # NOTE(review): the first floor collects 'BDE_Image' sources while
            # replies collect 'BDE_Smiley' ones -- confirm this is intended.
            other_img_urls = o_l.xpath(".//img[@class='BDE_Smiley']/@src")
            other_review_data.append((other_review, other_img_urls))

        return {"标题": title, "一楼发言": first_layer_content, "一楼发图": first_layer_imgs_url, "吧友发言与发图": other_review_data}

    def save(self, data):
        """Insert ``data`` as one document into ``tieba_spider.tjgydx``."""
        myclient = pymongo.MongoClient("mongodb://localhost:27017/")
        try:
            mydb = myclient["tieba_spider"]
            mycol = mydb["tjgydx"]
            mycol.insert_one(data)
        finally:
            # Close the connection even when the insert fails.
            myclient.close()

    def run(self):
        """Crawl every list page, then fetch, parse and store each thread.

        Threads that cannot be downloaded or parsed are reported and skipped
        so a single bad page does not abort the whole crawl.
        """
        for list_index, list_url in enumerate(self.list_urls):
            list_text = self.get_html(list_url)
            detail_urls = self.parse_list_page(list_text)
            for detail_index, detail_url in enumerate(detail_urls):
                try:
                    # Send the prepared headers and bound the wait time.
                    response = requests.get(
                        detail_url,
                        headers=self.headers,
                        timeout=self.REQUEST_TIMEOUT,
                    )
                except requests.RequestException as exc:
                    print("第{}页第{}贴: 请求失败, 已跳过 ({})".format(list_index, detail_index, exc))
                    continue
                data = self.parse_detail_page(response.text)
                if data is None:
                    print("第{}页第{}贴: 解析失败, 已跳过".format(list_index, detail_index))
                    continue
                self.save(data)
                print("第{}页第{}贴: {}写入成功...".format(list_index, detail_index, data["标题"]))
# Run the crawl and always shut the browser down afterwards, even when the
# crawl fails part-way, so no Chrome/chromedriver process is left behind.
tb = TieBa()
try:
    tb.run()
finally:
    tb.driver.quit()
爬取结果保存在 MongoDB 数据库中,如下图所示: