百度贴吧
解决了上一次未解决的问题:引入 selenium 后可以实现对动态页面的爬取。代码因此变得更复杂,还有很多步骤可以优化。
百度系列:
上一次的百度贴吧1.0爬虫
百度贴吧实现翻译下小功能,里面的问题应该也可以用同样的思路被解决
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 29 15:33:00 2020
@author: Administrator
"""
import requests
from lxml import etree
import json
import os
from selenium import webdriver
import time
# Switch the process working directory at import time; save_content opens its
# output file by a relative name, so the files are created under this folder.
# NOTE(review): hard-coded, machine-specific Windows path - os.chdir raises if
# the directory does not exist on the machine running the script.
os.chdir(r"H:\实操\学习\01\tie")
class TiebaSpyder:
    """Crawl the ``tab=good`` thread listing of one Baidu Tieba forum.

    For every listing page the spider collects each thread title, opens the
    thread in a Selenium-driven Chrome to obtain its URL and the ``src`` of
    every ``BDE_Image`` picture, and appends the records to a text file as
    JSON (one file per listing page).
    """

    # Seconds to wait for the listing page before ``requests`` gives up;
    # without a timeout a stalled connection would block the crawl forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, tiename, cookie=None):
        """Prepare the start URL and the HTTP headers.

        Args:
            tiename: Forum name, inserted as the ``kw`` query parameter.
            cookie: Optional value for the ``Cookie`` request header. When
                omitted, a module-level ``yourcookie`` variable is used if
                one is defined; otherwise no ``Cookie`` header is sent.
        """
        self.tiename = tiename
        self.url_temp = (
            "https://tieba.baidu.com/f?kw=" + self.tiename
            + "&ie=utf-8&tab=good&cid=0&pn=0"
        )
        if cookie is None:
            # Backward compatibility: the placeholder name ``yourcookie`` used
            # to be read unconditionally (NameError when it was not defined).
            cookie = globals().get("yourcookie")
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
        }
        if cookie is not None:
            self.headers["Cookie"] = cookie
        self.part_url = "http://tieba.baidu.com/{}?red_tag=q2497054097"

    def parse_url(self, url):
        """Download ``url`` with the spider's headers and return it as text.

        Raises:
            requests.RequestException: On network failure or timeout.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return response.content.decode(encoding="utf-8")

    def get_url_content(self, html_str, url):
        """Build one record per thread found in a listing page.

        Args:
            html_str: HTML text of the listing page.
            url: Address of that same listing page; it is re-opened in a
                browser to resolve each thread's detail URL.

        Returns:
            A list of dicts with the keys ``title``, ``href`` and
            ``img_list``.
        """
        # Replace the HTML comment delimiters so the enclosed markup is parsed
        # as elements (presumably the thread list is shipped inside comments
        # - TODO confirm against the live page).
        html = etree.HTML(html_str.replace("<!--", '"').replace("-->", '"'))
        div_list = html.xpath(
            "//div[contains(@class,'threadlist_title pull_left j_th_tit')]"
        )
        content_list = []
        for index, div in enumerate(div_list):
            print("*" * 20)
            titles = div.xpath("./a/text()")
            # The listing is a dynamic page (links carry a red_tag), so the
            # detail URL is obtained by clicking through in a real browser.
            print(index)
            href, img_list = self.get_detail_url(url, index + 1, [])
            content_list.append({
                "title": titles[0] if titles else None,
                "href": href,
                "img_list": img_list,
            })
        return content_list

    def get_detail_url(self, url, pos, total_img_list):
        """Open one thread from the listing and collect its image URLs.

        Args:
            url: Listing page to load in the browser.
            pos: Index into the elements matched by the ``j_th_tit`` class.
                NOTE(review): the caller passes ``index + 1``; confirm this
                offset matches the element order on the live page.
            total_img_list: List that the image ``src`` values are appended
                to (mutated in place and also returned).

        Returns:
            ``(detail_url, total_img_list)`` where ``detail_url`` is the URL
            of the thread's first page, or ``None`` when ``pos`` is out of
            range for the page.
        """
        from urllib.parse import urljoin

        driver = webdriver.Chrome()
        try:
            driver.get(url)
            title_links = driver.find_elements("css selector", ".j_th_tit")
            if pos >= len(title_links):
                return None, total_img_list
            title_links[pos].click()
            # The thread normally opens in a new tab; taking the last handle
            # also works when the click navigated in the current window.
            driver.switch_to.window(driver.window_handles[-1])
            detail_url = driver.current_url

            visited = {detail_url}
            while True:
                detail_html = etree.HTML(driver.page_source)
                total_img_list.extend(
                    detail_html.xpath("//img[@class='BDE_Image']/@src")
                )
                next_links = detail_html.xpath("//a[text()='下一页']/@href")
                if not next_links:
                    print(None)
                    break
                # The href may be relative, so resolve it against the page.
                next_detail_url = urljoin(driver.current_url, next_links[0])
                print(next_detail_url)
                if next_detail_url in visited:
                    # Guard against a pager that links back to a seen page.
                    break
                visited.add(next_detail_url)
                # Actually load the next page; without this the same page
                # would be parsed again and the loop would never finish.
                driver.get(next_detail_url)
            return detail_url, total_img_list
        finally:
            # Always close the browser, otherwise one Chrome per thread leaks.
            driver.quit()

    def get_next_url(self, url):
        """Return the URL of the listing page after ``url``.

        Returns:
            The next listing page URL, or ``None`` when the page has no
            "next" pagination link (i.e. the crawl should stop).
        """
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            # Fixed pause kept from the original implementation, presumably
            # to let the page scripts render the pager.
            time.sleep(10)
            # Two classes on one element need a CSS selector; a class-name
            # locator does not accept a compound class name.
            pager_links = driver.find_elements(
                "css selector", ".next.pagination-item"
            )
            if not pager_links:
                return None
            href = pager_links[0].get_attribute("href")
            if href:
                return href
            # No href available: fall back to clicking and reading the URL.
            pager_links[0].click()
            return driver.current_url
        finally:
            driver.quit()

    def save_content(self, content_list, num):
        """Append the records of listing page ``num`` to its text file.

        The file is named ``"贴吧" + tiename + str(num) + ".txt"`` relative to
        the current working directory; each record is written as indented,
        non-ASCII-escaped JSON followed by a newline.
        """
        filepath = "贴吧" + self.tiename + str(num) + ".txt"
        with open(filepath, "a", encoding='utf-8') as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False, indent=2))
                f.write("\n")
        print("保存成功")

    def run(self):
        """Crawl listing pages one after another until there is no next page."""
        # 1. Start from the first listing page.
        num = 1
        next_url = self.url_temp
        while next_url is not None:
            current_url = next_url
            # 2. Fetch the listing page.
            html_str = self.parse_url(current_url)
            # 3. Extract titles, detail URLs and image lists.
            content_list = self.get_url_content(html_str, current_url)
            # 4. Look up the following listing page (None ends the loop).
            next_url = self.get_next_url(current_url)
            # 5. Persist this page's records, then move on.
            self.save_content(content_list, num)
            num += 1
if __name__ == "__main__":
    # Script entry point: build a spider for the "摄影" forum and crawl it
    # page by page until no further listing page is found.
    tieba = TiebaSpyder("摄影")
    tieba.run()
视频学习完成以后,就开始学习更加系统化的敲代码
-------------------- 视频学习 小酥仙 2020.3.5 --------------------------------------