import requests
from lxml import etree
import re
import json
import os
import time
class TieBaSpider():
    """Scrape post titles, contents and images from the 天津工业大学 board on Baidu Tieba.

    Post details are appended to ./data/tieba.json (one JSON object per line);
    post images are downloaded into ./data/tieba_imgs/.
    """

    def __init__(self):
        # Desktop Chrome UA so Tieba serves the full desktop markup.
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }

    def get_html_str(self, url):
        """GET *url* and return the decoded HTML string, or None on a non-200 reply."""
        # Bug fix: timeout added so a stalled connection cannot hang the crawl forever.
        response = requests.get(url, headers=self.headers, timeout=10)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None

    def parse_list_page(self, html_str):
        """Extract {"title", "url"} dicts from one board list page.

        Each post <li> is cut out with a regex first, then parsed
        individually with lxml.
        """
        posts_li = re.findall(r'<li class=" j_thread_list clearfix"(.*?)</li>', html_str, re.S)
        # Re-wrap each fragment so lxml sees a complete <li> element.
        posts_li = map(lambda x: '<li class=" j_thread_list clearfix"' + x + "</li>", posts_li)
        posts_data = []
        for li in posts_li:
            li = etree.HTML(li)
            titles = li.xpath(".//a[@class='j_th_tit ']/text()")
            hrefs = li.xpath(".//a[@class='j_th_tit ']/@href")
            # Bug fix: the original indexed [0] unconditionally and raised
            # IndexError on a malformed entry; skip such entries instead.
            if not titles or not hrefs:
                continue
            posts_data.append({
                "title": titles[0],
                "url": "https://tieba.baidu.com" + hrefs[0],
            })
        return posts_data

    def parse_detail_page(self, html_str):
        """Return (text fragments, image src list) from one post detail page."""
        html = etree.HTML(html_str)
        post_contents = html.xpath("//div[@class='d_post_content j_d_post_content clearfix']/text()")
        post_contents = [text.strip() for text in post_contents]
        post_imgs_src = html.xpath("//div[@class='d_post_content j_d_post_content clearfix']/img/@src")
        return post_contents, post_imgs_src

    def save_post_detail(self, item):
        """Append *item* as one JSON line to ./data/tieba.json."""
        # Bug fix: create the output directory on first use.
        os.makedirs("./data", exist_ok=True)
        # Bug fix: with-block replaces the bare open/close pair so the handle
        # is released even if json.dump raises.
        with open("./data/tieba.json", "a", encoding="utf-8") as fp:
            json.dump(item, fp, ensure_ascii=False)
            fp.write("\n")
        print(item["title"] + "\t详情写入成功...")

    def save_post_img(self, src_list, title):
        """Download every image in *src_list*, naming files after *title*."""
        if not src_list:
            print("当前帖子无图片内容...")
            return None
        # Bug fix: create the image directory on first use.
        os.makedirs("./data/tieba_imgs", exist_ok=True)
        # Strip characters that are illegal in (Windows) file names.
        title = re.sub(r'[\\|/|:|*|?|<|>\|\n]', "", title)
        for index, src in enumerate(src_list):
            # Keep at most 4 chars of the extension (e.g. ".jpg") — some srcs
            # carry extra text after the real extension.
            end_name = os.path.splitext(src)[1][:4]
            file_name = title + str(index) + end_name
            with open("./data/tieba_imgs/" + file_name, "wb") as fp:
                fp.write(requests.get(src, timeout=10).content)
            time.sleep(1)  # be polite to the image servers
            print(file_name + "写入成功...")

    def run(self):
        """Crawl 20 list pages (Tieba paginates by a 50-post offset) and persist every post."""
        for i in range(20):
            pn = i * 50
            list_url = "https://tieba.baidu.com/f?kw=天津工业大学&ie=utf-8&pn={}".format(pn)
            html_str = self.get_html_str(list_url)
            print("当前爬取第{}页...".format(i + 1))
            # Bug fix: the original passed None into the parser when the
            # request failed (non-200) and crashed; skip the page instead.
            if html_str is None:
                continue
            for post in self.parse_list_page(html_str):
                detail_html = self.get_html_str(post["url"])
                if detail_html is None:
                    continue
                post_contents, post_imgs_src = self.parse_detail_page(detail_html)
                item = {
                    "title": post["title"],
                    "url": post["url"],
                    "content": post_contents,
                    "imgs_src": post_imgs_src,
                }
                self.save_post_detail(item)
                self.save_post_img(item["imgs_src"], item["title"])
if __name__ == '__main__':
    # Script entry point: build the spider and start the full crawl.
    TieBaSpider().run()
# 爬取详情结果如下: (crawl detail results shown below)
# 爬取图片结果如下: (crawled image results shown below)