1-7-2爬虫获取百度贴吧评论信息

coleman114

已于 2024-04-26 16:24:45 修改

阅读量248

点赞数 4

分类专栏：自然语言处理　文章标签：爬虫

于 2024-04-10 19:50:52 首次发布

本文链接：https://blog.csdn.net/coleman114/article/details/137604152

版权

自然语言处理专栏收录该内容

12 篇文章 0 订阅

订阅专栏

使用爬虫方式获取网络资源信息，以下代码用于抓取百度贴吧帖子中的评论信息：

import time
import requests
from bs4 import BeautifulSoup
import jsonlines
import json

# Scrape the post bodies of a Baidu Tieba thread, page by page, and store
# them as JSON Lines records of the form {'评论内容': <text>}.

# Accumulates one record per scraped post body.
x = []
# Thread URL kept from the original script; the crawl below does not use it.
url = 'https://tieba.baidu.com/p/8957953350'
# Send a browser-like User-Agent with every GET request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
# First-page URL of the thread that is actually crawled.
base_url = 'https://tieba.baidu.com/p/8928633330'
# Number of pages to fetch (pn=1 .. pn=page_count).
page_count = 2
# Seconds to wait for the server before giving up on a page.
request_timeout = 10

for i in range(page_count):
    # The page is selected through the `pn` query parameter (i + 1, 1-based).
    page_url = f'{base_url}?pn={i+1}'
    try:
        response = requests.get(page_url, headers=headers, timeout=request_timeout)
    except requests.RequestException as exc:
        # One failing page must not discard the pages already scraped.
        print(f'Request failed for {page_url}: {exc}')
    else:
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            comment_list = soup.find_all('div', class_='d_post_content j_d_post_content clearfix', style="display:;")
            for comment in comment_list:
                x.append({'评论内容': comment.text.strip()})
        else:
            print(f'Skipping {page_url}: HTTP {response.status_code}')

    # Pause between requests so the crawl is not fast enough to get the IP blocked.
    time.sleep(2)

# Write all collected records once, after the crawl has finished.
with jsonlines.open('train0324.jsonl', mode='w') as writer:
    writer.write_all(x)

import time
import requests
from bs4 import BeautifulSoup
import jsonlines
import json

# Scrape the post bodies of a Baidu Tieba thread, page by page, and store
# them as JSON Lines records of the form {'评论内容': <text>}.

# Accumulates one record per scraped post body.
x = []
# Thread URL kept from the original script; the crawl below does not use it.
url = 'https://tieba.baidu.com/p/8957953350'
# Send a browser-like User-Agent with every GET request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
# First-page URL of the thread that is actually crawled.
base_url = 'https://tieba.baidu.com/p/8928633330'
# Number of pages to fetch (pn=1 .. pn=page_count).
page_count = 2
# Seconds to wait for the server before giving up on a page.
request_timeout = 10

for i in range(page_count):
    # The page is selected through the `pn` query parameter (i + 1, 1-based).
    page_url = f'{base_url}?pn={i+1}'
    try:
        response = requests.get(page_url, headers=headers, timeout=request_timeout)
    except requests.RequestException as exc:
        # One failing page must not discard the pages already scraped.
        print(f'Request failed for {page_url}: {exc}')
    else:
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            comment_list = soup.find_all('div', class_='d_post_content j_d_post_content clearfix', style="display:;")
            for comment in comment_list:
                x.append({'评论内容': comment.text.strip()})
        else:
            print(f'Skipping {page_url}: HTTP {response.status_code}')

    # Pause between requests so the crawl is not fast enough to get the IP blocked.
    time.sleep(2)

# Write all collected records once, after the crawl has finished.
with jsonlines.open('train0324.jsonl', mode='w') as writer:
    writer.write_all(x)