使用BeautifulSoup爬取烂番茄

最新推荐文章于 2024-06-03 18:40:41 发布

自由的犇儿哥

最新推荐文章于 2024-06-03 18:40:41 发布

阅读量1k

点赞数 1

分类专栏：快乐学python

本文链接：https://blog.csdn.net/BCQCCB/article/details/116843591

版权

快乐学python 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
from bs4.element import Tag
import requests

# HTTP headers attached to every request this script sends: a desktop
# Chrome User-Agent string and a Google Referer.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/78.0.3904.97 Safari/537.36'
    ),
    'Referer': 'https://www.google.com/',
}
# Seconds to wait on a single HTTP request before giving up, so a stalled
# connection cannot hang the script forever.
REQUEST_TIMEOUT = 10

# Value stored for a review that carries no score (a single space).
NO_SCORE = ' '


def parse_page_count(page_info_text):
    """Return the number of review pages named by the pager label text.

    The page total is taken from the last whitespace-separated token of
    the label. Returns 1 when the text is missing, empty, or does not end
    in an integer, so that only the first page is processed.
    """
    if not page_info_text:
        return 1
    tokens = page_info_text.split()
    if not tokens:
        return 1
    try:
        total = int(tokens[-1])
    except ValueError:
        return 1
    return max(1, total)


def extract_score(review_link_lines):
    """Return the score text from the prettified ``.review-link`` lines.

    The score, when present, sits on the second-to-last line and contains
    a ``|`` character. Returns ``NO_SCORE`` when that line has no ``|`` or
    when there are fewer than two lines.
    """
    if len(review_link_lines) < 2:
        return NO_SCORE
    candidate = review_link_lines[-2]
    if '|' not in candidate:
        return NO_SCORE
    return candidate.strip()


def collect_reviews(soup, data):
    """Store every review found in ``soup`` into ``data``.

    ``data`` maps review text to score text (or ``NO_SCORE``). Reviews with
    identical text overwrite each other because the text is the key.
    """
    for row in soup.select('.review_table_row '):
        review_nodes = row.select('.the_review')
        if not review_nodes:
            # Row without review text: nothing to record.
            continue
        # get_text() also copes with nested markup, where .string is None.
        review = review_nodes[0].get_text(' ', strip=True)

        link_nodes = row.select('.review-link ')
        if link_nodes:
            lines = link_nodes[0].prettify().strip().split('\n')
        else:
            lines = []
        data[review] = extract_score(lines)


def fetch_soup(url):
    """Download ``url`` with the module-level ``headers`` and parse it.

    Raises ``requests.RequestException`` on network failure, timeout, or a
    4xx/5xx response status.
    """
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'lxml')


if __name__ == '__main__':
    import sys

    data = {}  # holds all scraped data: review text -> score

    movie_id = 'a_week_away'
    first_url = 'https://www.rottentomatoes.com/m/%s/reviews' % (movie_id)
    first_page = fetch_soup(first_url)

    # Work out how many pages must be visited. Movies with few reviews have
    # no pager element at all, in which case there is only one page.
    pager = first_page.select('.pageInfo')
    page_count = parse_page_count(pager[0].get_text() if pager else None)

    # Process the first page.
    collect_reviews(first_page, data)

    # Process the remaining pages.
    for page_id in range(2, page_count + 1):
        print(page_id)
        page_url = 'https://www.rottentomatoes.com/m/%s/reviews?type=&sort=&page=%d' % (movie_id, page_id)
        try:
            page = fetch_soup(page_url)
        except requests.RequestException as exc:
            # Stop paging but still print what was collected so far.
            sys.stderr.write('failed to fetch page %d: %s\n' % (page_id, exc))
            break
        collect_reviews(page, data)

    print(data)

自由的犇儿哥

关注

1
点赞
踩
3

收藏

觉得还不错? 一键收藏
1
评论
使用BeautifulSoup爬取烂番茄

from pyquery import PyQuery as pq; from bs4 import BeautifulSoup; from bs4.element import Tag; import requests; headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', 'Ref…
复制链接

扫一扫