GitHub repo: https://github.com/pasca520/Python3SpiderSet
1. Overview
| Step | Python library |
| --- | --- |
| Crawling | requests |
| Parsing | XPath (lxml) |
| Storage | plain-text file (txt) |
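Both third-party dependencies can be installed with `pip install requests lxml`; writing the txt file needs only the standard library.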
2. Parsing
Copy the XPath straight from the browser's developer tools, as the screenshot shows (this is also why I call XPath easy): right-click the target element and choose Copy → Copy XPath.
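As a quick sanity check before touching the real page, here is a minimal sketch of evaluating a copied XPath with lxml. The inline HTML snippet is a made-up stand-in mimicking one entry of the Maoyan board, not the site's actual markup:

```python
from lxml import etree

# Hypothetical snippet imitating one <dd> entry of the board
html = '''
<dl class="board-wrapper">
  <dd><i class="board-index">1</i><p class="name"><a>霸王别姬</a></p></dd>
</dl>
'''

tree = etree.HTML(html)
# An absolute path, like the one DevTools copies...
print(tree.xpath('/html/body/dl/dd[1]/i/text()'))        # ['1']
# ...and an equivalent relative path, which survives layout changes better
print(tree.xpath('//dd[1]/p[@class="name"]/a/text()'))   # ['霸王别姬']
```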
3. Code example
```python
import json

import requests
from lxml import etree
from requests.exceptions import ConnectionError, ReadTimeout, RequestException


# Fetch one page of the Top 100 board
def get_page(url, params):
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/78.0.3904.70 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                  'image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Referer': 'https://maoyan.com/board',
    }
    params = (
        ('offset', params),  # the board paginates via the offset query parameter
    )
    try:
        # A timeout is needed for ReadTimeout to ever be raised
        response = requests.get(url=url, headers=headers, params=params, timeout=10)
        return response
    except ReadTimeout:        # the request timed out
        print('Timeout')
    except ConnectionError:    # the connection failed or was interrupted
        print('Connect error')
    except RequestException:   # parent class of all requests errors
        print('Error')


# Parse the ten entries on one page
def parse_page(html):
    tree = etree.HTML(html)
    # rank        : ranking
    # movie_name  : film title
    # key_actor   : starring actors
    # up_time     : release date
    # movie_score : rating
    rank_list = []  # created once, outside the loop, or only the last entry survives
    for movie in range(1, 11):
        rank = tree.xpath('//*[@id="app"]/div/div/div[1]/dl/dd[%s]/i/text()' % movie)[0]
        movie_name = tree.xpath('//*[@id="app"]/div/div/div[1]/dl/dd[%s]/div/div/div[1]/p[1]/a/text()' % movie)[0]
        key_actor = tree.xpath('//*[@id="app"]/div/div/div[1]/dl/dd[%s]/div/div/div[1]/p[2]/text()'
                               % movie)[0].strip()  # strip surrounding whitespace
        up_time = tree.xpath('//*[@id="app"]/div/div/div[1]/dl/dd[%s]/div/div/div[1]/p[3]/text()' % movie)[0]
        # The rating is split across two <i> tags (integer and fractional parts)
        score1 = tree.xpath('//*[@id="app"]/div/div/div[1]/dl/dd[%s]/div/div/div[2]/p/i[1]/text()' % movie)[0]
        score2 = tree.xpath('//*[@id="app"]/div/div/div[1]/dl/dd[%s]/div/div/div[2]/p/i[2]/text()' % movie)[0]
        movie_score = score1 + score2
        rank_list.append({
            'rank': rank,
            'movie_name': movie_name,
            'key_actor': key_actor[3:],   # drop the "主演:" prefix
            'up_time': up_time[5:],       # drop the "上映时间:" prefix
            'movie_score': movie_score,
        })
    return rank_list


# Append one record per line to result.txt; ensure_ascii=False keeps the
# Chinese text human-readable instead of \u escape sequences
def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n\n')


if __name__ == '__main__':
    url = 'https://maoyan.com/board/4'
    for page in range(0, 100, 10):  # offsets 0, 10, ..., 90 → ten pages
        response = get_page(url=url, params=page)
        if response is None:  # the request failed; get_page already logged it
            continue
        rank_list = parse_page(response.text)
        for item in rank_list:
            write_to_file(item)
```
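Each page differs only in the `offset` query parameter, so the main loop requests https://maoyan.com/board/4?offset=0 through ?offset=90, ten entries at a time. Because `write_to_file` stores one JSON object per line, separated by blank lines, the results are easy to load back. A minimal sketch, assuming `result.txt` was produced by the run above:

```python
import json

# Read the records back, skipping the blank separator lines
with open('result.txt', encoding='utf-8') as f:
    movies = [json.loads(line) for line in f if line.strip()]

for m in movies[:3]:
    print(m['rank'], m['movie_name'], m['movie_score'])
```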