python3 爬取猫眼电影Top100信息（正则+requests）

最新推荐文章于 2020-07-26 17:04:47 发布

hiudawn

最新推荐文章于 2020-07-26 17:04:47 发布

阅读量608

点赞数

分类专栏： 爬虫　文章标签： python3 爬虫 正则 requests

本文链接：https://blog.csdn.net/hiudawn/article/details/79672454

版权

爬虫专栏收录该内容

5 篇文章 0 订阅

订阅专栏

import json
from multiprocessing.dummy import Pool
import requests
import re

def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would stall a
            worker forever on an unresponsive server.

    Returns:
        The response body decoded to ``str`` (``response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            *timeout* seconds.
    """
    # Send a browser-like User-Agent; per the original author's note the site
    # denies access to requests that do not look like they come from a browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text

def parse_one_page(html):
    """Yield one dict per movie entry found in *html*.

    Each entry is a ``<dd>...</dd>`` block from which the regular expression
    captures, in order: the text after the ``board-index`` tag, the first
    ``title="..."`` attribute, the contents of the ``star`` and
    ``releasetime`` paragraphs, and the ``integer`` / ``fraction`` parts of
    the score.

    Args:
        html: Page source as a string.

    Yields:
        dict with the string-valued keys ``index``, ``title``, ``actor``,
        ``time`` and ``score``. Nothing is yielded when no entry matches.
    """
    # re.S lets '.' span newlines, since one entry covers many lines of markup.
    pattern = re.compile('<dd>.*?board-index.*?>(.*?)<.*?title="(.*?)".*?star">(.*?)</p>'
                         '.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)'
                         '</i>.*?</dd>', re.S)
    for index, title, actor, release_time, integer, fraction in pattern.findall(html):
        yield {
            'index': index,
            # strip() removes every kind of surrounding whitespace (spaces,
            # tabs, CR/LF in any order), not just one space/newline/space layer.
            'title': title.strip(),
            'actor': actor.strip(),
            'time': release_time,
            # The score is split across two tags, e.g. "9." and "6" -> "9.6".
            'score': integer + fraction,
        }

def write_to_file(content, path='./output.txt'):
    """Append *content* to *path* as a single line of JSON.

    Args:
        content: JSON-serializable object (the scraper passes one dict per movie).
        path: File to append to; defaults to ``./output.txt``. The file is
            created if it does not exist.
    """
    # Mode 'a' appends; the with-block closes the file, so no manual close is needed.
    with open(path, 'a', encoding='utf-8') as output_file:
        # ensure_ascii=False writes non-ASCII text as-is rather than as
        # \uXXXX escape sequences, keeping the output human-readable.
        output_file.write(json.dumps(content, ensure_ascii=False) + '\n')

def read_from_file(file):
    """Load a file holding one JSON document per line and return the parsed list.

    Each line is first parsed exactly as written. Only if that fails is the
    line retried with single quotes replaced by double quotes, which accepts
    lines written in a single-quoted, dict-repr-like form. Trying the raw
    line first matters: replacing quotes unconditionally would corrupt valid
    JSON whose strings contain an apostrophe.

    Args:
        file: Path of the file to read (UTF-8).

    Returns:
        A list with one parsed object per readable line, in file order.
        Blank lines are skipped; lines that cannot be parsed are reported
        on stdout and omitted.
    """
    data = []
    with open(file, 'r', encoding='utf-8') as lines:
        for line in lines:
            text = line.strip()
            if not text:
                continue
            try:
                record = json.loads(text)
            except ValueError:  # json.JSONDecodeError is a ValueError subclass
                try:
                    record = json.loads(text.replace("'", '"'))
                except ValueError:
                    print('something wrong')
                    continue
            data.append(record)
    return data

def main(offset):
    """Fetch the board page at *offset*, parse it, and save every entry found."""
    page_url = 'http://maoyan.com/board/4?offset=%d' % offset
    # parse_one_page is a generator of dicts; each one is written out as it is produced.
    for movie in parse_one_page(get_one_page(page_url)):
        write_to_file(movie)
if __name__ == '__main__':
    # Collecting the full Top 100 means paging through offsets 0, 10, ..., 90.
    # A serial alternative is to call main(i * 10) for each i in range(10),
    # which keeps the entries in index order.

    # Thread-pool version: map() hands each offset to main() on a worker thread.
    # Pages complete in arbitrary order, so entries in the output file are not
    # sorted by index. The with-block shuts the pool down once map() has
    # returned, so the worker threads are not left behind.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
    # Read the output file back and print it.
    # Original author's note: because of the pool, concurrent file writes may
    # occasionally go wrong due to resource contention.
    data = read_from_file('./output.txt')
    print(data)

输出结果：

{"index": "21", "title": "海上钢琴师", "actor": "主演：蒂姆·罗斯,普路特·泰勒·文斯,比尔·努恩", "time": "上映时间：1998-10-28(意大利)", "score": "9.2"}
{"index": "22", "title": "指环王3：王者无敌", "actor": "主演：伊利亚·伍德,伊恩·麦克莱恩,丽芙·泰勒", "time": "上映时间：2004-03-15", "score": "9.2"}
{"index": "2", "title": "肖申克的救赎", "actor": "主演：蒂姆·罗宾斯,摩根·弗里曼,鲍勃·冈顿", "time": "上映时间：1994-10-14(美国)", "score": "9.5"}
{"index": "11", "title": "喜剧之王", "actor": "主演：周星驰,莫文蔚,张柏芝", "time": "上映时间：1999-02-13(中国香港)", "score": "9.2"}
{"index": "12", "title": "乱世佳人", "actor": "主演：费雯·丽,克拉克·盖博,奥利维娅·德哈维兰", "time": "上映时间：1939-12-15(美国)", "score": "9.1"}

{"index": "97", "title": "我爱你", "actor": "主演：宋在河,李顺才,尹秀晶", "time": "上映时间：2011-02-17(韩国)", "score": "9.0"}
{"index": "98", "title": "黄金三镖客", "actor": "主演：克林特·伊斯特伍德,李·范·克里夫,伊莱·瓦拉赫", "time": "上映时间：1966-12-23(意大利)", "score": "8.9"}
{"index": "99", "title": "迁徙的鸟", "actor": "主演：雅克·贝汉,菲利普·拉波洛", "time": "上映时间：2001-12-12(法国)", "score": "9.1"}
{"index": "100", "title": "英雄本色", "actor": "主演：狄龙,张国荣,周润发", "time": "上映时间：2017-11-17", "score": "9.2"}