Python-使用BeautifulSoup爬取豆瓣TOP250电影

# -*- coding: utf-8 -*-
"""
file_name:py_movie
author:Sam
"""
from urllib import request
from bs4 import BeautifulSoup


# Label/key pairs in the order they are written to the output file.
_MOVIE_FIELDS = (
    ("[排名]:", 'rank'),
    ("[标题]:", 'title'),
    ("[其他]:", 'other'),
    ("[评语]:", 'inq'),
    ("[详情]:", 'info'),
    ("[评分]:", 'grade'),
    ("[评分人数]:", 'people'),
    ("[主图]:", 'img'),
    ("[播放路径]:", 'play_url'),
)

# Visual divider written after every movie record.
_MOVIE_SEPARATOR = "**********************************************************************************************************************"


def _parse_movie(item):
    """Extract the fields of one ``<div class="item">`` tag into a dict of strings."""
    star = item.find('div', class_='star')
    # Some entries have no one-line quote; fall back to a placeholder
    # instead of failing on None.
    inq = item.find('span', class_='inq')
    # Children 0 and 2 of the first <p> under div.bd are read as text;
    # child 1 is presumably a <br> separating them -- confirm against the page.
    paragraph = item.find('div', class_='bd').find('p').contents
    return {
        'rank': item.find('em').get_text(),
        'play_url': item.find('a').get('href'),
        'img': item.find('img').get("src"),
        'title': item.find('span', class_='title').get_text(),
        'other': item.find('span', class_='other').get_text(),
        'grade': item.find('span', class_='rating_num').get_text(),
        # Positional lookup of the rating-count node inside div.star; this
        # depends on the page's exact markup (including whitespace nodes).
        'people': star.contents[7].get_text(),
        'inq': inq.get_text() if inq is not None else '暂无',
        'info': paragraph[0].strip() + "  年份产地:" + paragraph[2].strip(),
    }


def _format_movie(movie):
    """Render one parsed movie dict as the text record written to the file."""
    lines = [label + movie[key] for label, key in _MOVIE_FIELDS]
    return "\n".join(lines) + "\n\n" + _MOVIE_SEPARATOR + "\n\n"


def main(request_url="", fpath='d:/project/Python/demo/test_py_movie.txt'):
    """Fetch one Top 250 page, parse its movie entries and append them to a file.

    Args:
        request_url: URL of the list page to download. The default empty
            string is not a valid URL and makes ``request.Request`` raise
            ``ValueError``.
        fpath: Path of the UTF-8 text file the records are appended to.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        AttributeError, IndexError: If an entry lacks an expected element.
    """
    # Send a browser-like User-Agent header with the request.
    req = request.Request(request_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36')
    # Close the HTTP response as soon as the body has been read.
    with request.urlopen(req) as res:
        html = res.read().decode('utf-8')

    soup = BeautifulSoup(html, "html.parser")
    movies = [_parse_movie(item)
              for item in soup.find_all('div', class_="item")]

    # Append so that successive pages accumulate in the same file; the
    # context manager closes the file, no explicit close() is needed.
    with open(fpath, 'a', encoding='utf-8') as f:
        f.write("".join(_format_movie(movie) for movie in movies))


if __name__ == '__main__':
    # Call main() once per list page, passing the start offsets
    # 0, 25, ..., 225 appended to the base URL.
    url = "https://movie.douban.com/top250?start="
    for start in range(0, 250, 25):
        main("%s%d" % (url, start))

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值