爬取豆瓣电影新片榜

最新推荐文章于 2024-04-02 00:01:19 发布

Arik (IoT)

最新推荐文章于 2024-04-02 00:01:19 发布

阅读量472

点赞数

分类专栏： python爬虫及部分可视化案例

本文链接：https://blog.csdn.net/qq_45770364/article/details/104946495

版权

python爬虫及部分可视化案例专栏收录该内容

37 篇文章 3 订阅

订阅专栏

python爬虫 — 豆瓣电影新片榜

豆瓣电影新片榜

import requests
import csv
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from urllib3.exceptions import RequestError

def get_one_page(url):
    """Download *url* and return the response body as text.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request itself fails.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edg/80.0.361.66"
    }
    try:
        # A timeout keeps the script from blocking forever on a stalled
        # connection.
        response = requests.get(url, headers=headers, timeout=10)
    except (requests.RequestException, RequestError):
        # requests reports failures through its own RequestException
        # hierarchy; the urllib3 RequestError handled originally is kept
        # so previously-caught errors are still caught.
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_one_page(html):
    """Extract movie titles from the chart page markup.

    Every ``<tr class="item">`` element is inspected and the ``title``
    attribute of its first ``<a>`` descendant is collected.

    Args:
        html: Page markup as a string. ``None`` or an empty string is
            accepted and yields an empty result.

    Returns:
        A list of title strings in document order. Rows that have no
        ``<a>`` tag or whose ``<a>`` lacks a ``title`` attribute are skipped.
    """
    # Nothing to parse (e.g. the download failed and produced None).
    if not html:
        return []
    soup = BeautifulSoup(html, "html.parser")
    top10 = []
    for row in soup.find_all("tr", class_="item"):
        link = row.a
        # Skip malformed rows instead of raising TypeError / KeyError.
        if link is not None and link.has_attr("title"):
            top10.append(link["title"])
    return top10

def write_to_file(content):
    """Save the titles to ``Douban_top10_movie.csv`` with a 1-based rank.

    The file has two columns, ``index`` and ``top10``, is comma-separated,
    and is encoded as ``utf-8-sig``. Any existing file of that name is
    overwritten.

    Args:
        content: Sequence of title strings; it may have any length.
    """
    # Derive the rank column from the data so that a page yielding more or
    # fewer than ten rows does not make pandas raise a length-mismatch error.
    index = list(range(1, len(content) + 1))
    table = pd.DataFrame({"index": index, 'top10': content})
    table.to_csv(r'Douban_top10_movie.csv', sep=',', encoding='utf-8-sig', index=False)

if __name__ =='__main__':
    # Download the Douban chart page, extract the movie titles and save
    # them to a CSV file.
    html = get_one_page('https://movie.douban.com/chart')
    if html is None:
        # get_one_page returns None when the page could not be retrieved;
        # stop with a clear message instead of handing None to the parser.
        raise SystemExit('Failed to download https://movie.douban.com/chart')
    content = parse_one_page(html)
    write_to_file(content)