A Python Crawler for Douban Popular Movies

Environment

PyCharm 2020.2.4 (Professional Edition)
bs4==0.0.1
urllib3==1.24.2
MongoDB 3.4.10
Python 3.7.4
pymongo 3.11.3
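
If the packages above are missing, they can usually be installed with pip install beautifulsoup4 pymongo; note that the bs4==0.0.1 entry reported by pip freeze is just a thin wrapper package that pulls in beautifulsoup4.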

Implementation

import urllib.request
from bs4 import BeautifulSoup
import re
import pymongo


def crawl():
    url = "https://movie.douban.com/top250?start="
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"}

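    # Regular expressions used to extract each field from a movie's HTML block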
    image_url_pattern = re.compile(r'<img .*src="(.*?)".*>')
    title_pattern = re.compile(r'<span class="title">(.*?)</span>')
    information_pattern = re.compile(r'<p class="">(.*?)</p>', re.S)
    rating_pattern = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
    evaluator_pattern = re.compile(r'<span>(.*?)人评价</span>')
    synopsis_pattern = re.compile(r'<span class="inq">(.*?)</span>')

    movies = []

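    # The Top 250 list spans 10 pages of 25 movies each; the start query parameter is the page offset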
    for i in range(0, 10):
        request = urllib.request.Request(url=url + str(25 * i), headers=headers)
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')

        beautiful_soup = BeautifulSoup(html, "html.parser")
        for item in beautiful_soup.find_all('div', class_='item'):
            movie = {}
            item = str(item)
            image_url = re.findall(image_url_pattern, item)[0]
            movie["image_url"] = image_url
            title = re.findall(title_pattern, item)[0]
            movie["title"] = title
            information = re.findall(information_pattern, item)
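            # The captured info block ends with a line like "year / country / genre",
            # where the fields are separated by non-breaking spaces (\xa0); replacing
            # them with commas and splitting leaves the country at index -3 and the genre at index -1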
            information_list = information[0].strip().replace('\xa0', ',').split(',')
            movie["country"] = information_list[-3]
            movie["category"] = information_list[-1]
            rating = re.findall(rating_pattern, item)[0]
            movie["rating"] = rating
            evaluator = re.findall(evaluator_pattern, item)[0]
            movie["evaluator"] = evaluator
            synopsis = re.findall(synopsis_pattern, item)

            if len(synopsis) > 0:
                movie["synopsis"] = synopsis[0]
            else:
                movie["synopsis"] = "None"

            movies.append(movie)

    return movies


def save_in_mongodb(uri, database_name, collection_name, data):
    mongo_client = pymongo.MongoClient(uri)
    database = mongo_client[database_name]
    collection = database[collection_name]
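    # insert_many stores each movie dict as a separate document in the collection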
    collection.insert_many(data)


if __name__ == "__main__":
    movies = crawl()
    save_in_mongodb('mongodb://localhost:27017', 'douban', 'movies', movies)

Test Results

Running db.movies.find() in the mongo shell lists the stored movie documents.
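
If you prefer to check the result from Python rather than the mongo shell, a minimal sketch along the following lines should work (it assumes MongoDB is still running locally on the default port and that the data was written to douban.movies as above):

import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')
collection = client['douban']['movies']

# The Top 250 crawl should have produced 250 documents
print(collection.count_documents({}))

# Spot-check a few records
for doc in collection.find().limit(3):
    print(doc['title'], doc['rating'], doc['country'])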

Finally

  • Given the author's limited experience, omissions are hard to avoid; readers are welcome to point out mistakes at any time so that unnecessary misunderstandings can be avoided!