Python豆瓣热门电影爬虫
环境
PyCharm 2020.2.4 (Professional Edition)
bs4==0.0.1(bs4 仅是 beautifulsoup4 的安装包装,实际解析库为 beautifulsoup4)
urllib3==1.24.2
MongoDB 3.4.10
Python 3.7.4
pymongo 3.11.3
实现代码
import re
import urllib
import urllib.request

from bs4 import BeautifulSoup
import pymongo
def crawl():
    """Scrape the Douban Top 250 movie list (10 pages x 25 entries).

    Returns:
        list[dict]: one dict per movie with the keys
        ``image_url``, ``title``, ``country``, ``category``,
        ``rating``, ``evaluator`` and ``synopsis`` (``"None"`` when
        the movie has no one-line synopsis on the page).
    """
    url = "https://movie.douban.com/top250?start="
    # Douban rejects the default urllib User-Agent, so spoof a browser.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"}
    # Compile once up front; each pattern is applied to every list item.
    image_url_pattern = re.compile(r'<img .*src="(.*?)".*>')
    title_pattern = re.compile(r'<span class="title">(.*?)</span>')
    # re.S lets '.' span the newlines inside the <p> info paragraph.
    information_pattern = re.compile(r'<p class="">(.*?)</p>', re.S)
    rating_pattern = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
    evaluator_pattern = re.compile(r'<span>(.*?)人评价</span>')
    synopsis_pattern = re.compile(r'<span class="inq">(.*?)</span>')
    movies = []
    for i in range(0, 10):
        # Pagination: each page shows 25 movies, offset via ?start=.
        request = urllib.request.Request(url=url + str(25 * i), headers=headers)
        # Close the HTTP response deterministically (the original leaked it).
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
        beautiful_soup = BeautifulSoup(html, "html.parser")
        for item in beautiful_soup.find_all('div', class_='item'):
            movie = {}
            # The regexes are applied to the raw HTML of one list entry.
            item = str(item)
            movie["image_url"] = image_url_pattern.findall(item)[0]
            movie["title"] = title_pattern.findall(item)[0]
            information = information_pattern.findall(item)
            # The info paragraph uses \xa0 (nbsp) as a field separator;
            # after splitting, the last field is the genre and the
            # third-from-last the country — assumes Douban's current
            # layout, TODO confirm if the page structure changes.
            information_list = information[0].strip().replace('\xa0', ',').split(',')
            movie["country"] = information_list[-3]
            movie["category"] = information_list[-1]
            movie["rating"] = rating_pattern.findall(item)[0]
            movie["evaluator"] = evaluator_pattern.findall(item)[0]
            synopsis = synopsis_pattern.findall(item)
            # Not every movie has a one-line synopsis; keep the original
            # sentinel string "None" for those.
            movie["synopsis"] = synopsis[0] if synopsis else "None"
            movies.append(movie)
    return movies
def save_in_mongodb(uri, database_name, collection_name, data):
    """Bulk-insert *data* into a MongoDB collection.

    Args:
        uri: MongoDB connection string, e.g. ``mongodb://localhost:27017``.
        database_name: target database name.
        collection_name: target collection name.
        data: iterable of documents (dicts) passed to ``insert_many``.
    """
    # Use the client as a context manager so the connection is closed
    # when the insert finishes (the original never closed it).
    with pymongo.MongoClient(uri) as mongo_client:
        collection = mongo_client[database_name][collection_name]
        collection.insert_many(data)
if __name__ == "__main__":
    # Crawl all pages first, then persist the whole batch in one call.
    save_in_mongodb('mongodb://localhost:27017', 'douban', 'movies', crawl())
测试结果
查询语句:`db.movies.find()`
最后
- 由于博主水平有限,不免有疏漏之处,欢迎读者随时批评指正,以免造成不必要的误解!