功能分析:
- 电影评论爬取:
  - 输入豆瓣电影ID(如《肖申克的救赎》ID为1292052)
  - 爬取前3页评论(约60条)
- 情感分析:
  - 使用TextBlob分析每条评论的情感
  - 分类为正面/中性/负面
  - 计算情感极性和主观性分数
- 数据存储:
  - 使用SQLite数据库存储所有评论和分析结果
- 可视化展示:
  - 饼图(环形图)展示情感分布比例
  - 卡片形式展示每条评论及分析结果
- 电影管理:
  - 左侧显示已爬取的电影列表
  - 点击电影名称查看对应评论和分析
import os
import re
import sqlite3
import time
from concurrent.futures import ThreadPoolExecutor
from contextlib import closing

import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template, request, jsonify
from textblob import TextBlob
# Flask application object; the @app.route decorators below register handlers on it.
app = Flask(__name__)
# Database initialisation
def init_db(db_path='movie_reviews.db'):
    """Create the ``reviews`` table if it does not exist yet.

    The statement is ``CREATE TABLE IF NOT EXISTS``, so calling this function
    repeatedly is harmless.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            ``movie_reviews.db``, the file the other functions in this
            module open.

    Raises:
        sqlite3.Error: If the database cannot be opened or the DDL fails.
    """
    # closing() releases the connection even when the DDL raises; sqlite3's
    # own context manager only commits/rolls back and never closes.
    with closing(sqlite3.connect(db_path)) as conn:
        conn.execute('''CREATE TABLE IF NOT EXISTS reviews
                     (id INTEGER PRIMARY KEY AUTOINCREMENT,
                      movie_id TEXT,
                      movie_title TEXT,
                      username TEXT,
                      rating INTEGER,
                      content TEXT,
                      sentiment TEXT,
                      polarity REAL,
                      subjectivity REAL,
                      created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
        conn.commit()
# Runs at import time so the reviews table exists before any request is served.
init_db()
# Crawl Douban movie short comments
def crawl_douban_reviews(movie_id, max_pages=3):
    """Fetch short comments for a Douban movie, score them and store them.

    The movie page is fetched first to read the title, then ``max_pages``
    comment pages (20 comments each) are downloaded concurrently. Every
    comment is scored with TextBlob and all parsed comments are inserted
    into the ``reviews`` table of ``movie_reviews.db`` in one batch.

    Args:
        movie_id: Douban subject ID, interpolated into the request URLs.
        max_pages: Number of comment pages to download (default 3).

    Returns:
        tuple: ``(reviews, movie_title)``. ``reviews`` is a list of dicts
        with the keys ``movie_id``, ``movie_title``, ``username``,
        ``rating``, ``content``, ``sentiment``, ``polarity`` and
        ``subjectivity``, in page order.

    Raises:
        requests.RequestException: If the movie page cannot be fetched.
        ValueError: If the movie page contains no title element.
        sqlite3.Error: If storing the parsed comments fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the request forever.
    timeout = 10

    # Fetch the movie title first.
    movie_url = f'https://movie.douban.com/subject/{movie_id}/'
    response = requests.get(movie_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    title_tag = soup.find('span', property='v:itemreviewed')
    if title_tag is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f'无法获取电影标题,请检查电影ID: {movie_id}')
    movie_title = title_tag.text

    def parse_page(page):
        """Download one comment page and return the reviews parsed from it."""
        url = f'https://movie.douban.com/subject/{movie_id}/comments?start={(page - 1) * 20}&limit=20&status=P&sort=new_score'
        try:
            page_response = requests.get(url, headers=headers, timeout=timeout)
            page_response.raise_for_status()
        except requests.RequestException as e:
            # Best effort: one failed page must not discard the other pages.
            print(f"爬取第{page}页出错: {e}")
            return []

        page_soup = BeautifulSoup(page_response.text, 'html.parser')
        page_reviews = []
        for item in page_soup.find_all('div', class_='comment-item'):
            try:
                username = item.find('span', class_='comment-info').a.text
                rating_tag = item.find('span', class_=re.compile('^allstar'))
                # The character right after the 7-letter "allstar" prefix is
                # the star count (e.g. "allstar50" -> 5); 0 means unrated.
                rating = int(rating_tag['class'][0][7]) if rating_tag else 0
                content = item.find('span', class_='short').text.strip()
                # NOTE(review): TextBlob's default analyzer appears to be
                # English-oriented; Chinese comments may mostly score 0.0
                # ("中性") -- confirm before relying on these numbers.
                scores = TextBlob(content).sentiment
            except Exception as e:
                # Skip malformed comment items but keep the rest of the page.
                print(f"解析评论出错: {e}")
                continue

            polarity = scores.polarity
            if polarity > 0:
                sentiment = "正面"
            elif polarity < 0:
                sentiment = "负面"
            else:
                sentiment = "中性"

            page_reviews.append({
                'movie_id': movie_id,
                'movie_title': movie_title,
                'username': username,
                'rating': rating,
                'content': content,
                'sentiment': sentiment,
                'polarity': polarity,
                'subjectivity': scores.subjectivity,
            })
        return page_reviews

    # Download pages concurrently. Materialising the map() iterator is what
    # collects the worker results (and surfaces unexpected worker errors);
    # results come back in page order.
    with ThreadPoolExecutor(max_workers=5) as executor:
        pages = list(executor.map(parse_page, range(1, max_pages + 1)))
    reviews = [review for page_reviews in pages for review in page_reviews]

    # Store everything through one connection on the calling thread rather
    # than opening a connection per comment from several worker threads.
    if reviews:
        with closing(sqlite3.connect('movie_reviews.db')) as conn:
            conn.executemany(
                '''INSERT INTO reviews
                   (movie_id, movie_title, username, rating, content, sentiment, polarity, subjectivity)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?)''',
                [
                    (r['movie_id'], r['movie_title'], r['username'], r['rating'],
                     r['content'], r['sentiment'], r['polarity'], r['subjectivity'])
                    for r in reviews
                ],
            )
            conn.commit()

    return reviews, movie_title
# Home page route
@app.route('/')
def index():
    """Render ``index.html`` with up to ten of the most recently crawled movies.

    Returns:
        The rendered template; ``movies`` is passed to it as a list of
        ``(movie_id, movie_title)`` tuples, newest crawl first.
    """
    # GROUP BY + MAX(created_at) gives one row per movie with a well-defined
    # ordering; ordering a DISTINCT projection by a column that is not
    # selected leaves the order of the de-duplicated rows unspecified.
    with closing(sqlite3.connect('movie_reviews.db')) as conn:
        movies = conn.execute(
            "SELECT movie_id, movie_title FROM reviews "
            "GROUP BY movie_id, movie_title "
            "ORDER BY MAX(created_at) DESC LIMIT 10"
        ).fetchall()
    return render_template('index.html', movies=movies)
# Crawl endpoint
@app.route('/crawl', methods=['POST'])
def crawl():
    """Crawl the comments of the movie named by the ``movie_id`` form field.

    Returns:
        JSON ``{'success': True, 'count': ..., 'movie_title': ...}`` on
        success; JSON ``{'error': ...}`` with status 400 when the ID is
        missing or not numeric, or status 500 when crawling raises.
    """
    movie_id = request.form.get('movie_id', '').strip()
    if not movie_id:
        return jsonify({'error': '请输入电影ID'}), 400
    # The ID is interpolated into the outbound request URL, so reject
    # anything that is not plain ASCII digits before it gets that far.
    if not re.fullmatch(r'[0-9]+', movie_id):
        return jsonify({'error': '电影ID必须为数字'}), 400

    try:
        reviews, movie_title = crawl_douban_reviews(movie_id)
    except Exception as e:
        # Top-level boundary: report the failure to the client as JSON.
        return jsonify({'error': str(e)}), 500

    return jsonify({
        'success': True,
        'count': len(reviews),
        'movie_title': movie_title
    })
# Review data endpoint
@app.route('/reviews')
def get_reviews():
    """Return stored reviews as a JSON list, newest first.

    With a ``movie_id`` query parameter, all reviews of that movie are
    returned; without it, the 100 most recent reviews across all movies.
    Other query parameters are ignored.

    Returns:
        JSON list of objects with the keys ``id``, ``movie_id``,
        ``movie_title``, ``username``, ``rating``, ``content``,
        ``sentiment``, ``polarity``, ``subjectivity`` and ``created_at``.
    """
    movie_id = request.args.get('movie_id', '')
    # Naming the columns explicitly keeps the JSON keys correct even if the
    # table later gains or reorders columns (SELECT * plus row[N] would not).
    columns = ('id', 'movie_id', 'movie_title', 'username', 'rating',
               'content', 'sentiment', 'polarity', 'subjectivity', 'created_at')

    with closing(sqlite3.connect('movie_reviews.db')) as conn:
        if movie_id:
            rows = conn.execute(
                "SELECT id, movie_id, movie_title, username, rating, content, "
                "sentiment, polarity, subjectivity, created_at "
                "FROM reviews WHERE movie_id = ? ORDER BY created_at DESC",
                (movie_id,),
            ).fetchall()
        else:
            rows = conn.execute(
                "SELECT id, movie_id, movie_title, username, rating, content, "
                "sentiment, polarity, subjectivity, created_at "
                "FROM reviews ORDER BY created_at DESC LIMIT 100"
            ).fetchall()

    return jsonify([dict(zip(columns, row)) for row in rows])
# Sentiment statistics endpoint
@app.route('/stats')
def get_stats():
    """Return per-sentiment review counts and percentages as JSON.

    With a ``movie_id`` query parameter the statistics cover that movie
    only; otherwise they cover every stored review.

    Returns:
        JSON object mapping each sentiment label present in the data to
        ``{'count': int, 'percentage': float}`` (percentage rounded to one
        decimal place). Empty object when there are no matching reviews.
    """
    movie_id = request.args.get('movie_id', '')

    with closing(sqlite3.connect('movie_reviews.db')) as conn:
        if movie_id:
            rows = conn.execute(
                "SELECT sentiment, COUNT(*) FROM reviews WHERE movie_id = ? GROUP BY sentiment",
                (movie_id,),
            ).fetchall()
        else:
            rows = conn.execute(
                "SELECT sentiment, COUNT(*) FROM reviews GROUP BY sentiment"
            ).fetchall()

    # total is 0 only when rows is empty, in which case the comprehension
    # below never evaluates the division.
    total = sum(count for _, count in rows)
    stats = {
        sentiment: {
            'count': count,
            'percentage': round(count / total * 100, 1)
        }
        for sentiment, count in rows
    }
    return jsonify(stats)
if __name__ == '__main__':
    # Listening port comes from the PORT environment variable (default 5000).
    port = int(os.environ.get('PORT', 5000))
    # Debug mode turns on the interactive debugger, which lets anyone who can
    # reach the server execute code. Because the app binds to all interfaces,
    # it is now opt-in via FLASK_DEBUG instead of always on.
    debug = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes')
    app.run(host='0.0.0.0', port=port, debug=debug)
前端代码:
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>电影评论爬取与情感分析系统</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<!-- Chart.js 3.x ships no stylesheet, so no CSS link is needed for it; only the chart.min.js script tag further down is required. -->
<style>
body {
background-color: #f8f9fa;
padding-top: 2rem;
}
.container {
max-width: 1200px;
}
.card {
margin-bottom: 1.5rem;
box-shadow: 0 0.5rem 1rem rgba(0, 0, 0, 0.1);
}
.review-card {
transition: transform 0.2s;
}
.review-card:hover {
transform: translateY(-5px);
}
.positive {
border-left: 5px solid #28a745;
}
.negative {
border-left: 5px solid #dc3545;
}
.neutral {
border-left: 5px solid #ffc107;
}
.loading {
display: none;
text-align: center;
padding: 2rem;
}
.rating {
color: #ffc107;
font-weight: bold;
}
.sentiment-badge {
font-size: 0.8rem;
padding: 0.3rem 0.6rem;
}
#chart-container {
height: 300px;
margin-bottom: 2rem;
}
</style>
</head>
<body>
<div class="container">
<div class="text-center mb-5">
<h1 class="display-4">🎬 电影评论爬取与情感分析系统</h1>
<p class="lead text-muted">爬取豆瓣电影评论并进行情感分析</p>
</div>
<div class="card mb-4">
<div class="card-header bg-primary text-white">
<h5 class="mb-0">电影评论爬取</h5>
</div>
<div class="card-body">
<form id="crawlForm">
<div class="row g-3">
<div class="col-md-8">
<label for="movie_id" class="form-label">豆瓣电影ID</label>
<input type="text" class="form-control" id="movie_id"
placeholder="例如:1292052 (肖申克的救赎)" required>
<div class="form-text">从豆瓣电影URL中获取ID,如: https://movie.douban.com/subject/1292052/</div>
</div>
<div class="col-md-4 d-flex align-items-end">
<button type="submit" class="btn btn-primary w-100">爬取评论</button>
</div>
</div>
</form>
<div id="crawlResult" class="mt-3 alert alert-success" style="display:none;"></div>
<div id="crawlError" class="mt-3 alert alert-danger" style="display:none;"></div>
</div>
</div>
<div class="row">
<div class="col-md-4">
<div class="card">
<div class="card-header bg-info text-white">
<h5 class="mb-0">电影列表</h5>
</div>
<div class="card-body">
<div class="list-group" id="movieList">
<!-- 电影列表将通过JS动态加载 -->
</div>
</div>
</div>
</div>
<div class="col-md-8">
<div class="card">
<div class="card-header bg-success text-white">
<h5 class="mb-0">情感分析结果</h5>
</div>
<div class="card-body">
<div id="chart-container">
<canvas id="sentimentChart"></canvas>
</div>
<div class="d-flex justify-content-between mb-3">
<h5 id="currentMovieTitle">选择电影查看评论</h5>
<div>
<span class="badge bg-success sentiment-badge">正面</span>
<span class="badge bg-warning text-dark sentiment-badge">中性</span>
<span class="badge bg-danger sentiment-badge">负面</span>
</div>
</div>
<div id="loadingReviews" class="loading">
<div class="spinner-border text-primary" role="status">
<span class="visually-hidden">加载中...</span>
</div>
<p class="mt-2">正在加载评论...</p>
</div>
<div id="reviewsContainer" class="row row-cols-1 g-3">
<!-- 评论将通过JS动态加载 -->
</div>
</div>
</div>
</div>
</div>
</div>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/chart.js@3.7.1/dist/chart.min.js"></script>
<script>
// 全局变量
let sentimentChart;
let currentMovieId = '';
// Page bootstrap: populate the movie list and wire up the crawl form.
document.addEventListener('DOMContentLoaded', () => {
  loadMovieList();

  // Handle the form submission in JS instead of letting the browser post it.
  const crawlForm = document.getElementById('crawlForm');
  crawlForm.addEventListener('submit', (event) => {
    event.preventDefault();
    const movieId = document.getElementById('movie_id').value.trim();
    if (movieId) {
      crawlReviews(movieId);
    }
  });
});
// Load the movie list shown in the left-hand column.
// The list is derived from the reviews returned by /reviews (newest first),
// keeping the first occurrence of every movie_id.
function loadMovieList() {
  fetch('/reviews')
    .then(response => {
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
      }
      return response.json();
    })
    .then(data => {
      const movieList = document.getElementById('movieList');
      movieList.innerHTML = '';

      // A Map keeps insertion order. A plain object would iterate
      // integer-like keys (the numeric movie IDs) in ascending numeric
      // order, so "first" would be the smallest ID, not the newest movie.
      const uniqueMovies = new Map();
      data.forEach(review => {
        if (!uniqueMovies.has(review.movie_id)) {
          uniqueMovies.set(review.movie_id, review.movie_title);
        }
      });

      // Render one clickable entry per movie.
      uniqueMovies.forEach((title, id) => {
        const item = document.createElement('button');
        item.type = 'button';
        item.className = 'list-group-item list-group-item-action';
        item.textContent = title;
        item.onclick = () => loadMovieReviews(id, title);
        movieList.appendChild(item);
      });

      // Auto-select the first movie only when nothing is selected yet, so a
      // list refresh does not replace the movie the user is looking at.
      if (!currentMovieId && uniqueMovies.size > 0) {
        const [firstId, firstTitle] = uniqueMovies.entries().next().value;
        loadMovieReviews(firstId, firstTitle);
      }
    })
    .catch(error => {
      console.error('Failed to load movie list:', error);
    });
}
// Ask the backend to crawl a movie's comments, then refresh the page data.
// movieId: the Douban movie ID entered in the form.
function crawlReviews(movieId) {
  const formData = new FormData();
  formData.append('movie_id', movieId);

  const resultBox = document.getElementById('crawlResult');
  const errorBox = document.getElementById('crawlError');
  resultBox.style.display = 'none';
  errorBox.style.display = 'none';

  fetch('/crawl', {
    method: 'POST',
    body: formData
  })
    .then(response => response.json())
    .then(data => {
      if (data.error) {
        errorBox.textContent = data.error;
        errorBox.style.display = 'block';
        return;
      }

      // The movie title is scraped from a third-party page, so it is
      // inserted as text (textContent), never parsed as HTML.
      resultBox.textContent = `成功爬取 ${data.count} 条评论 - `;
      const titleEl = document.createElement('strong');
      titleEl.textContent = data.movie_title;
      resultBox.appendChild(titleEl);
      resultBox.style.display = 'block';

      // Reload the movie list and show the freshly crawled movie.
      loadMovieList();
      loadMovieReviews(movieId, data.movie_title);
    })
    .catch(error => {
      errorBox.textContent = '爬取失败: ' + error;
      errorBox.style.display = 'block';
    });
}
// Load and render the reviews and sentiment statistics of one movie.
// movieId: movie ID used to query /reviews and /stats.
// movieTitle: title shown above the review cards.
function loadMovieReviews(movieId, movieTitle) {
  currentMovieId = movieId;
  document.getElementById('currentMovieTitle').textContent = movieTitle;
  document.getElementById('reviewsContainer').innerHTML = '';
  document.getElementById('loadingReviews').style.display = 'block';

  // Encode the ID so it cannot alter the query string.
  const query = `movie_id=${encodeURIComponent(movieId)}`;

  Promise.all([
    fetch(`/reviews?${query}`).then(res => res.json()),
    fetch(`/stats?${query}`).then(res => res.json())
  ])
    .then(([reviews, stats]) => {
      // Another movie was selected while this request was in flight;
      // drop the stale response instead of rendering it over the new one.
      if (currentMovieId !== movieId) return;

      document.getElementById('loadingReviews').style.display = 'none';

      // Refresh the chart.
      updateChart(stats);

      // Render one card per review.
      const container = document.getElementById('reviewsContainer');
      reviews.forEach(review => {
        const sentimentClass = review.sentiment === '正面' ? 'positive' :
          review.sentiment === '负面' ? 'negative' : 'neutral';
        const sentimentBadge = review.sentiment === '正面' ? 'bg-success' :
          review.sentiment === '负面' ? 'bg-danger' : 'bg-warning text-dark';

        const col = document.createElement('div');
        col.className = 'col';
        // Only fixed, locally chosen class names go through innerHTML.
        // Username and content are scraped user input and are filled in
        // below with textContent so they can never be parsed as markup.
        col.innerHTML = `
          <div class="card review-card ${sentimentClass}">
            <div class="card-body">
              <div class="d-flex justify-content-between mb-2">
                <h6 class="card-title mb-0"></h6>
                <div>
                  <span class="rating"></span>
                  <span class="badge ${sentimentBadge} sentiment-badge"></span>
                </div>
              </div>
              <p class="card-text"></p>
              <small class="text-muted"></small>
            </div>
          </div>
        `;

        col.querySelector('.card-title').textContent = review.username;
        const ratingEl = col.querySelector('.rating');
        if (review.rating) {
          ratingEl.textContent = `★ ${review.rating}`;
        } else {
          // A rating of 0 means the reviewer gave no stars; show nothing.
          ratingEl.remove();
        }
        col.querySelector('.sentiment-badge').textContent = review.sentiment;
        col.querySelector('.card-text').textContent = review.content;
        col.querySelector('small').textContent =
          `极性: ${review.polarity.toFixed(2)} | 主观性: ${review.subjectivity.toFixed(2)}`;

        container.appendChild(col);
      });
    })
    .catch(error => {
      if (currentMovieId !== movieId) return;
      document.getElementById('loadingReviews').style.display = 'none';
      alert('加载评论失败: ' + error);
    });
}
// Redraw the sentiment doughnut chart from a /stats response.
// stats: object keyed by sentiment label, each value carrying a `count`.
function updateChart(stats) {
  const canvasContext = document.getElementById('sentimentChart').getContext('2d');

  // Only one chart may own the canvas, so drop the previous instance first.
  if (sentimentChart) {
    sentimentChart.destroy();
  }

  // Fixed label order; a label missing from stats counts as 0.
  const sentimentLabels = ['正面', '中性', '负面'];
  const counts = sentimentLabels.map(name => stats[name]?.count || 0);
  const sliceColors = [
    'rgba(40, 167, 69, 0.7)',
    'rgba(255, 193, 7, 0.7)',
    'rgba(220, 53, 69, 0.7)'
  ];

  // Tooltip text: "<label>: <count> (<share of all reviews>%)".
  const formatTooltip = (context) => {
    const label = context.label || '';
    const value = context.raw || 0;
    const total = context.dataset.data.reduce((sum, n) => sum + n, 0);
    const percentage = Math.round((value / total) * 100);
    return `${label}: ${value} (${percentage}%)`;
  };

  const chartData = {
    labels: sentimentLabels,
    datasets: [{
      data: counts,
      backgroundColor: sliceColors,
      borderWidth: 1
    }]
  };

  const chartOptions = {
    responsive: true,
    plugins: {
      legend: {
        position: 'bottom',
      },
      tooltip: {
        callbacks: {
          label: formatTooltip
        }
      }
    }
  };

  sentimentChart = new Chart(canvasContext, {
    type: 'doughnut',
    data: chartData,
    options: chartOptions
  });
}
</script>
</body>
</html>
运行截图:
后端
前端