电影评论与情感分析系统

 功能分析:

  1. 电影评论爬取

    • 输入豆瓣电影ID(如《肖申克的救赎》ID为1292052)

    • 爬取前3页评论(约60条)

  2. 情感分析

    • 使用TextBlob分析每条评论的情感

    • 分类为正面/中性/负面

    • 计算情感极性和主观性分数

  3. 数据存储

    • 使用SQLite数据库存储所有评论和分析结果

  4. 可视化展示

    • 环形图(doughnut)展示情感分布比例

    • 卡片形式展示每条评论及分析结果

  5. 电影管理

    • 左侧显示已爬取的电影列表

    • 点击电影名称查看对应评论和分析

import os
import re
import sqlite3
import time
from concurrent.futures import ThreadPoolExecutor
from contextlib import closing

import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template, request, jsonify
from textblob import TextBlob

# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)


def init_db(db_path='movie_reviews.db'):
    """Create the ``reviews`` table if it does not already exist.

    Calling this more than once is harmless because the statement uses
    ``CREATE TABLE IF NOT EXISTS``.

    Args:
        db_path: Path of the SQLite database file. Defaults to the
            ``movie_reviews.db`` file used by the rest of this module.
    """
    # closing() releases the connection even when the DDL raises; sqlite3's
    # own context manager only scopes the transaction and never closes.
    with closing(sqlite3.connect(db_path)) as conn:
        conn.execute('''CREATE TABLE IF NOT EXISTS reviews
                 (id INTEGER PRIMARY KEY AUTOINCREMENT,
                  movie_id TEXT,
                  movie_title TEXT,
                  username TEXT,
                  rating INTEGER,
                  content TEXT,
                  sentiment TEXT,
                  polarity REAL,
                  subjectivity REAL,
                  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
        conn.commit()


# Ensure the reviews table exists as soon as this module is imported.
init_db()


def crawl_douban_reviews(movie_id, max_pages=3, timeout=10):
    """Scrape Douban short comments for a movie, score them and store them.

    Fetches the movie's subject page to read its title, then downloads
    ``max_pages`` comment pages (20 comments requested per page) concurrently.
    Each comment is scored with TextBlob and all parsed comments are inserted
    into the ``reviews`` table of ``movie_reviews.db`` in one transaction.

    NOTE(review): TextBlob's default sentiment analyzer is presumably
    English-oriented, so Chinese comments may mostly score 0 ("中性") —
    confirm and consider a Chinese-capable analyzer.
    NOTE(review): the ``reviews`` table has no uniqueness constraint, so
    crawling the same movie twice stores its comments twice.

    Args:
        movie_id: Douban subject ID; it is interpolated into the request URLs.
        max_pages: Number of comment pages to fetch.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        A ``(reviews, movie_title)`` tuple. ``reviews`` is a list of dicts,
        one per stored comment, in page order.

    Raises:
        requests.RequestException: If the subject page cannot be fetched.
        ValueError: If the subject page contains no movie title.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Fetch the subject page first to learn the movie title.
    movie_url = f'https://movie.douban.com/subject/{movie_id}/'
    response = requests.get(movie_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    title_tag = soup.find('span', property='v:itemreviewed')
    if title_tag is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError('未找到电影标题,请检查电影ID是否正确')
    movie_title = title_tag.text

    def parse_page(page):
        """Download one comment page and return the reviews parsed from it."""
        url = f'https://movie.douban.com/subject/{movie_id}/comments?start={(page - 1) * 20}&limit=20&status=P&sort=new_score'
        try:
            page_response = requests.get(url, headers=headers, timeout=timeout)
            page_response.raise_for_status()
        except requests.RequestException as e:
            # Best effort: one failed page must not discard the other pages,
            # but the failure is reported instead of vanishing in the worker.
            print(f"获取评论页出错: {e}")
            return []

        page_soup = BeautifulSoup(page_response.text, 'html.parser')
        page_reviews = []
        for item in page_soup.find_all('div', class_='comment-item'):
            try:
                username = item.find('span', class_='comment-info').a.text
                rating_tag = item.find('span', class_=re.compile('^allstar'))
                if rating_tag:
                    # The rating is the digit right after the "allstar" prefix
                    # (presumably classes such as "allstar50"); look the class
                    # up by prefix rather than assuming it is listed first.
                    star_class = next(c for c in rating_tag['class']
                                      if c.startswith('allstar'))
                    rating = int(star_class[len('allstar')])
                else:
                    rating = 0
                content = item.find('span', class_='short').text.strip()
                scores = TextBlob(content).sentiment
            except Exception as e:
                # Deliberate best effort: skip comments that cannot be parsed.
                print(f"解析评论出错: {e}")
                continue

            if scores.polarity > 0:
                sentiment = "正面"
            elif scores.polarity < 0:
                sentiment = "负面"
            else:
                sentiment = "中性"

            page_reviews.append({
                'movie_id': movie_id,
                'movie_title': movie_title,
                'username': username,
                'rating': rating,
                'content': content,
                'sentiment': sentiment,
                'polarity': scores.polarity,
                'subjectivity': scores.subjectivity,
            })
        return page_reviews

    # Fetch the pages concurrently. Materialising the map() iterator yields
    # the pages in request order and re-raises unexpected worker exceptions.
    with ThreadPoolExecutor(max_workers=5) as executor:
        pages = list(executor.map(parse_page, range(1, max_pages + 1)))
    reviews = [review for page_reviews in pages for review in page_reviews]

    # Store everything from the calling thread in a single transaction rather
    # than opening one connection per comment inside the worker threads.
    if reviews:
        with closing(sqlite3.connect('movie_reviews.db')) as conn:
            conn.executemany(
                '''INSERT INTO reviews
                   (movie_id, movie_title, username, rating, content, sentiment, polarity, subjectivity)
                   VALUES (:movie_id, :movie_title, :username, :rating, :content, :sentiment, :polarity, :subjectivity)''',
                reviews)
            conn.commit()

    return reviews, movie_title


@app.route('/')
def index():
    """Render the home page with up to ten recently reviewed movies.

    Returns:
        The rendered ``index.html`` template. ``movies`` is a list of
        ``(movie_id, movie_title)`` tuples, most recently reviewed first.
    """
    # closing() releases the connection even if the query raises.
    with closing(sqlite3.connect('movie_reviews.db')) as conn:
        # One row per movie with an explicit sort key (the movie's latest
        # review), instead of ordering DISTINCT rows by a column that is not
        # part of the result.
        movies = conn.execute(
            "SELECT movie_id, movie_title FROM reviews "
            "GROUP BY movie_id, movie_title "
            "ORDER BY MAX(created_at) DESC LIMIT 10"
        ).fetchall()

    return render_template('index.html', movies=movies)


@app.route('/crawl', methods=['POST'])
def crawl():
    """Crawl and store the reviews of the movie named in the POSTed form.

    Expects a ``movie_id`` form field.

    Returns:
        On success, JSON with ``success``, ``count`` and ``movie_title``.
        Otherwise JSON ``{'error': ...}`` with status 400 (missing or
        non-numeric ID) or 500 (the crawl raised).
    """
    movie_id = request.form.get('movie_id', '').strip()
    if not movie_id:
        return jsonify({'error': '请输入电影ID'}), 400
    # The ID is interpolated into the outbound request URLs, so accept ASCII
    # digits only to keep user input from altering the requested path.
    if not re.fullmatch(r'[0-9]+', movie_id):
        return jsonify({'error': '电影ID必须为数字'}), 400

    try:
        reviews, movie_title = crawl_douban_reviews(movie_id)
    except Exception as e:
        # Top-level boundary of the endpoint: report any crawl failure as JSON.
        return jsonify({'error': str(e)}), 500

    return jsonify({
        'success': True,
        'count': len(reviews),
        'movie_title': movie_title
    })


@app.route('/reviews')
def get_reviews():
    """Return stored reviews as a JSON list, newest first.

    Query parameters:
        movie_id: Optional. When given, every review of that movie is
            returned; otherwise the 100 most recent reviews of any movie.

    Returns:
        A JSON array of objects whose keys are the ``reviews`` column names.
    """
    movie_id = request.args.get('movie_id', '')

    # Name the columns explicitly so the JSON keys cannot drift from the
    # values if the table layout changes.
    columns = ('id', 'movie_id', 'movie_title', 'username', 'rating',
               'content', 'sentiment', 'polarity', 'subjectivity', 'created_at')
    select = f"SELECT {', '.join(columns)} FROM reviews"
    # CURRENT_TIMESTAMP has one-second resolution, so rows inserted within the
    # same second tie on created_at; id makes the order deterministic.
    order = " ORDER BY created_at DESC, id"

    # closing() releases the connection even if the query raises.
    with closing(sqlite3.connect('movie_reviews.db')) as conn:
        if movie_id:
            rows = conn.execute(select + " WHERE movie_id = ?" + order,
                                (movie_id,)).fetchall()
        else:
            rows = conn.execute(select + order + " LIMIT 100").fetchall()

    return jsonify([dict(zip(columns, row)) for row in rows])


@app.route('/stats')
def get_stats():
    """Return per-sentiment review counts and percentages as JSON.

    Query parameters:
        movie_id: Optional. Restricts the statistics to one movie; when
            absent, all stored reviews are counted.

    Returns:
        A JSON object mapping each sentiment label present in the data to
        ``{'count': int, 'percentage': float}``; empty when there are no rows.
    """
    movie_id = request.args.get('movie_id', '')

    # closing() releases the connection even if the query raises.
    with closing(sqlite3.connect('movie_reviews.db')) as conn:
        if movie_id:
            rows = conn.execute(
                "SELECT sentiment, COUNT(*) FROM reviews WHERE movie_id = ? GROUP BY sentiment",
                (movie_id,)).fetchall()
        else:
            rows = conn.execute(
                "SELECT sentiment, COUNT(*) FROM reviews GROUP BY sentiment").fetchall()

    counts = dict(rows)
    total = sum(counts.values())

    # With no rows the comprehension is empty, so total == 0 never divides.
    stats = {
        sentiment: {
            'count': count,
            'percentage': round(count / total * 100, 1)
        }
        for sentiment, count in counts.items()
    }
    return jsonify(stats)


if __name__ == '__main__':
    port = int(os.environ.get('PORT', 5000))
    # The server binds to every interface, and Flask's debug mode enables an
    # interactive debugger that can execute code from the browser. Keep it
    # off by default; opt in with FLASK_DEBUG=1 for local development.
    debug = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')
    app.run(host='0.0.0.0', port=port, debug=debug)

前端代码:

<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>电影评论爬取与情感分析系统</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
    <!-- Chart.js 3.x needs no stylesheet; the former chart.min.css link pointed at a file that (to my knowledge) is not in the 3.7.1 package. -->
    <style>
        body {
            background-color: #f8f9fa;
            padding-top: 2rem;
        }
        .container {
            max-width: 1200px;
        }
        .card {
            margin-bottom: 1.5rem;
            box-shadow: 0 0.5rem 1rem rgba(0, 0, 0, 0.1);
        }
        .review-card {
            transition: transform 0.2s;
        }
        .review-card:hover {
            transform: translateY(-5px);
        }
        .positive {
            border-left: 5px solid #28a745;
        }
        .negative {
            border-left: 5px solid #dc3545;
        }
        .neutral {
            border-left: 5px solid #ffc107;
        }
        .loading {
            display: none;
            text-align: center;
            padding: 2rem;
        }
        .rating {
            color: #ffc107;
            font-weight: bold;
        }
        .sentiment-badge {
            font-size: 0.8rem;
            padding: 0.3rem 0.6rem;
        }
        #chart-container {
            height: 300px;
            margin-bottom: 2rem;
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="text-center mb-5">
            <h1 class="display-4">🎬 电影评论爬取与情感分析系统</h1>
            <p class="lead text-muted">爬取豆瓣电影评论并进行情感分析</p>
        </div>

        <div class="card mb-4">
            <div class="card-header bg-primary text-white">
                <h5 class="mb-0">电影评论爬取</h5>
            </div>
            <div class="card-body">
                <form id="crawlForm">
                    <div class="row g-3">
                        <div class="col-md-8">
                            <label for="movie_id" class="form-label">豆瓣电影ID</label>
                            <input type="text" class="form-control" id="movie_id"
                                  placeholder="例如:1292052 (肖申克的救赎)" required>
                            <div class="form-text">从豆瓣电影URL中获取ID,如: https://movie.douban.com/subject/1292052/</div>
                        </div>
                        <div class="col-md-4 d-flex align-items-end">
                            <button type="submit" class="btn btn-primary w-100">爬取评论</button>
                        </div>
                    </div>
                </form>

                <div id="crawlResult" class="mt-3 alert alert-success" style="display:none;"></div>
                <div id="crawlError" class="mt-3 alert alert-danger" style="display:none;"></div>
            </div>
        </div>

        <div class="row">
            <div class="col-md-4">
                <div class="card">
                    <div class="card-header bg-info text-white">
                        <h5 class="mb-0">电影列表</h5>
                    </div>
                    <div class="card-body">
                        <div class="list-group" id="movieList">
                            <!-- 电影列表将通过JS动态加载 -->
                        </div>
                    </div>
                </div>
            </div>

            <div class="col-md-8">
                <div class="card">
                    <div class="card-header bg-success text-white">
                        <h5 class="mb-0">情感分析结果</h5>
                    </div>
                    <div class="card-body">
                        <div id="chart-container">
                            <canvas id="sentimentChart"></canvas>
                        </div>

                        <div class="d-flex justify-content-between mb-3">
                            <h5 id="currentMovieTitle">选择电影查看评论</h5>
                            <div>
                                <span class="badge bg-success sentiment-badge">正面</span>
                                <span class="badge bg-warning text-dark sentiment-badge">中性</span>
                                <span class="badge bg-danger sentiment-badge">负面</span>
                            </div>
                        </div>

                        <div id="loadingReviews" class="loading">
                            <div class="spinner-border text-primary" role="status">
                                <span class="visually-hidden">加载中...</span>
                            </div>
                            <p class="mt-2">正在加载评论...</p>
                        </div>

                        <div id="reviewsContainer" class="row row-cols-1 g-3">
                            <!-- 评论将通过JS动态加载 -->
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>

    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/chart.js@3.7.1/dist/chart.min.js"></script>
    <script>
        // Global state shared by the functions below.
        let sentimentChart;       // Chart instance currently drawn, if any
        let currentMovieId = '';  // ID of the movie whose reviews are shown

        // Page initialisation: load the movie list and wire up the crawl form.
        document.addEventListener('DOMContentLoaded', () => {
            loadMovieList();

            const crawlForm = document.getElementById('crawlForm');
            crawlForm.addEventListener('submit', (event) => {
                // Handle the submission with fetch instead of a page reload.
                event.preventDefault();

                const movieId = document.getElementById('movie_id').value.trim();
                if (movieId) {
                    crawlReviews(movieId);
                }
            });
        });

        /**
         * Rebuild the movie list from the reviews returned by `/reviews`.
         *
         * One button is created per distinct movie in the response; clicking
         * it loads that movie's reviews. If no movie is selected yet, the
         * first listed movie is loaded automatically.
         */
        function loadMovieList() {
            // NOTE(review): the /reviews handler does not read `limit`; without
            // a movie_id it returns the 100 most recent reviews, so only movies
            // present in those rows can appear in this list.
            fetch('/reviews?limit=10')
                .then(response => response.json())
                .then(data => {
                    const movieList = document.getElementById('movieList');
                    movieList.innerHTML = '';

                    // Collect distinct movies. A Map keeps insertion order; a
                    // plain object would iterate integer-like keys (numeric
                    // movie IDs) in ascending numeric order instead, so the
                    // "first" movie would be the lowest ID, not the newest.
                    const uniqueMovies = new Map();
                    data.forEach(review => {
                        if (!uniqueMovies.has(review.movie_id)) {
                            uniqueMovies.set(review.movie_id, review.movie_title);
                        }
                    });

                    // Render one button per movie.
                    for (const [id, title] of uniqueMovies) {
                        const item = document.createElement('button');
                        item.type = 'button';
                        item.className = 'list-group-item list-group-item-action';
                        item.textContent = title;
                        item.onclick = () => loadMovieReviews(id, title);
                        movieList.appendChild(item);
                    }

                    // Auto-select only when nothing is selected yet; otherwise a
                    // list refresh (e.g. right after a crawl) would race with
                    // and possibly override the movie the user is viewing.
                    if (!currentMovieId && uniqueMovies.size > 0) {
                        const [firstId, firstTitle] = uniqueMovies.entries().next().value;
                        loadMovieReviews(firstId, firstTitle);
                    }
                })
                .catch(error => {
                    // Surface the failure instead of leaving an unhandled rejection.
                    console.error('加载电影列表失败:', error);
                });
        }

        /**
         * Ask the backend to crawl a movie's reviews, then refresh the page.
         *
         * Shows the outcome in #crawlResult or #crawlError. On success the
         * movie list is reloaded and the crawled movie's reviews are shown.
         *
         * @param {string} movieId Douban movie ID entered by the user.
         */
        function crawlReviews(movieId) {
            const formData = new FormData();
            formData.append('movie_id', movieId);

            const resultBox = document.getElementById('crawlResult');
            const errorBox = document.getElementById('crawlError');
            resultBox.style.display = 'none';
            errorBox.style.display = 'none';

            fetch('/crawl', {
                method: 'POST',
                body: formData
            })
            .then(response => response.json())
            .then(data => {
                if (data.error) {
                    errorBox.textContent = data.error;
                    errorBox.style.display = 'block';
                    return;
                }

                // The title comes from a scraped page, so insert it as text
                // (never as HTML) to prevent markup injection.
                resultBox.textContent = `成功爬取 ${data.count} 条评论 - `;
                const titleEl = document.createElement('strong');
                titleEl.textContent = data.movie_title;
                resultBox.appendChild(titleEl);
                resultBox.style.display = 'block';

                // Reload the movie list and show the crawled movie's reviews.
                loadMovieList();
                loadMovieReviews(movieId, data.movie_title);
            })
            .catch(error => {
                errorBox.textContent = '爬取失败: ' + error;
                errorBox.style.display = 'block';
            });
        }

        /**
         * Load and display the reviews and sentiment statistics of one movie.
         *
         * @param {string} movieId Movie ID to query `/reviews` and `/stats` with.
         * @param {string} movieTitle Title shown above the review list.
         */
        function loadMovieReviews(movieId, movieTitle) {
            // Review text and usernames are scraped from a third-party site, so
            // escape them before placing them into an HTML template.
            const escapeHtml = (value) => String(value).replace(/[&<>"']/g, ch => ({
                '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'
            }[ch]));

            currentMovieId = movieId;
            document.getElementById('currentMovieTitle').textContent = movieTitle;
            document.getElementById('reviewsContainer').innerHTML = '';
            document.getElementById('loadingReviews').style.display = 'block';

            const query = `movie_id=${encodeURIComponent(movieId)}`;

            // Fetch reviews and statistics in parallel.
            Promise.all([
                fetch(`/reviews?${query}`).then(res => res.json()),
                fetch(`/stats?${query}`).then(res => res.json())
            ])
            .then(([reviews, stats]) => {
                // Another movie was selected while this request was in flight;
                // drop the stale response so it cannot overwrite the newer one.
                if (movieId !== currentMovieId) return;

                document.getElementById('loadingReviews').style.display = 'none';

                // Refresh the chart.
                updateChart(stats);

                // Render the reviews. Clear again first so two overlapping
                // loads of the same movie do not append every review twice.
                const container = document.getElementById('reviewsContainer');
                container.innerHTML = '';
                reviews.forEach(review => {
                    const sentimentClass = review.sentiment === '正面' ? 'positive' :
                                        review.sentiment === '负面' ? 'negative' : 'neutral';
                    const sentimentBadge = review.sentiment === '正面' ? 'bg-success' :
                                         review.sentiment === '负面' ? 'bg-danger' : 'bg-warning text-dark';

                    const col = document.createElement('div');
                    col.className = 'col';
                    col.innerHTML = `
                        <div class="card review-card ${sentimentClass}">
                            <div class="card-body">
                                <div class="d-flex justify-content-between mb-2">
                                    <h6 class="card-title mb-0">${escapeHtml(review.username)}</h6>
                                    <div>
                                        ${review.rating ? `<span class="rating">★ ${Number(review.rating)}</span>` : ''}
                                        <span class="badge ${sentimentBadge} sentiment-badge">${escapeHtml(review.sentiment)}</span>
                                    </div>
                                </div>
                                <p class="card-text">${escapeHtml(review.content)}</p>
                                <small class="text-muted">
                                    极性: ${Number(review.polarity).toFixed(2)} |
                                    主观性: ${Number(review.subjectivity).toFixed(2)}
                                </small>
                            </div>
                        </div>
                    `;
                    container.appendChild(col);
                });
            })
            .catch(error => {
                // A failure of a superseded request is irrelevant to the user.
                if (movieId !== currentMovieId) return;

                document.getElementById('loadingReviews').style.display = 'none';
                alert('加载评论失败: ' + error);
            });
        }

        /**
         * Redraw the sentiment doughnut chart.
         *
         * @param {Object} stats Response of `/stats`: maps a sentiment label
         *     ('正面', '中性', '负面') to an object with a `count` field;
         *     labels without reviews may be absent and are drawn as 0.
         */
        function updateChart(stats) {
            const ctx = document.getElementById('sentimentChart').getContext('2d');

            // Destroy the previous chart before drawing on the same canvas.
            if (sentimentChart) {
                sentimentChart.destroy();
            }

            // Prepare the data in a fixed label order.
            const labels = ['正面', '中性', '负面'];
            const data = labels.map(label => stats[label]?.count || 0);
            const backgroundColors = [
                'rgba(40, 167, 69, 0.7)',
                'rgba(255, 193, 7, 0.7)',
                'rgba(220, 53, 69, 0.7)'
            ];

            // Create the new chart.
            sentimentChart = new Chart(ctx, {
                type: 'doughnut',
                data: {
                    labels: labels,
                    datasets: [{
                        data: data,
                        backgroundColor: backgroundColors,
                        borderWidth: 1
                    }]
                },
                options: {
                    responsive: true,
                    // Fill the fixed-height #chart-container instead of deriving
                    // the canvas height from its width, which would overflow it.
                    maintainAspectRatio: false,
                    plugins: {
                        legend: {
                            position: 'bottom',
                        },
                        tooltip: {
                            callbacks: {
                                label: function(context) {
                                    const label = context.label || '';
                                    const value = context.raw || 0;
                                    const total = context.dataset.data.reduce((a, b) => a + b, 0);
                                    // Guard the division so an all-zero dataset shows 0% rather than NaN%.
                                    const percentage = total ? Math.round((value / total) * 100) : 0;
                                    return `${label}: ${value} (${percentage}%)`;
                                }
                            }
                        }
                    }
                }
            });
        }
    </script>
</body>
</html>

运行截图:

后端

前端

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值