Python爬虫数据清洗与可视化实战（附完整代码）

最新推荐文章于 2025-04-06 18:26:01 发布

不辉放弃

最新推荐文章于 2025-04-06 18:26:01 发布

阅读量861

点赞数 4

分类专栏：爬虫　文章标签：python、爬虫、开发语言

本文链接：https://blog.csdn.net/2301_76971522/article/details/146292426

版权

爬虫专栏收录该内容

1 篇文章

订阅专栏

一、数据清洗四部曲（案例：京东商品评论）

1. 原始数据示例

# Sample scraped reviews, deliberately containing dirty records.
raw_data = [
    # Impossible calendar date (February 30th)
    dict(comment="手机很好用！ 快递很快 ", rating="5星", date="2023-02-30"),
    # Missing comment and a non-numeric rating
    dict(comment=None, rating="五星好评", date="2023-08-01"),
    dict(comment="屏幕有划痕", rating="1", date="2023-08-02"),
]

2. 结构化处理

import pandas as pd
from datetime import datetime

def clean_rating(rating):
    """Normalise a raw rating value to a float score.

    Accepts numbers, numeric strings, and strings decorated with the
    suffixes ``星`` / ``好评`` (for example ``"5星"`` or ``"五星好评"``).
    A single Chinese numeral left after stripping the suffixes is
    translated to its numeric value, which plain ``float()`` cannot do.

    Args:
        rating: Raw rating as scraped; a str, int, float or None.

    Returns:
        The rating as a float. ``None`` yields NaN so that missing ratings
        can be dropped later instead of aborting the whole pipeline.

    Raises:
        ValueError: If the remaining text is neither a number nor a known
            Chinese numeral.
    """
    chinese_digits = {
        '零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4, '五': 5,
        '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
    }

    if rating is None:
        return float('nan')
    if isinstance(rating, (int, float)):
        return float(rating)

    text = str(rating).replace('星', '').replace('好评', '').strip()
    if text in chinese_digits:
        return float(chinese_digits[text])
    return float(text)

# Build the frame and normalise the rating column in one expression.
df = pd.DataFrame(raw_data).assign(
    rating=lambda frame: frame['rating'].apply(clean_rating)
)

3. 异常值过滤

# Handle invalid dates.
# A length check cannot reject impossible calendar dates such as 2023-02-30,
# so parse the column instead: anything unparseable becomes NaT and is dropped.
# Parsing also gives the column a datetime dtype, which the `.dt` accessor needs.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='coerce')
df = df.dropna(subset=['date'])

# Drop rows whose comment is missing
df = df.dropna(subset=['comment'])

4. 文本清洗（正则实战）

import re

def clean_text(text):
    """Return ``text`` with whitespace and special symbols removed.

    Two passes are applied: every run of whitespace is deleted first, then
    every character that is neither a word character nor in the
    U+4E00-U+9FFF range is deleted.
    """
    whitespace_runs = re.compile(r'\s+')
    unwanted_chars = re.compile(r'[^\w\u4e00-\u9fff]')

    without_spaces = whitespace_runs.sub('', text)
    without_symbols = unwanted_chars.sub('', without_spaces)
    return without_symbols.strip()

# Store the normalised text of every comment in its own column.
cleaned_comments = df['comment'].apply(clean_text)
df['clean_comment'] = cleaned_comments

二、Pandas高级技巧

1. 数据透视分析

# Count comments per rating band, using the bin edges 0, 2, 4 and 5.
rating_bands = pd.cut(df['rating'], bins=[0,2,4,5])
rating_dist = df.pivot_table(values='comment', index=rating_bands, aggfunc='count')

2. 情感分析（简易版）

# 使用SnowNLP进行情感评分
from snownlp import SnowNLP

def _sentiment_score(comment):
    """Return the SnowNLP sentiment value for one cleaned comment."""
    return SnowNLP(comment).sentiments


df['sentiment'] = df['clean_comment'].apply(_sentiment_score)

三、可视化神器对比

1. Matplotlib基础图表

import matplotlib.pyplot as plt

# Pie chart of the rating distribution
plt.figure(figsize=(10,6))
# Aggregate first: plotting the raw column draws one wedge per review, sized
# by that review's score, instead of one wedge per distinct rating value.
rating_counts = df['rating'].value_counts().sort_index()
rating_counts.plot(kind='pie', autopct='%1.1f%%')
plt.title('商品评分分布')
plt.savefig('rating_pie.png')

2. Pyecharts动态图表

from pyecharts.charts import Bar
from pyecharts import options as opts

# Bar chart of comment volume per month.
# Labels and counts come from one sorted Series so they stay aligned;
# `unique()` keeps appearance order while `groupby().size()` sorts by month,
# which could pair a month with another month's count.
# Parsing here also lets the chart work when 'date' still holds strings.
months = pd.to_datetime(df['date'], errors='coerce').dt.month.dropna().astype(int)
monthly_counts = months.value_counts().sort_index()
bar = (
    Bar()
    # Labels are passed as strings (assumed pyecharts recommendation for category axes; confirm).
    .add_xaxis(monthly_counts.index.astype(str).tolist())
    .add_yaxis("评论量", monthly_counts.tolist())
    .set_global_opts(title_opts=opts.TitleOpts(title="月度评论趋势"))
)
bar.render("comment_trend.html")

3. WordCloud词云生成

from wordcloud import WordCloud

# Join all cleaned comments into one space-separated corpus.
text = ' '.join(df['clean_comment'].tolist())

# Configure the cloud, build it from the corpus, then write it to disk.
wc = WordCloud(font_path='simhei.ttf', background_color='white')
wc.generate(text)
wc.to_file('wordcloud.png')

四、综合案例：电商数据分析

1. 数据获取（爬虫部分）

# JD product comment API example (replace with real parameters)
def get_jd_comments(product_id, page):
    """Fetch one page of comments for a JD product.

    Relies on ``requests`` and a ``headers`` mapping being available at
    module level in the surrounding script.

    Args:
        product_id: JD product identifier, sent as ``productId``.
        page: Page number of the comment listing.

    Returns:
        The value stored under the ``'comments'`` key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
        KeyError: If the JSON payload has no ``'comments'`` key.
    """
    url = 'https://club.jd.com/comment/productPageComments.action'
    response = requests.get(
        url,
        # Let requests build and encode the query string.
        params={'productId': product_id, 'page': page},
        headers=headers,
        # Without a timeout a stalled connection blocks the crawler forever.
        timeout=10,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()['comments']

2. 多维分析报告生成

# Build the automatic analysis report: compute each figure, then fill the template.
report_date = datetime.today().date()
average_rating = df['rating'].mean()
praise_percent = (df['rating']>4).mean()*100
overall_tone = "积极" if df['sentiment'].mean()>0.5 else "谨慎"
frequent_words = ', '.join(top10_words)

report = f"""
## 商品分析报告（{report_date}）
- 平均评分：{average_rating:.1f} 分
- 好评率（>4分）：{praise_percent:.1f}%
- 情感倾向：{overall_tone}
- 高频词汇：{frequent_words}
"""

3. 自动生成可视化看板

# 使用Dash构建网页仪表盘
import dash
from dash import dcc, html

import base64
from pathlib import Path

app = dash.Dash()

# `bar` is a pyecharts chart, not a Plotly figure, so dcc.Graph cannot render
# it; embed the chart's standalone HTML in an iframe instead.
_bar_html = bar.render_embed()

# Dash does not serve a bare relative path such as 'wordcloud.png', so inline
# the image as a data URI. The file must already exist on disk.
_wordcloud_b64 = base64.b64encode(Path('wordcloud.png').read_bytes()).decode('ascii')

app.layout = html.Div([
    html.Iframe(
        srcDoc=_bar_html,
        style={'width': '100%', 'height': '520px', 'border': 'none'},
    ),
    html.Img(src=f'data:image/png;base64,{_wordcloud_b64}'),
])

if __name__ == '__main__':
    # Newer Dash releases expose `run`; fall back to `run_server` on older ones.
    _start_server = app.run if hasattr(app, 'run') else app.run_server
    _start_server(debug=True)

五、常见问题解决方案

1. 中文乱码问题

# Global matplotlib settings for Chinese text: select the SimHei font and
# turn off the 'axes.unicode_minus' option.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

2. 图表优化技巧

# Example: add a value label above each bar
ax = df['rating'].value_counts().plot(kind='bar')
for p in ax.patches:
    ax.annotate(
        # Pass the count as text rather than as a raw number.
        f'{int(p.get_height())}',
        # Anchor at the top-centre of the bar, whatever its width.
        (p.get_x() + p.get_width() / 2, p.get_height()),
        # Offset in points, not data units: a fixed +5 in data units puts
        # labels far above the bars when counts are small.
        xytext=(0, 3),
        textcoords='offset points',
        ha='center',
        va='bottom',
    )

3. 大数据处理技巧

# Process a large file chunk by chunk.
# The reader is used as a context manager so the file handle is released
# even if process() raises part-way through.
with pd.read_csv('big_data.csv', chunksize=10000) as chunk_iter:
    for chunk in chunk_iter:
        process(chunk)  # user-defined handler