一、数据清洗四部曲(案例:京东商品评论)
1. 原始数据示例
# Sample raw review records; each one exercises a different cleaning problem.
raw_data = [
    {
        "comment": "手机很好用! 快递很快 ",
        "rating": "5星",
        "date": "2023-02-30",  # impossible calendar date
    },
    {
        "comment": None,  # missing comment text
        "rating": "五星好评",  # rating written with a Chinese numeral
        "date": "2023-08-01",
    },
    {
        "comment": "屏幕有划痕",
        "rating": "1",
        "date": "2023-08-02",
    },
]
2. 结构化处理
import pandas as pd
from datetime import datetime
# Maps single Chinese numerals to ASCII digits so float() can parse them.
_CN_DIGITS = str.maketrans("零一二三四五六七八九两", "01234567892")


def clean_rating(rating):
    """Normalize a raw rating label to a float score.

    Removes the decorations "星" and "好评", then parses what is left.
    Both ASCII digits ("5星", "1", "4.5") and single Chinese numerals
    ("五星好评") are accepted.

    Args:
        rating: Raw rating label; non-string values are converted with str().

    Returns:
        The rating as a float.

    Raises:
        ValueError: If the remaining text is not a number.
    """
    text = str(rating).replace('星', '').replace('好评', '').strip()
    # Translate Chinese numerals first; float("五") would raise ValueError.
    return float(text.translate(_CN_DIGITS))
# Load the raw records and normalize the rating column in a single chain.
df = pd.DataFrame(raw_data).assign(
    rating=lambda frame: frame['rating'].apply(clean_rating)
)
3. 异常值过滤
# Parse the date strings into real datetimes. A length check alone would let
# impossible dates such as "2023-02-30" through; with errors='coerce' they
# become NaT instead. The datetime dtype is also what the `.dt` accessor needs.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='coerce')
# Drop rows whose date failed to parse or whose comment is missing.
df = df.dropna(subset=['date', 'comment'])
4. 文本清洗(正则实战)
import re
def clean_text(text):
    """Strip whitespace and special symbols from a comment string.

    Two regex passes are applied in order: the first deletes every run of
    whitespace, the second deletes every character that is neither a word
    character (``\\w``) nor in the range U+4E00-U+9FFF.

    Args:
        text: The comment string to clean.

    Returns:
        The cleaned string.
    """
    for pattern in (r'\s+', r'[^\w\u4e00-\u9fff]'):
        text = re.sub(pattern, '', text)
    return text.strip()
# Run every comment through clean_text and store the result in a new column.
cleaned_comments = df['comment'].apply(clean_text)
df['clean_comment'] = cleaned_comments
二、Pandas高级技巧
1. 数据透视分析
# Count comments per rating band; the bin edges [0, 2, 4, 5] give the
# intervals (0, 2], (2, 4] and (4, 5].
rating_dist = pd.pivot_table(
    df,
    values='comment',
    index=pd.cut(df['rating'], bins=[0, 2, 4, 5]),
    aggfunc='count',
)
2. 情感分析(简易版)
# 使用SnowNLP进行情感评分
from snownlp import SnowNLP
def _sentiment_score(comment):
    """Return the SnowNLP ``sentiments`` value for one cleaned comment."""
    return SnowNLP(comment).sentiments


# Score every cleaned comment and store the result in a new column.
df['sentiment'] = df['clean_comment'].apply(_sentiment_score)
三、可视化神器对比
1. Matplotlib基础图表
import matplotlib.pyplot as plt
# Pie chart of the rating distribution
plt.figure(figsize=(10, 6))
# Count how many comments carry each rating before plotting. Plotting the raw
# column would draw one wedge per comment, sized by its rating value, which
# is not a distribution.
rating_counts = df['rating'].value_counts().sort_index()
rating_counts.plot(kind='pie', autopct='%1.1f%%')
plt.title('商品评分分布')
plt.savefig('rating_pie.png')
2. Pyecharts动态图表
from pyecharts.charts import Bar
from pyecharts import options as opts
# Monthly comment-volume bar chart
# Ensure a datetime dtype before using `.dt`; this is a no-op when the column
# is already datetime, and unparseable values become NaT.
comment_months = pd.to_datetime(df['date'], errors='coerce').dt.month
# Take labels and counts from the same grouped result so they stay aligned.
# unique() returns months in order of appearance while groupby sorts them,
# so building the two axes separately can mismatch bars and labels.
monthly_counts = df.groupby(comment_months).size()
bar = (
    Bar()
    .add_xaxis([int(month) for month in monthly_counts.index])
    .add_yaxis("评论量", monthly_counts.tolist())
    .set_global_opts(title_opts=opts.TitleOpts(title="月度评论趋势"))
)
bar.render("comment_trend.html")
3. WordCloud词云生成
from wordcloud import WordCloud
# Join all cleaned comments into one space-separated corpus.
# NOTE(review): clean_text removes all whitespace, so each comment reaches
# WordCloud as a single unsegmented token. Chinese text presumably needs word
# segmentation first for a meaningful cloud - confirm.
text = ' '.join(df['clean_comment'])
wc = WordCloud(font_path='simhei.ttf', background_color='white')
wc.generate(text)
wc.to_file('wordcloud.png')
四、综合案例:电商数据分析
1. 数据获取(爬虫部分)
# JD product-comment API example (replace the parameters with real ones)
def get_jd_comments(product_id, page):
    """Fetch one page of comments for a JD product.

    Relies on ``requests`` and a module-level ``headers`` dict being available;
    neither is defined in the visible part of this file.

    Args:
        product_id: Product identifier sent as the ``productId`` query parameter.
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The value stored under the ``comments`` key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the JSON response has no ``comments`` key.
    """
    url = 'https://club.jd.com/comment/productPageComments.action'
    response = requests.get(
        url,
        # Let requests build and URL-encode the query string.
        params={'productId': product_id, 'page': page},
        headers=headers,
        # Without a timeout a stalled connection would block forever.
        timeout=10,
    )
    # Fail on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return response.json()['comments']
2. 多维分析报告生成
# Build the automatic analysis report.
# Compute the headline figures first so the template below stays readable.
average_rating = df['rating'].mean()
praise_percent = (df['rating'] > 4).mean() * 100
sentiment_label = "积极" if df['sentiment'].mean() > 0.5 else "谨慎"
frequent_words = ', '.join(top10_words)
report = f"""
## 商品分析报告({datetime.today().date()})
- 平均评分:{average_rating:.1f} 分
- 好评率(>4分):{praise_percent:.1f}%
- 情感倾向:{sentiment_label}
- 高频词汇:{frequent_words}
"""
3. 自动生成可视化看板
# 使用Dash构建网页仪表盘
import dash
from dash import dcc, html
import base64

# Dash does not serve arbitrary files from the working directory, so the
# word-cloud image is inlined as a base64 data URI.
with open('wordcloud.png', 'rb') as image_file:
    wordcloud_src = (
        'data:image/png;base64,'
        + base64.b64encode(image_file.read()).decode('ascii')
    )

app = dash.Dash(__name__)
app.layout = html.Div([
    # `bar` is a pyecharts chart, not a Plotly figure, so dcc.Graph cannot
    # display it. Its rendered HTML is embedded in an iframe instead.
    html.Iframe(
        srcDoc=bar.render_embed(),
        style={'width': '100%', 'height': '520px', 'border': 'none'},
    ),
    html.Img(src=wordcloud_src),
])

if __name__ == '__main__':
    # Newer Dash releases expose run(); fall back to the legacy run_server()
    # on versions that do not have it.
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True)
五、常见问题解决方案
1. 中文乱码问题
# Global Matplotlib settings for Chinese text: select the SimHei font and
# turn off the `axes.unicode_minus` option.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
2. 图表优化技巧
# Example: add data labels to a bar chart
ax = df['rating'].value_counts().plot(kind='bar')
for patch in ax.patches:
    count = patch.get_height()
    ax.annotate(
        f'{count:.0f}',
        # Anchor at the top centre of the bar.
        (patch.get_x() + patch.get_width() / 2, count),
        # Offset in points, not data units, so the label sits just above the
        # bar whatever the scale of the counts.
        xytext=(0, 3),
        textcoords='offset points',
        ha='center',
        va='bottom',
    )
3. 大数据处理技巧
# Process a large CSV in fixed-size chunks instead of loading it all at once.
CHUNK_ROWS = 10000
chunk_iter = pd.read_csv('big_data.csv', chunksize=CHUNK_ROWS)
for chunk in chunk_iter:
    # `process` is a user-supplied handler, not defined in the visible file.
    process(chunk)