1. Load the data
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt  # needed for the plots below

# Load the training data (JSON) and the article metadata (CSV)
with open('train.json', 'r', encoding='utf-8') as file:
    train_json = json.load(file)
train_df = pd.DataFrame(train_json)
articles_data = pd.read_csv('articles.csv')
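If train.json turns out to be line-delimited JSON (one record per line, common in competition dumps), json.load will fail; a minimal fallback sketch under that assumption:

# Fallback sketch: assumes train.json might be JSON Lines rather than one array
try:
    with open('train.json', 'r', encoding='utf-8') as f:
        train_json = json.load(f)
    train_df = pd.DataFrame(train_json)
except ValueError:  # json.JSONDecodeError subclasses ValueError
    train_df = pd.read_json('train.json', lines=True)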
2. Basic information
# Inspect the basics of the article data
print("Article data shape:", articles_data.shape)

# First few rows
print("\nFirst 5 rows of article data:")
print(articles_data.head())

# Column dtypes and non-null counts
print("\nArticle data info:")
articles_data.info()  # info() prints directly and returns None

# Descriptive statistics
print("\nArticle data descriptive statistics:")
print(articles_data.describe(include='all'))
# Drop duplicate rows
train_df.drop_duplicates(inplace=True)
articles_data.drop_duplicates(inplace=True)

# Plot the class distribution
train_df['label'].value_counts().plot(kind='bar')
plt.title('Class count')
plt.xlabel("category")
plt.show()
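To quantify the imbalance instead of reading it off the bar chart, value_counts can return proportions directly:

# Share of each class in the training data
print(train_df['label'].value_counts(normalize=True))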
3. Data transformation
# Tag each row with a task category based on the instruction text
train_df['task_category'] = train_df['instruction'].apply(
    lambda x: 'content' if '电商客服专家' in x else 'image')

# Keep only the rows we need
content_df = train_df[train_df['task_category'] == 'content'].reset_index(drop=True)

# exp_df is assumed to come from elsewhere; replace with your actual data
exp_df = pd.DataFrame()

# Keep only correctly labelled rows (if_false contains 'ok' when the label is right)
exp_df['if_use'] = exp_df['if_false'].apply(lambda x: 1 if 'ok' in x else 0)
exp_df = exp_df[exp_df['if_use'] == 1].reset_index(drop=True)

# Inner-join the two tables on id
exp_df = pd.merge(exp_df, content_df, on='id', how='inner')

# Type conversion (example): cast the ID to string
exp_df['id'] = exp_df['id'].astype(str)

# Feature engineering (example): text length
exp_df['text_length'] = exp_df['instruction'].apply(len)
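In the same spirit as text_length, a couple more lightweight features could be derived from the instruction text; word_count and digit_ratio here are illustrative names, not part of the original pipeline:

# Illustrative extra features (hypothetical, same pattern as text_length)
exp_df['word_count'] = exp_df['instruction'].apply(lambda x: len(x.split()))
exp_df['digit_ratio'] = exp_df['instruction'].apply(
    lambda x: sum(c.isdigit() for c in x) / max(len(x), 1))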
4. Handling missing values
train_df.dtypes

# Check for missing values
print("Missing values in training data:")
print(train_df.isnull().sum())
print("\nMissing values in article data:")
print(articles_data.isnull().sum())
# Fill missing values
# Numeric columns: fill with the median (the mean is another option)
numeric_cols = train_df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if train_df[col].isnull().sum() > 0:
        train_df[col] = train_df[col].fillna(train_df[col].median())

# Categorical columns: fill with the mode or a sentinel value
categorical_cols = train_df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if train_df[col].isnull().sum() > 0:
        train_df[col] = train_df[col].fillna('Unknown')

# Text columns: fill with the empty string
text_cols = ['text']  # assumed text columns
for col in text_cols:
    if col in train_df.columns and train_df[col].isnull().sum() > 0:
        train_df[col] = train_df[col].fillna('')
# Drop rows with too many missing values:
# keep only rows where at least 70% of the columns are non-null
train_df.dropna(thresh=int(len(train_df.columns) * 0.7), inplace=True)

# Check missing values again
print("\nMissing values in training data after processing:")
print(train_df.isnull().sum())
print("\nMissing values in article data after processing:")
print(articles_data.isnull().sum())
# A simpler pass
# Check the missing-value situation
print("Missing value counts:\n", exp_df.isnull().sum())

# Optionally drop columns that are mostly empty
threshold = 0.5  # drop a column if more than 50% of it is missing
exp_df = exp_df.dropna(thresh=int(threshold * len(exp_df)), axis=1)

# Fill remaining missing values
exp_df['some_column'] = exp_df['some_column'].fillna('未知')  # categorical: sentinel value
exp_df['numeric_column'] = exp_df['numeric_column'].fillna(exp_df['numeric_column'].median())  # numeric: median

# Or simply drop every row that still has a missing value
exp_df = exp_df.dropna()
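Before choosing between filling and dropping, it helps to look at the missing ratio per column; a quick sketch against the same exp_df:

# Fraction of missing values per column, highest first
missing_ratio = exp_df.isnull().mean().sort_values(ascending=False)
print(missing_ratio[missing_ratio > 0])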
5. Text processing
# Sentence length measured as the number of whitespace-separated tokens
train_df['text_len'] = train_df['text'].apply(lambda x: len(x.split(' ')))
print(train_df['text_len'].describe())

# Histogram of sentence lengths
_ = plt.hist(train_df['text_len'], bins=200)
plt.xlabel('Text token count')
plt.title('Histogram of token count')
plt.show()

# Minimal plt.hist example for reference:
# data = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]
# plt.hist(data, bins=5)  # bins controls how many intervals the data is split into
# Count how often each character appears:
# concatenate all training sentences, split into characters, and count them
from collections import Counter
all_lines = ' '.join(list(train_df['text']))
word_count = Counter(all_lines.split(" "))
word_count = sorted(word_count.items(), key=lambda d: d[1], reverse=True)
print(len(word_count))    # vocabulary size
print(word_count[0])      # most frequent character
print(word_count[-1])     # least frequent character
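With the counts sorted, a natural follow-up is to check how many distinct characters cover most of the corpus, which helps when choosing a vocabulary size; a sketch using the word_count list built above:

# How many distinct characters account for 90% of all occurrences
total = sum(cnt for _, cnt in word_count)
cum, n_chars = 0, 0
for _, cnt in word_count:
    cum += cnt
    n_chars += 1
    if cum / total >= 0.9:
        break
print(f"{n_chars} characters cover 90% of all occurrences")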
Removing the high-frequency characters
# Find the most frequent characters in each news category:
# take the top 20 per class and remove characters that are also in the
# global top 20, leaving the characters most distinctive of each class.
from collections import Counter

def get_unique_top_chars(df, topn=20, global_topn=20):
    all_text = ' '.join(list(df['text']))
    global_count = Counter(all_text.split(" "))

    # Globally most frequent characters
    global_topchars = {item[0] for item in global_count.most_common(global_topn)}
    print(global_topchars)

    # Most frequent characters per class
    unique_top_chars = {}
    for label in df['label'].unique():
        # Join all texts belonging to this class
        class_texts = ' '.join(list(df[df['label'] == label]['text']))
        # Count character frequencies for this class
        class_counter = Counter(class_texts.split(" "))
        # Drop globally frequent characters from the class top-n
        filtered_chars = [
            item for item in class_counter.most_common(topn)
            if item[0] not in global_topchars
        ]
        unique_top_chars[label] = filtered_chars
    return unique_top_chars
unique_top_chars = get_unique_top_chars(train_df)

# Print the results
for label, chars in unique_top_chars.items():
    print(f"Distinctive high-frequency characters for class {label} (top 20):")
    for char, count in chars:
        print(f"  '{char}': {count} occurrences")
    print()
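For easier inspection or export, the nested dict can be flattened into a DataFrame; a small sketch:

# Flatten {label: [(char, count), ...]} into a tidy table
rows = [(label, ch, cnt)
        for label, chars in unique_top_chars.items()
        for ch, cnt in chars]
unique_df = pd.DataFrame(rows, columns=['label', 'char', 'count'])
print(unique_df.head())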
Simple cleaning
# Simple text preprocessing (cleaning)
import re

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # strip punctuation
    text = text.lower()                  # lowercase
    return text

exp_df['clean_text'] = exp_df['text_column'].apply(clean_text)
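A quick sanity check of the cleaner on a made-up string:

print(clean_text('Hello, World! 123'))  # -> 'hello world 123'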
6. Encoding class labels
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Encode the categorical label (handy when labels are not already numeric)
label_encoder = LabelEncoder()
exp_df['label_encoded'] = label_encoder.fit_transform(exp_df['label'])

# Standardize numeric features, if any
scaler = StandardScaler()
if 'numeric_column' in exp_df.columns:
    exp_df['numeric_column_scaled'] = scaler.fit_transform(exp_df[['numeric_column']])
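LabelEncoder is reversible, which is useful for mapping model predictions back to the original label names:

# Decode the first few encoded labels back to their original values
print(label_encoder.inverse_transform(exp_df['label_encoded'][:5]))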
7. Feature engineering: variable correlations
import seaborn as sns

# Numeric columns of the click-log data (df is assumed to hold the click logs)
numeric_cols = ['click_article_id', 'click_timestamp', 'click_environment',
                'click_deviceGroup', 'click_os', 'click_country',
                'click_region', 'click_referrer_type']

# Correlation matrix
corr_matrix = df[numeric_cols].corr()

# Heatmap of pairwise correlations
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Numerical Feature Correlation Heatmap')
plt.show()
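Reading pairwise values off a heatmap gets tedious as the matrix grows; a sketch that lists the most correlated pairs directly:

# Keep only the upper triangle, then rank pairs by absolute correlation
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
corr_pairs = corr_matrix.where(mask).stack().sort_values(key=abs, ascending=False)
print(corr_pairs.head(5))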
Relationship between two variables
# Convert the timestamp to a datetime
df['click_time'] = pd.to_datetime(df['click_timestamp'], unit='ms')

# Aggregate clicks by hour of day
df['click_hour'] = df['click_time'].dt.hour
click_by_hour = df.groupby('click_hour').size()

# Line plot of clicks per hour
plt.figure(figsize=(10, 5))
click_by_hour.plot(kind='line', marker='o')
plt.title('Clicks by Hour of Day')
plt.xlabel('Hour')
plt.ylabel('Number of Clicks')
plt.grid(True)
plt.xticks(range(0, 24))
plt.show()
# Clicks per device group
click_device = df.groupby('click_deviceGroup').size()
plt.figure(figsize=(8, 4))
click_device.plot(kind='bar', color='skyblue')
plt.title('Clicks by Device Group')
plt.xlabel('Device Group')
plt.ylabel('Number of Clicks')
plt.show()
# Clicks per country (top 10)
top_country_clicks = df['click_country'].value_counts().head(10)
plt.figure(figsize=(8, 4))
top_country_clicks.plot(kind='bar', color='orange')
plt.title('Top 10 Countries by Clicks')
plt.xlabel('Country ID')
plt.ylabel('Clicks')
plt.show()
Plotting
# Waveform of the first heartbeat signal for each label (0-3) in the training data
plt.figure(figsize=(12, 4))
for i in range(4):
    signal = train_data[train_data['label'] == i]['heartbeat_signals'].values[0]
    signal = np.array(signal.split(','), dtype=np.float32)
    plt.subplot(2, 2, i + 1)
    plt.plot(signal)
    plt.title(f'Label {i}')
    plt.xlabel('Time')
    plt.ylabel('Amplitude')
plt.tight_layout()
plt.show()
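To compare classes beyond a single example each, the mean waveform per label can be overlaid; a sketch assuming all signals in train_data have the same length:

# Mean waveform per label (assumes equal-length, comma-separated signals)
plt.figure(figsize=(12, 4))
for i in range(4):
    sigs = train_data[train_data['label'] == i]['heartbeat_signals']
    arr = np.stack([np.array(s.split(','), dtype=np.float32) for s in sigs])
    plt.plot(arr.mean(axis=0), label=f'Label {i}')
plt.legend()
plt.title('Mean heartbeat signal per label')
plt.show()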