一、读取数据
使用 pandas 读取文件:
data_set = pd.read_csv("data/mbti_1.csv") # load the CSV file into a DataFrame
二、显示文件信息
def showTableInfo(data_set):
    """Print a profiling report for the data set.

    The report contains, in order: the ``DataFrame.info()`` summary, the
    row and column counts, the number of null values per column, the
    distinct values of the ``type`` column, the number of users (rows) and
    of individual posts, and the first five rows.

    Args:
        data_set: DataFrame with a ``type`` column. Its first two columns
            are handed to ``extract`` as (label, "|||"-separated posts).
    """
    print("DATA PROFILING")
    print()
    print("a. Desribing Data Set:")
    data_set.info()  # prints the table summary itself
    print()

    # Row and column counts.
    n_rows, n_cols = data_set.shape
    print("b. We have {r} Rows and {c} Columns".format(r=n_rows, c=n_cols))
    print()

    # Null values per column. Summing the boolean mask counts the nulls;
    # indexing the frame with the mask and calling count() would always
    # report 0 because count() skips exactly the NaN cells being looked for.
    print("c. Null Values are :")
    print(data_set.isnull().sum())
    print()

    # Number of distinct personality types, followed by the sorted labels.
    print("d. There are {t} Unique MBTI types in this study".format(t=data_set['type'].nunique()))
    print(np.unique(np.array(data_set['type'])))
    print()

    # Number of users (rows) and of individual posts.
    print("e. No. of Total users & Posts =>")
    posts = []
    # Plain tuples keep the positional access inside extract() independent
    # of how pandas resolves integer keys on a label-indexed Series.
    for row in data_set.itertuples(index=False, name=None):
        extract(row, posts)
    print("Number of users", len(data_set))
    print("Number of posts", len(posts))
    print()

    # First five rows of the file.
    print("f. Data Sneak Peek: First 5 rows")
    print(data_set.head(5))
三、统计各类型数目
def countTypeNumber(data_set):
    """Show a bar chart of the rows per personality type and print the counts.

    Args:
        data_set: DataFrame with a ``type`` column.
    """
    # Rows per personality type, most frequent first (1-D data for the plot).
    p_post = data_set['type'].value_counts()

    plt.figure(figsize=(15, 4))  # figure size
    # Pass the vectors by keyword: seaborn 0.12+ no longer accepts the
    # data vectors as positional arguments.
    sns.barplot(x=p_post.index, y=p_post.values)  # x: type, y: count
    plt.xlabel('MBTI Personality', size=12)  # x-axis label
    plt.ylabel('Posts available', size=12)  # y-axis label
    plt.title('Posts with regards to each personality type')  # chart title
    plt.show()

    print()
    print("The number of every type is :")
    # One line per personality type with its count.
    for mbti_type, count in zip(p_post.index, p_post.values):
        print(mbti_type, ": ", count)
四、数据集处理
# Split the data set into a training set and a test set at a 7:3 ratio
X_train, X_test, y_train, y_test = train_test_split(data_set['posts'], data_set['type'],
test_size=0.3,
random_state=123)
tfidf = TfidfVectorizer(stop_words='english') # TF-IDF vectorizer that drops English stop words
X_train = tfidf.fit_transform(X_train) # fit the vectorizer on the training set and encode it
X_test = tfidf.transform(X_test) # encode the test set with the already-fitted vectorizer
五、模型建立与预测
model1 = LogisticRegression() # logistic regression model
model1.fit(X_train, y_train) # train the logistic regression model
y_pred1 = model1.predict(X_test) # predict the test set with the trained model
六、指标评价
def showMetrics(y_true, y_pred, model_name):
    """Plot the confusion matrix and print accuracy, precision and recall.

    Precision and recall are macro-averaged.

    Args:
        y_true: True personality-type labels.
        y_pred: Predicted personality-type labels.
        model_name: String appended to the plot title.
    """
    classes = ['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
               'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP']

    # Fix the row/column order to `classes`. Without `labels`, the matrix
    # only covers the labels that occur in y_true/y_pred, so a class that
    # is missing from the split would shift or break the tick labels below.
    conf_matrix = confusion_matrix(y_true, y_pred, labels=classes)
    acc = accuracy_score(y_true, y_pred)  # accuracy
    prec = precision_score(y_true, y_pred, average='macro')  # precision
    recall = recall_score(y_true, y_pred, average='macro')  # recall

    # Visualise the confusion matrix.
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=classes)
    disp.plot(
        include_values=True,  # show the count in every cell
        cmap="viridis",  # sklearn's default colour map
        ax=None,  # sklearn's default: create a new figure and axes
        xticks_rotation="horizontal",  # sklearn's default
        values_format="d",  # format the cell values as integers
    )
    plt.title('Confusion Matrix of ' + model_name)  # plot title
    plt.show()

    print("Accuracy :", acc)
    print("Precision :", prec)
    print("Recall :", recall)
七、完整代码
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
def extract(posts, new_posts):
    """Split one row's posts and append them to *new_posts*.

    Args:
        posts: Row-like sequence (tuple, list or pandas Series) whose first
            item is the row's label and whose second item is one string
            holding the individual posts separated by "|||". Any further
            items are ignored.
        new_posts: List that receives one ``(label, post)`` tuple per post.
            It is modified in place.
    """
    # Unpack by iteration instead of posts[0] / posts[1]: integer keys on a
    # label-indexed pandas Series are deprecated as positional lookups,
    # whereas iteration yields the values in order for every row type.
    label, joined_posts, *_ = posts
    new_posts.extend((label, post) for post in joined_posts.split("|||"))
def showTableInfo(data_set):
    """Print a profiling report for the data set.

    The report contains, in order: the ``DataFrame.info()`` summary, the
    row and column counts, the number of null values per column, the
    distinct values of the ``type`` column, the number of users (rows) and
    of individual posts, and the first five rows.

    Args:
        data_set: DataFrame with a ``type`` column. Its first two columns
            are handed to ``extract`` as (label, "|||"-separated posts).
    """
    print("DATA PROFILING")
    print()
    print("a. Desribing Data Set:")
    data_set.info()  # prints the table summary itself
    print()

    # Row and column counts.
    n_rows, n_cols = data_set.shape
    print("b. We have {r} Rows and {c} Columns".format(r=n_rows, c=n_cols))
    print()

    # Null values per column. Summing the boolean mask counts the nulls;
    # indexing the frame with the mask and calling count() would always
    # report 0 because count() skips exactly the NaN cells being looked for.
    print("c. Null Values are :")
    print(data_set.isnull().sum())
    print()

    # Number of distinct personality types, followed by the sorted labels.
    print("d. There are {t} Unique MBTI types in this study".format(t=data_set['type'].nunique()))
    print(np.unique(np.array(data_set['type'])))
    print()

    # Number of users (rows) and of individual posts.
    print("e. No. of Total users & Posts =>")
    posts = []
    # Plain tuples keep the positional access inside extract() independent
    # of how pandas resolves integer keys on a label-indexed Series.
    for row in data_set.itertuples(index=False, name=None):
        extract(row, posts)
    print("Number of users", len(data_set))
    print("Number of posts", len(posts))
    print()

    # First five rows of the file.
    print("f. Data Sneak Peek: First 5 rows")
    print(data_set.head(5))
def countTypeNumber(data_set):
    """Show a bar chart of the rows per personality type and print the counts.

    Args:
        data_set: DataFrame with a ``type`` column.
    """
    # Rows per personality type, most frequent first (1-D data for the plot).
    p_post = data_set['type'].value_counts()

    plt.figure(figsize=(15, 4))  # figure size
    # Pass the vectors by keyword: seaborn 0.12+ no longer accepts the
    # data vectors as positional arguments.
    sns.barplot(x=p_post.index, y=p_post.values)  # x: type, y: count
    plt.xlabel('MBTI Personality', size=12)  # x-axis label
    plt.ylabel('Posts available', size=12)  # y-axis label
    plt.title('Posts with regards to each personality type')  # chart title
    plt.show()

    print()
    print("The number of every type is :")
    # One line per personality type with its count.
    for mbti_type, count in zip(p_post.index, p_post.values):
        print(mbti_type, ": ", count)
def showMetrics(y_true, y_pred, model_name):
    """Plot the confusion matrix and print accuracy, precision and recall.

    Precision and recall are macro-averaged.

    Args:
        y_true: True personality-type labels.
        y_pred: Predicted personality-type labels.
        model_name: String appended to the plot title.
    """
    classes = ['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
               'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP']

    # Fix the row/column order to `classes`. Without `labels`, the matrix
    # only covers the labels that occur in y_true/y_pred, so a class that
    # is missing from the split would shift or break the tick labels below.
    conf_matrix = confusion_matrix(y_true, y_pred, labels=classes)
    acc = accuracy_score(y_true, y_pred)  # accuracy
    prec = precision_score(y_true, y_pred, average='macro')  # precision
    recall = recall_score(y_true, y_pred, average='macro')  # recall

    # Visualise the confusion matrix.
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=classes)
    disp.plot(
        include_values=True,  # show the count in every cell
        cmap="viridis",  # sklearn's default colour map
        ax=None,  # sklearn's default: create a new figure and axes
        xticks_rotation="horizontal",  # sklearn's default
        values_format="d",  # format the cell values as integers
    )
    plt.title('Confusion Matrix of ' + model_name)  # plot title
    plt.show()

    print("Accuracy :", acc)
    print("Precision :", prec)
    print("Recall :", recall)
if __name__ == '__main__':
    # Script entry point: profile the data, then train and evaluate two
    # linear classifiers on TF-IDF features of the posts.
    warnings.filterwarnings("ignore")  # keep library warnings out of the report

    # One seed for every randomised step so repeated runs give the same
    # split and the same SGD result.
    RANDOM_STATE = 123

    data_set = pd.read_csv("data/mbti_1.csv")  # load the data file
    showTableInfo(data_set)  # print the data profile
    countTypeNumber(data_set)  # count the rows per personality type

    # Split into training and test sets at a 7:3 ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        data_set['posts'],
        data_set['type'],
        test_size=0.3,
        random_state=RANDOM_STATE,
    )

    # The vectorizer is fitted on the training set only and then reused,
    # unchanged, to encode the test set.
    tfidf = TfidfVectorizer(stop_words='english')
    X_train = tfidf.fit_transform(X_train)
    X_test = tfidf.transform(X_test)

    # Train, predict and report each model in turn.
    models = (
        ("LogisticRegression", LogisticRegression()),
        ("SGDClassifier", SGDClassifier(random_state=RANDOM_STATE)),
    )
    for model_name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print()
        print("The metrics of " + model_name + ":")
        showMetrics(y_test, y_pred, model_name=model_name)