小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分

比赛链接:https://www.kaggle.com/competitions/feedback-prize-effectiveness/overview
公开代码:https://www.kaggle.com/code/mustafakeser4/feedback-eda/notebook

导包加载数据

import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from wordcloud import WordCloud,ImageColorGenerator
from PIL import Image

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

train = pd.read_csv("../input/feedback-prize-effectiveness/train.csv")
test = pd.read_csv("../input/feedback-prize-effectiveness/test.csv")
sample_sub = pd.read_csv("../input/feedback-prize-effectiveness/sample_submission.csv")

print('train:')
display(train)
print('test:')
display(test)
print('sample_sub:')
display(sample_sub)

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

train.describe()

在这里插入图片描述
统计每类问题下,学生们的答题评分分布

def count_string(x):
    return len(x.split())

train['word_count'] = train.discourse_text.apply(count_string)
test['word_count'] = test.discourse_text.apply(count_string)


import seaborn as sns
plt.figure(figsize = (10,4))
sns.countplot(data = train,x = 'discourse_type',hue = 'discourse_effectiveness')

在这里插入图片描述


def pieplot(df,col ,title,explodeplace=0):
    """
    df: DataFrame
    col: column
    title: title for plot
    explodeplace: segment|slice location for exploding
    """
    labels = df[col].value_counts().index
    sizes = df[col].value_counts()
    # 初始化偏移程度
    # https://vimsky.com/examples/usage/matplotlib-axes-axes-pie-in-python.html
    explode = np.zeros(len(sizes))
    explode[explodeplace]=0.1
    fig1, ax1 = plt.subplots()
    ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
            shadow=True, startangle=90)
    ax1.axis('equal')
    ax1.set_title(title)

    plt.show()
    print(df[col].value_counts())
    

分别考察:训练集整体评分分布、训练集整体问题类型分布、测试集整体评分分布

pieplot(train,"discourse_effectiveness","train discourse_effectiveness",explodeplace=1)
pieplot(train,"discourse_type","train discourse_type",explodeplace=0)
pieplot(test,"discourse_type","test discourse_type",explodeplace=0)

在这里插入图片描述
在这里插入图片描述
统计每个评分下,“问题”的长度分布

plt.figure(figsize = (10,5))
sns.histplot(data = train,x = 'word_count',hue = 'discourse_effectiveness',kde = True)
plt.title('distribution of word counts (based df row)')

在这里插入图片描述

train_txt_list = os.listdir("../input/feedback-prize-effectiveness/train")
test_txt_list = os.listdir("../input/feedback-prize-effectiveness/test")
train_word_count = []

for txt in tqdm(train_txt_list):
    with open("../input/feedback-prize-effectiveness/train/"+txt,'r') as f:
        data = f.read().replace('\n','')
    train_word_count.append(len(data.split()))

test_word_count = []
for txt in tqdm(test_txt_list):
    with open("../input/feedback-prize-effectiveness/test/"+txt, "r") as f:
        data = f.read().replace('\n','')
    test_word_count.append(len(data.split()))

train_count_dict = dict(zip([f[:-4] for f in train_txt_list],train_word_count))
test_count_dict = dict(zip([f[:-4] for f in test_txt_list],test_word_count))

每篇文章的字数统计

essay_count_df = pd.DataFrame(data = train_count_dict.values(),index = train_count_dict.keys(),columns = ['word_count'])
essay_count_df

在这里插入图片描述
文章的长度分布

plt.figure(figsize = (10,5))
sns.histplot(data = essay_count_df,x = 'word_count',kde = True)
plt.title("distribution of word counts (based essays)");

在这里插入图片描述

essay_ids = train.essay_id.unique().tolist()
first_sentence_word_count = []
first_sentence_effectiveness = []

for ids in tqdm(essay_ids):
    A = train[train.essay_id == ids]
    first_sentence_word_count.append(A.iloc[0]['word_count'])
    first_sentence_effectiveness.append(A.iloc[0]["discourse_effectiveness"])

essay_count_df['first_discourse'] = first_sentence_word_count
essay_count_df["first_sentence_effectiveness"] = first_sentence_effectiveness
essay_count_df.head()

在这里插入图片描述
每个评分的“问题”长度分析:评分低的“问题”偏长

plt.figure(figsize = (10,5))
sns.scatterplot(data = essay_count_df,y = 'word_count',x="first_discourse",hue="first_sentence_effectiveness")
plt.xlabel("first discourse word count")
plt.ylabel("essay word count");

在这里插入图片描述

_,ax = plt.subplots(ncols = 2,nrows = 1,figsize = (16,5))
train.groupby("discourse_effectiveness").mean().T.plot(kind = 'bar',ax = ax[0])
ax[0].set_title("average word count for discourse type ");
train.groupby('discourse_type').mean().T.plot(kind="bar",ax=ax[1])
ax[1].set_title("average word count for discourse type ");

在这里插入图片描述

停用词

import nltk
from nltk.corpus import stopwords
print(stopwords.words('english'))

在这里插入图片描述

统计每个评分下的“问题”的高频gram

def generate_ngrams(text,stop_words,n_gram = 1):
    token = [token for token in text.lower().split(' ') if token != '' if token not in stop_words]
    ngrams = zip(*[token[i:] for i in range(n_gram)])
    return [' '.join(ngram) for ngram in ngrams]

#generate_ngrams('I love you for ever.',['a'],2)

grams = ["unigrams","bigrams","trigrams"]
type_effectiveness = ['All','Adequate', 'Ineffective', 'Effective']
stop_words = stopwords.words('english')
import warnings
warnings.filterwarnings('ignore')
for name in type_effectiveness:
    for i,ngram in enumerate(grams):
        if name == 'All':
            locals()[name] = train.discourse_text
        else:
            locals()[name] = train[train.discourse_effectiveness == name].discourse_text
        locals()[name]["mytext_new"] = locals()[name].str.lower().str.replace('[^\w\s]','')
        locals()[name+'_'+ngram] = pd.Series(generate_ngrams(' '.join(locals()[name].mytext_new.to_list()),stop_words, n_gram=i+1)).value_counts()

name = 'unigrams'
top = 25

for ngram in grams:
    if ngram != 'trigrams':
        sl = 0
        a = 0
    else:
        sl = 1
        a = 2.5
    fig,axs = plt.subplots(ncols = 4,nrows = 1,figsize=(16,int(top/4)),sharey = False)
    plt.subplots_adjust(hspace = 0.5,wspace = 1*a)
    sns.barplot(y=locals()[f"All_{ngram}"].index[sl:top],x = locals()[f"All_{ngram}"].values[sl:top],palette = 'magma_r',ax = axs[0])
    axs[0].set_xlabel('All_frequency')
    axs[0].set_ylabel(ngram)
    sns.barplot(y=locals()[f"Adequate_{ngram}"].index[sl:top], x = locals()[f"Adequate_{ngram}"].values[sl:top],palette='viridis_r',ax=axs[1])
    axs[1].set_xlabel("Adequate_frequency")
    axs[1].set_ylabel(ngram)
    sns.barplot(y=locals()[f"Ineffective_{ngram}"].index[sl:top], x = locals()[f"Ineffective_{ngram}"].values[sl:top],palette='magma_r',ax=axs[2])
    axs[2].set_xlabel("Ineffective_frequency")
    axs[2].set_ylabel(ngram)
    sns.barplot(y=locals()[f"Effective_{ngram}"].index[sl:top], x = locals()[f"Effective_{ngram}"].values[sl:top],palette='viridis_r',ax=axs[3])
    axs[3].set_xlabel("Effective_frequency")
    axs[3].set_ylabel(ngram)
    fig.suptitle(f"TOP {top} {ngram}")
    plt.show()

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

词云图

def word_cloud(is_df = False,df = None,title_plot_df = ' ',is_essay = True,essay_id = '007ACE74B050',discourse_id = '0013cc385424'):
    """
    is df: True if text from df. True cancels essay id and discource id options 
    df: text column in df
    is_assey: True: whole essay text word cloud | False: discourse text word cloud
    """
    path = "../input/feedback-prize-effectiveness/train/"
    if is_df:
        data = ' '.join(df.to_list())

    else:
        if is_essay:
            with open(path+essay_id+'.txt','r') as f:
                data = f.read().replace('\n','')
                
        else:
            data = train[train['discourse_id'] == discourse_id]['discourse_text'][0]
    #print(data)
    wordcloud = WordCloud(relative_scaling=0.1,colormap="cividis",prefer_horizontal=0.5,background_color="white",mode= "RGB").generate(data)
    plt.figure(figsize = (12,8),dpi=90)
    plt.imshow(wordcloud, interpolation="antialiased",aspect=0.5)
    plt.axis('off')
    plt.margins(x=0, y=0)
    if is_df:
        plt.title(f" {title_plot_df} ")
    else:
        if is_essay:
             plt.title(f"essay id: {essay_id} word cloud")
        else:
            plt.title(f"discourse_id: {discourse_id} word cloud")
        plt.show()

word_cloud(is_df=True, df=train[train.discourse_effectiveness=='Adequate'].discourse_text,
           title_plot_df="Adequate word cloud" 
                ,is_essay=True,essay_id="007ACE74B050"
                ,discourse_id="0013cc385424")

在这里插入图片描述

word_cloud(is_df=True, df=train[train.discourse_effectiveness=='Ineffective'].discourse_text,
           title_plot_df="Ineffective word cloud" 
                ,is_essay=True,essay_id="007ACE74B050"
                ,discourse_id="0013cc385424")

在这里插入图片描述

word_cloud(is_df=True, df=train[train.discourse_effectiveness== 'Effective'].discourse_text,
           title_plot_df="Effective word cloud" 
                ,is_essay=True,essay_id="007ACE74B050"
                ,discourse_id="0013cc385424")

在这里插入图片描述

word_cloud(is_essay=True,essay_id="007ACE74B050",discourse_id="0013cc385424")

在这里插入图片描述

word_cloud(is_essay=False,essay_id="007ACE74B050",discourse_id="0013cc385424")

在这里插入图片描述

训练

分别用每个评分的“问题”作为词向量训练

import tensorflow_hub as hub
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss,accuracy_score,classification_report,confusion_matrix
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

vec = TfidfVectorizer(ngram_range=(1,2),stop_words=stop_words)
vec.fit(train.discourse_text.to_list()+test.discourse_text.to_list())
le = LabelEncoder()
preds = []
models = {}
vectorizers = {}
for name in ['All','Adequate', 'Effective', 'Ineffective']:
    vec = TfidfVectorizer(ngram_range=(1,2),stop_words=stop_words)
    X = train.discourse_text.to_list()
    y = le.fit_transform(train.discourse_effectiveness.astype(str))
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=0)
    if name=='All':
        vec=TfidfVectorizer(ngram_range=(1,2),stop_words=stop_words)
        vec.fit(train.discourse_text.to_list())
    else:
        vec = TfidfVectorizer(ngram_range=(1,2),stop_words=stop_words)
        vec.fit(train.discourse_text[train.discourse_effectiveness==name].to_list())
    X_train = vec.transform(X_train)
    X_test = vec.transform(X_test)
    testvec = vec.transform(test.discourse_text.to_list())
    lgsc = LogisticRegression(max_iter=600,penalty='l2',C=1.015)
    lgsc.fit(X_train,y_train)
    pred=lgsc.predict_proba(X_test)
    print(f"\n{name} logloss train: {log_loss(y_train,lgsc.predict_proba(X_train))}, logloss val: {log_loss(y_test,pred)}")
    predtest = lgsc.predict_proba(testvec)
    preds.append(predtest)
    models[name] = lgsc
    vectorizers[name] = vec
    

在这里插入图片描述

print(classification_report(y_train,np.argmax(lgsc.predict_proba(X_train),axis=1)))

在这里插入图片描述

sns.heatmap(confusion_matrix(y_train,np.argmax(lgsc.predict_proba(X_train),axis=1)),annot=True,fmt='.0f',xticklabels=le.classes_,yticklabels=le.classes_)
plt.xlabel("predicted")
plt.ylabel("actual")
plt.title('train confusion matrix')

在这里插入图片描述

提交

predtest = np.mean(preds,axis=0)

sample_sub.iloc[:,1:]=predtest

sample_sub.to_csv("submission.csv",index=False)
sample_sub

在这里插入图片描述

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值