Kaggle Competition, Getting Started Part 2: Natural Language Processing with Disaster Tweets

Competition page: https://www.kaggle.com/c/nlp-getting-started/submit

"""
__author__:shuangrui Guo
__description__:
"""
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.stem import SnowballStemmer
import seaborn as sns
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

# Load the datasets
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
# keyword has 61 missing values and location has 2533 missing values
#print(train.isnull().sum())
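The script only ever uses the text column, so these gaps are harmless here. If keyword and location were to be used as features, a minimal sketch (my addition, with hypothetical fill values) would be:

# Hypothetical handling, only needed if keyword/location become features
train['keyword'] = train['keyword'].fillna('')
train['location'] = train['location'].fillna('unknown')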

# Visualization: show the count above each bar
# plt.figure()
# ax = train['target'].value_counts().plot.bar()
# for p in ax.patches:
#     ax.annotate(np.round(p.get_height(),decimals=2),
#                 ((p.get_x()+p.get_width()/2.0),p.get_height()),
#                 ha='center',
#                 va='center',
#                 xytext=(0,5),
#                 textcoords='offset points')
# plt.title('True vs False Disaster Tweets')
# plt.xlabel('True vs False')
# plt.xticks(rotation=0)
# plt.show()
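For reference, the two classes in this competition are fairly balanced (roughly 4,300 non-disaster vs 3,300 disaster tweets), which is why the stratify argument in the split below is a cheap safeguard rather than a necessity.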

# Clean the text
# Use SnowballStemmer to reduce each word in a sentence to its stem
# nltk.download('stopwords')  # uncomment on the first run if the corpus is missing
stemmer = SnowballStemmer('english')
stopwords_list = stopwords.words('english')

def clean_content(string: str):
    cleaned = []
    # Lowercase before the stopword check: the NLTK stopword list is
    # lowercase, so a capitalized "The" would otherwise slip through.
    temp = re.sub("[^a-zA-Z]", " ", string).lower().split()
    for word in temp:
        if word not in stopwords_list:
            cleaned.append(stemmer.stem(word))
    return " ".join(cleaned)
train['cleaned']=train['text'].apply(clean_content)
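A quick sanity check shows what clean_content does to a sample sentence (an illustrative input of my own; the output comment is approximate):

print(clean_content("Fires are burning near the HOUSES"))
# roughly "fire burn near hous": lowercased, stopwords removed, words stemmed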

# Step 2: strip leftover URL fragments, punctuation and digits
def review_cleaning(text):
    # Remove URLs first: substituting a bare r'http' before this point
    # would eat the prefix and stop the https?\S+ pattern from matching.
    text = re.sub(r'https?\S+', ' ', text)
    # Match "co" only as a standalone token (left over from truncated
    # t.co links); a bare r'co' would also mangle words like "record".
    text = re.sub(r'\bco\b', ' ', text)
    # Remove punctuation (including smart quotes) and digits
    text = re.sub(r'[!”#$%&’()*+,\-./:;<=>?[\]^_`{|}~]', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    # Collapse repeated whitespace last, after all substitutions
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

train['cleaned'] = train['cleaned'].apply(review_cleaning)
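The ordering inside review_cleaning matters; a made-up tweet with a link shows why URLs must be removed before punctuation:

print(review_cleaning("Fire update http://t.co/abc123 now"))
# -> "Fire update now". Stripping punctuation first would split the URL
# into plain tokens that the https?\S+ pattern could no longer match.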

# Drop rows whose cleaned text has at most one word; boolean indexing
# keeps the rows aligned with their labels.
train = train[train['cleaned'].str.split().str.len() > 1].reset_index(drop=True)

# Create the training and validation sets (the split itself happens after TF-IDF below)


# Build the TF-IDF features
tfidf = TfidfVectorizer(analyzer='word',
                        max_features=10000,
                        ngram_range=(1,3),
                        stop_words='english')
X = tfidf.fit_transform(train['cleaned'])
X_train, X_test, y_train, y_test = train_test_split(
    X, train['target'].tolist(), test_size=0.2,
    stratify=train['target'].tolist())
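Note: the vectorizer above is fitted on all of train['cleaned'] before the split, so IDF statistics from the would-be validation rows leak into the training features. The validation F1 below is therefore slightly optimistic; a leak-free variant that moves the vectorizer inside the Pipeline is sketched after the shortcomings list at the end.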

pipeline = Pipeline(
    # SelectKBest with the chi-squared score keeps the 6500 features that
    # correlate most strongly with the label.
    [('select_k_best', SelectKBest(chi2, k=6500)),
     # degree applies only to the poly kernel, so it is omitted for rbf;
     # cache_size is in MB and 200 is the library default (5 MB is far too small).
     ('classifier', SVC(kernel='rbf', random_state=0, verbose=True, gamma=1,
                        C=1, shrinking=True, probability=False, cache_size=200))]
)

model = pipeline.fit(X_train,y_train)
y_pred = model.predict(X_test)
print(f1_score(y_test,y_pred))

# Predict on the real test set and save the submission
test['cleaned'] = test['text'].apply(clean_content)
test['cleaned'] = test['cleaned'].apply(review_cleaning)

testing = tfidf.transform(test['cleaned'])
test_pred = model.predict(testing)
test['target'] = test_pred
columns = ['id','target']
submission = test[columns]
submission.to_csv('./submission.csv',index=False)
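A quick shape check before uploading catches misaligned submissions (the expected row count is the size of Kaggle's test.csv):

print(submission.shape)   # expected (3263, 2) for this competition
print(submission.head())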

Current shortcomings:

The text-cleaning step still feels somewhat ad hoc.

It is unclear why TF-IDF is fitted on the full training data before splitting into training and validation sets (this leaks IDF statistics from the validation rows; see the sketch after this list).

The role of SelectKBest is unclear (see the sketch after this list).

Pipeline usage is unfamiliar (see the sketch after this list).
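To address the last three points together, here is a minimal leak-free sketch (my rewrite under the same hyperparameters, not the original submission code). Putting the vectorizer inside the Pipeline means every step is fitted on the training fold only and then merely applied to the validation fold, and SelectKBest(chi2, k=6500) scores each TF-IDF column against the label with a chi-squared test and keeps the 6500 highest-scoring columns.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Split the raw cleaned text first; the pipeline is fitted on X_tr only
X_tr, X_val, y_tr, y_val = train_test_split(
    train['cleaned'], train['target'], test_size=0.2,
    stratify=train['target'], random_state=42)

pipe = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 3),
                              max_features=10000, stop_words='english')),
    ('select', SelectKBest(chi2, k=6500)),   # keep the 6500 best columns
    ('svc', SVC(kernel='rbf', C=1, gamma=1)),
])
pipe.fit(X_tr, y_tr)     # runs fit/transform on each step in order
print(f1_score(y_val, pipe.predict(X_val)))

# Kaggle's test set then needs a single call; the TF-IDF vocabulary and
# the selected columns learned on X_tr are reused automatically:
# test_pred = pipe.predict(test['cleaned'])

This is the main point of Pipeline: the transform parameters learned during fit are carried along and reapplied consistently to validation and test data, so the validation score is an honest estimate of test performance.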
