What does this version add on top of the basic one?
The basic version feeds the raw text straight into TF-IDF. That is simple and convenient, but not very rigorous. We can process the data further: first lowercase the text, split it into small tokens, then remove stop words such as "the", "a", and "that", use a regular expression to strip out tokens containing digits, and finally lemmatize the remaining words. These steps clean the data before vectorization.
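As a quick illustration of what those steps do to a single string, here is a minimal sketch (the headline is made up, not from the dataset):

```python
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# nltk.download('stopwords'); nltk.download('wordnet')  # first run only

stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

headline = 'The 3 Markets Rallied as Investors Cheered'
tokens = headline.lower().split()          # lowercase + tokenize
cleaned = [lemmatizer.lemmatize(t) for t in tokens
           if t not in stop               # drop stop words
           and not re.search(r'\d', t)]   # drop tokens with digits
print(cleaned)  # ['market', 'rallied', 'investor', 'cheered']
```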
```python
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

# Load the data
data = pd.read_csv('')

# Merge the headline columns so every news item is considered
data['combined_news'] = data.filter(regex='Top.*').apply(lambda x: ''.join(str(x.values)), axis=1)

# Split into train/test sets by date
train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']

# Text preprocessing: lowercase, strip quotes, tokenize
X_train = train['combined_news'].str.lower().str.replace('"', '').str.replace("'", '').str.split()
X_test = test['combined_news'].str.lower().str.replace('"', '').str.replace("'", '').str.split()

# Stop words
stop = stopwords.words('english')

# Detect tokens that contain digits
def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))

# Lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def check(word):
    # True if we keep the word, False if it should be removed
    if word in stop:
        return False
    elif hasNumbers(word):
        return False
    else:
        return True

# Run the whole pipeline over our DataFrame
X_train = X_train.apply(lambda x: [wordnet_lemmatizer.lemmatize(item) for item in x if check(item)])
X_test = X_test.apply(lambda x: [wordnet_lemmatizer.lemmatize(item) for item in x if check(item)])

# External libraries such as sklearn only accept string input,
# so join the token lists back into space-separated strings
X_train = X_train.apply(lambda x: ' '.join(x))
X_test = X_test.apply(lambda x: ' '.join(x))

# Re-fit our vectorizer; the test set is only transformed, never
# fit, so it shares the vocabulary learned from the training set
feature_extraction = TfidfVectorizer(lowercase=False)
X_train = feature_extraction.fit_transform(X_train.values)
X_test = feature_extraction.transform(X_test.values)

# Labels: assuming the dataset's binary 'Label' column
# (as in the Kaggle stock-news data this code appears to target)
y_train = train['Label'].values
y_test = test['Label'].values

# Train the model
clf = SVC(probability=True, kernel='rbf')
clf.fit(X_train, y_train)
predictions = clf.predict_proba(X_test)
print('ROC-AUC yields ' + str(roc_auc_score(y_test, predictions[:, 1])))
```
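Incidentally, much of this cleaning can be pushed into the vectorizer itself. The following is a rough sketch, not a drop-in replacement: sklearn's built-in tokenizer and stop-word list differ slightly from the NLTK pipeline above, and lemmatization is not built in, so that step would still need NLTK.

```python
from sklearn.feature_extraction.text import TfidfVectorizer

# Let the vectorizer lowercase the text, drop English stop words,
# and keep only purely alphabetic tokens of 2+ letters (a token
# touching a digit has no word boundary there, so it is skipped).
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    token_pattern=r'(?u)\b[a-z]{2,}\b',
)
X_train = feature_extraction.fit_transform(train['combined_news'].values)
X_test = feature_extraction.transform(test['combined_news'].values)
```

Keeping the cleaning outside the vectorizer, as in the main code, makes each step explicit and lets us add lemmatization; the built-in route is simply less code when lemmatization is not needed.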