import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
# ------------------------------------------------------------------
# 1. Data loading
# ------------------------------------------------------------------
# Read the training set, the test set, and the product descriptions.
df_train = pd.read_csv('G:/KNNtest/NLP/all/train.csv', encoding='ISO-8859-1')
df_test = pd.read_csv('G:/KNNtest/NLP/all/test.csv', encoding='ISO-8859-1')
df_disc = pd.read_csv('G:/KNNtest/NLP/all/product_descriptions.csv', encoding='ISO-8859-1')
# No heavy per-set processing is needed, so stack train and test into one
# frame and run the text preprocessing a single time over both.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
# Attach each row's product description via its product_uid.
df_all = pd.merge(df_all, df_disc, how='left', on='product_uid')
# df_all.to_csv('G:/KNNtest/NLP/all/mergeData.csv')
# ------------------------------------------------------------------
# 2. Text preprocessing
# The task mostly checks whether query terms are contained in the text,
# so it is enough to normalize the text so that every term has exactly
# one surface form across the whole dataset.
# ------------------------------------------------------------------
stemmer = SnowballStemmer('english')  # word-form normalizer (stemming)
def str_stemmer(s):
    """Lower-case *s*, stem every whitespace-separated word, and re-join
    the stems with single spaces.

    FIX: the original joined with "" (empty string), which glued all the
    stems into one long token; every downstream feature built on
    ``.split()`` (query length, common-word counts) then saw a single
    word and was meaningless.
    """
    return " ".join(stemmer.stem(word) for word in s.lower().split())
def str_common_word(str1, str2):
    """Naive keyword-overlap score: for each whitespace-separated word of
    *str1*, add 1 if it occurs anywhere in *str2* as a substring."""
    hits = 0
    for term in str1.split():
        if str2.find(term) >= 0:
            hits += 1
    return hits
# Stem every text column so each term has one canonical written form.
for _col in ('search_term', 'product_title', 'product_description'):
    df_all[_col] = df_all[_col].map(str_stemmer)
# ------------------------------------------------------------------
# 3. Hand-crafted text features
# ------------------------------------------------------------------
# Query length in words.
df_all['len_of_query'] = df_all['search_term'].map(lambda q: len(q.split())).astype(np.int64)
# How many query words occur in the product title.
df_all['commons_in_title'] = df_all.apply(
    lambda row: str_common_word(row['search_term'], row['product_title']), axis=1)
# How many query words occur in the product description.
df_all['commons_in_desc'] = df_all.apply(
    lambda row: str_common_word(row['search_term'], row['product_description']), axis=1)
# Drop the raw text columns — the regressor cannot consume strings.
df_all = df_all.drop(['search_term', 'product_title', 'product_description'], axis=1)
# ------------------------------------------------------------------
# 4. Rebuild the train / test sets
# ------------------------------------------------------------------
# df_all was built as concat([train, test], ignore_index=True), so rows
# [0, len(train)) are the training rows and the remainder are test rows.
# BUG FIX: the original used df_all.loc[df_test.index]; the raw test
# frame carries its own 0-based RangeIndex, so that selected the FIRST
# len(test) rows of df_all — i.e. TRAINING rows — as the test set.
# Split positionally instead.
num_train = len(df_train)
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
train_ids = df_train['id']
test_ids = df_test['id']
y_train = df_train['relevance'].values  # regression target
X_train = df_train.drop(['id', 'relevance'], axis=1).values
X_test = df_test.drop(['id', 'relevance'], axis=1).values
# ------------------------------------------------------------------
# 5. Model selection: 5-fold CV RMSE across max_depth candidates
# ------------------------------------------------------------------
depths = [1, 3, 5, 6, 7, 8, 9, 10]
cv_rmse = []
for depth in depths:
    model = RandomForestRegressor(n_estimators=30, max_depth=depth)
    neg_mse = cross_val_score(model, X_train, y_train, cv=5,
                              scoring='neg_mean_squared_error')
    cv_rmse.append(np.mean(np.sqrt(-neg_mse)))  # RMSE per fold, averaged
plt.plot(depths, cv_rmse)
plt.title("Param VS CV Error")
plt.show()
# ------------------------------------------------------------------
# 6. Fit the final model and write the submission file
# ------------------------------------------------------------------
# max_depth=7 is picked from the CV curve above.
rf = RandomForestRegressor(n_estimators=30, max_depth=7)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# FIX: write without the DataFrame index — Kaggle expects exactly the
# "id,relevance" columns; the default index adds a stray first column
# and invalidates the submission.
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv(
    'G:/KNNtest/NLP/all/submission.csv', index=False)
# kaggle exercise ---- Home Depot Product Search Relevance
# (blog footer, originally pasted as bare text: 最新推荐文章于 2020-09-28 10:39:42 发布)