"""Keyword-search relevance model (original title: "关键词搜索").

Purpose: predict search relevance from keyword/product-text overlap features.

Parts worth modifying / debugging / upgrading:
  * Text preprocessing: many different cleaning strategies can make the
    text data cleaner.
  * Hand-crafted features: add richer overlap measures (full-query match
    count, overlap ratio, and so on).
  * A better regression model: push the estimator further with the
    Ensemble methods covered in the earlier lecture.

Version 1.0, dated 10.10.2019.

NOTE(review): the original file began with a bare line ``关键词搜索`` above
the docstring, which raised ``NameError`` at import time; that title is now
part of this docstring instead.
"""
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
# ---------------------------------------------------------------------------
# Data loading: train/test splits plus the per-product description table.
# Paths are machine-specific (Kaggle Home Depot data set).
# ---------------------------------------------------------------------------
df_train = pd.read_csv(
    'C:/Users/Administrator/Desktop/七月在线课程下载/word2vec/input/train.csv',
    encoding="ISO-8859-1",
)
df_test = pd.read_csv(
    'C:/Users/Administrator/Desktop/七月在线课程下载/word2vec/input/test.csv',
    encoding="ISO-8859-1",
)
df_desc = pd.read_csv(
    'C:/Users/Administrator/Desktop/七月在线课程下载/word2vec/input/product_descriptions.csv'
)

# Stack train on top of test so every text transform is applied exactly once,
# then attach each row's product description via a left join on product_uid.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all = df_all.merge(df_desc, how='left', on='product_uid')

# Snowball stemmer shared by all the text normalisation below.
stemmer = SnowballStemmer('english')
def str_stemmer(s):
    """Lower-case *s* and stem each whitespace-delimited token.

    :param s: raw text string
    :return: space-joined string of Snowball stems
    """
    tokens = s.lower().split()
    return " ".join(stemmer.stem(tok) for tok in tokens)
# Normalise all three text columns with the same stemming routine.
for _col in ('search_term', 'product_title', 'product_description'):
    df_all[_col] = df_all[_col].map(str_stemmer)

# First numeric feature: token count of the (stemmed) search query.
df_all['len_of_query'] = df_all['search_term'].map(lambda s: len(s.split())).astype(np.int64)
def str_common_word(str1, str2):
    """Count how many whitespace-delimited tokens of *str1* occur in *str2*.

    A token counts as shared when it appears anywhere in *str2* as a
    substring, not only as a whole word (so "oo" matches inside "door").

    :param str1: search query (plain string)
    :param str2: document text (plain string)
    :return: number of query tokens found in *str2*
    """
    # The original debug line ``print(str1.head())`` was removed: str1 is a
    # plain str, so ``.head()`` raised AttributeError on every single call.
    return sum(word in str2 for word in str1.split())
# Overlap features: how many query tokens occur (as substrings) in the
# product title and in the product description, respectively.
for _new_col, _text_col in (('commons_in_title', 'product_title'),
                            ('commons_in_desc', 'product_description')):
    df_all[_new_col] = df_all.apply(
        lambda row, src=_text_col: str_common_word(row['search_term'], row[src]),
        axis=1)

# The raw text columns are no longer needed once the numeric features exist.
df_all = df_all.drop(['search_term', 'product_title', 'product_description'], axis=1)
# Undo the earlier concat: because ignore_index reset df_all to a 0..N-1
# RangeIndex, the first len(df_train) rows are the training portion and the
# remainder is the test portion.
n_train = df_train.shape[0]
df_train = df_all.iloc[:n_train]
df_test = df_all.iloc[n_train:]

# Keep the test ids for the submission file written at the end.
test_ids = df_test['id']

# relevance is the regression target; id is an identifier, not a feature.
y_train = df_train['relevance'].values
X_train = df_train.drop(['id', 'relevance'], axis=1).values
X_test = df_test.drop(['id', 'relevance'], axis=1).values
# ---------------------------------------------------------------------------
# Hyper-parameter search: mean 5-fold CV RMSE for several max_depth settings.
# ---------------------------------------------------------------------------
params = [2, 6, 7, 9]
test_scores = []
for param in params:
    # Fixed typo: the local was previously misspelled ``classfier`` (and it
    # is a regressor, not a classifier).
    regressor = RandomForestRegressor(n_estimators=30, max_depth=param)
    # cross_val_score returns negated MSE; negate and take sqrt to get RMSE.
    rmse = np.sqrt(-cross_val_score(regressor, X_train, y_train,
                                    cv=5, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(rmse))

plt.plot(params, test_scores)
plt.title("Param vs CV Error")
plt.show()  # without show() the figure is never rendered when run as a script
# Train the final model at the hand-picked depth of 6 (presumably chosen
# from the CV curve plotted above — confirm against the plot).
model = RandomForestRegressor(n_estimators=30, max_depth=6)
model.fit(X_train, y_train)

# Kaggle-style submission: one predicted relevance value per test id.
predictions = model.predict(X_test)
pd.DataFrame({"id": test_ids, "relevance": predictions}).to_csv(
    'submission.csv', index=False)