import pandas as pd
import numpy as np
import scipy
import math
import time
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import *
import gc
# Number of most frequent brand names kept as their own category; every
# other brand is relabelled "Other" before one-hot encoding.
NUM_BRANDS = 2500
# `min_df` passed to the CountVectorizer built for the item "name" column.
NAME_MIN_DF = 10
# `max_features` passed to the TfidfVectorizer built for "item_description".
MAX_FEAT_DESCP = 50000
def PLOG(info):
    """Print a progress marker followed by the current local time.

    Args:
        info: Label identifying the pipeline stage; anything ``print``
            accepts.

    Returns:
        None. The line ``<info> :  <timestamp>`` is written to stdout.
    """
    # time.asctime() with no argument formats the current local time, which
    # is what asctime(localtime(time())) spelled out by hand.
    localtime = time.asctime()
    print(info, ': ', localtime)
def __rmsle(y, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` vs ``y``.

    Computes ``sqrt(mean((log(|y_pred| + 1) - log(|y| + 1)) ** 2))``.

    Args:
        y: Sequence of true values (list, NumPy array or pandas Series).
        y_pred: Sequence of predicted values with the same length as ``y``.

    Returns:
        The RMSLE as a Python ``float``.

    Raises:
        AssertionError: If the two inputs differ in length.
        ZeroDivisionError: If the inputs are empty.
    """
    assert len(y) == len(y_pred), "y and y_pred must have the same length"
    # np.asarray discards any pandas index, so values are paired strictly by
    # position instead of by label (y[i] on a Series is a label lookup).
    actual = np.abs(np.asarray(y, dtype=float))
    predicted = np.abs(np.asarray(y_pred, dtype=float))
    squared_log_err = (np.log1p(predicted) - np.log1p(actual)) ** 2
    # Plain Python division keeps the ZeroDivisionError on empty input rather
    # than silently returning NaN.
    return math.sqrt(float(squared_log_err.sum()) / len(y))
# ---------------------------------------------------------------------------
# Pipeline: load train/test, build sparse features from each column over the
# combined frame, fit a Ridge model on log1p(price), and write predictions
# for the test rows to submission.csv.
# ---------------------------------------------------------------------------
print("Reading in Data")
# NOTE: nrows=1000 loads only the first 1000 rows of each file.
df_train = pd.read_csv('./train.tsv', sep='\t', nrows=1000)
df_test = pd.read_csv('./test.tsv', sep='\t', nrows=1000)
PLOG('1111111')

# Stack train on top of test so every vectorizer sees the same vocabulary.
# `axis` is passed by keyword (the positional form is rejected by newer
# pandas) and the index is rebuilt so row labels are unique.
df = pd.concat([df_train, df_test], axis=0, ignore_index=True)
nrow_train = df_train.shape[0]
# The model is trained on log1p(price); predictions are mapped back with expm1.
y_train = np.log1p(df_train["price"])
PLOG('2222222222')

del df_train
gc.collect()
PLOG('333333333')
print(df.memory_usage(deep = True))

#category_name
df["category_name"] = df["category_name"].fillna("Other").astype("category")
count_category = CountVectorizer()
X_category = count_category.fit_transform(df["category_name"])
PLOG('4444444444444')

#brand_name
df["brand_name"] = df["brand_name"].fillna("unknown")
# Keep the NUM_BRANDS most frequent brands; collapse the rest into "Other".
pop_brands = df["brand_name"].value_counts().index[:NUM_BRANDS]
df.loc[~df["brand_name"].isin(pop_brands), "brand_name"] = "Other"
df["brand_name"] = df["brand_name"].astype("category")
vect_brand = LabelBinarizer(sparse_output=True)
X_brand = vect_brand.fit_transform(df["brand_name"])
PLOG('555555555555')

#item_description
df["item_description"] = df["item_description"].fillna("None")
count_descp = TfidfVectorizer(max_features = MAX_FEAT_DESCP,
                              ngram_range = (1,3),
                              stop_words = "english")
X_descp = count_descp.fit_transform(df["item_description"])
PLOG('6666666666666')

#item_condition_id, shipping
df["item_condition_id"] = df["item_condition_id"].astype("category")
# Only item_condition_id (categorical) is one-hot encoded; shipping stays a
# plain numeric column. An explicit integer dtype and a dense frame give
# `.values` a single numeric dtype that csr_matrix can always convert. The
# frame is only a handful of columns wide, so the dense intermediate is small.
X_dummies = scipy.sparse.csr_matrix(pd.get_dummies(df[[
    "item_condition_id", "shipping"]], dtype = np.uint8).values)
PLOG('7777777777777')

#name
count = CountVectorizer(min_df=NAME_MIN_DF)
# CountVectorizer needs strings, so replace any missing names first.
X_name = count.fit_transform(df["name"].fillna(""))
PLOG('8888888888888')

X = scipy.sparse.hstack((X_dummies,
                         X_descp,
                         X_brand,
                         X_category,
                         X_name)).tocsr()
PLOG('99999999999999')

# Rows were stacked train-first, so the first nrow_train rows are training data.
X_train = X[:nrow_train]
model = Ridge(solver = "lsqr", fit_intercept=False)
model.fit(X_train, y_train)
PLOG('10')

X_test = X[nrow_train:]
preds = model.predict(X_test)
# Report the in-sample RMSLE. Test predictions cannot be scored because the
# test rows have no price, so training targets are compared with training
# predictions. Both are mapped back to the price scale first, because
# __rmsle applies the log transform itself. This is an optimistic,
# training-set estimate.
train_preds = model.predict(X_train)
print(__rmsle(np.expm1(np.asarray(y_train)), np.expm1(train_preds)))

df_test["price"] = np.expm1(preds)
df_test[["test_id", "price"]].to_csv("submission.csv", index = False)
PLOG('11')
# Kaggle solution 1: Mercari Price Suggestion Challenge
# (Scraped page footer: "latest recommended article, published 2024-05-26 09:38:06")