#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 28 00:10:58 2018
@author: yingjiezhang
"""
import pandas as pd
import time
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# Baseline text-classification pipeline: TF-IDF features on the 'word_seg'
# column, a linear classifier, and a CSV of predicted classes per test id.

# Wall-clock start time, printed as raw seconds since the epoch.
start = time.time()
print('start..................',start)

# The complete CSV files are loaded into memory here. If memory is tight,
# read them in batches instead (e.g. pd.read_csv(..., chunksize=...)),
# process each piece, and merge the results.
df_train = pd.read_csv('train_set.csv')
df_test = pd.read_csv('test_set.csv')

# Only 'word_seg' is used as model input, so drop the unused columns.
# 'id' is kept on the test frame because it is written to the result file.
df_train.drop(columns = ['article', 'id'], inplace = True)
df_test.drop(columns = ['article'], inplace = True)

# Feature engineering: turn the tokenised text into numeric TF-IDF vectors.
vectorizer = TfidfVectorizer(ngram_range = (1,2), min_df = 3, max_df = 0.9, max_features = 100000)
# Learn the vocabulary and IDF weights from the training text only ...
x_train = vectorizer.fit_transform(df_train['word_seg'])
# ... and reuse that fitted vocabulary for the test text. Calling
# fit_transform here would build a second, unrelated vocabulary, so the
# columns of x_test would no longer line up with the features the model was
# trained on and every prediction would be meaningless, whatever the model.
x_test = vectorizer.transform(df_test['word_seg'])

# Shift the labels down by one so they start at 0; the shift is undone
# before the predictions are written out.
y_train = df_train['class'] - 1

# The model is logistic regression here; other classifiers can be swapped in.
# dual=True is only implemented by the liblinear solver, so the solver is
# named explicitly instead of relying on the library default.
lg = LogisticRegression(C=4, dual = True, solver = 'liblinear')
lg.fit(x_train, y_train)
y_test = lg.predict(x_test)

# Alternative model (LinearSVC), kept for reference:
#     lsvc = LinearSVC()
#     lsvc.fit(x_train, y_train)
#     y_test = lsvc.predict(x_test)

# Restore the original label numbering and write one row per test id.
df_test['class'] = y_test + 1
df_result = df_test.loc[:, ['id', 'class']]
df_result.to_csv('result2.csv', index = False)

# Finish time, printed as a human-readable local timestamp.
end = time.asctime(time.localtime(time.time()))
print('finished...............',end)
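# Results:
#   start..................  1547092356.953075
#   finished...............  Thu Jan 10 12:03:10 2019
#   Leaderboard A score: 0.04
#
# This score is abnormally low, and it stayed around 0.04 even after the
# model was swapped and retuned several times. A score that does not react
# to the choice of model points at the features rather than the classifier:
# the test set must be vectorized with `vectorizer.transform(...)` using the
# vocabulary fitted on the training set. If `fit_transform` is called on the
# test set, a different vocabulary is learned, the feature columns no longer
# match the ones the model was trained on, and the predictions are
# effectively random.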