赛题通道(进入)
闲聊:
这是我第三次参加大数据比赛,也是第一次接触大数据比赛的自然语言处理,下面把现在的代码写成博客保存一下,代码还在不断优化中。。。
正题:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import jieba #利用结巴分词功能进行有效的分词
import re #正则表达式相关的库
from random import shuffle
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
# Load the training data.
df = pd.read_csv('train.csv')
# The sentiment subjects (classes) present in the data (10 in this task).
sub_list = list(df['subject'].unique())
# Map each subject to the list of its segmented Chinese words.
# BUGFIX: the original dict(zip(sub_list, [[]]*10)) created ten
# references to ONE shared list and hard-coded the class count; a dict
# comprehension gives every subject its own list and follows len(sub_list).
sub_word_dict = {sub: [] for sub in sub_list}
# Hand-curated stop-word file; utf-8 is specified explicitly so the
# Chinese text decodes identically on every platform.
with open('stop_word3.txt', 'r', encoding='utf-8') as fp:
    # NOTE(review): readline() reads only the FIRST line — if the file
    # stores one word per line most of it is dropped; verify the format.
    stop_word = fp.readline().split()
# Keep only the Chinese characters of a string.
def Translate(str):
    """Return *str* with every non-Chinese character removed.

    Only code points in the CJK Unified Ideographs range
    (U+4E00-U+9FA5) survive; ASCII, digits, punctuation and whitespace
    are all dropped.  (The parameter name shadows the builtin ``str``
    but is kept for interface compatibility.)
    """
    # One-pass substitution instead of the original split + join.
    return re.sub(r"[^\u4e00-\u9fa5]", '', str)
# For every subject: gather its texts, strip non-Chinese characters,
# segment with jieba (precise mode), and store the tokens.
for subject in sub_list:
    merged = ''.join(list(df[df['subject'] == subject]['content']))
    sub_word_dict[subject] = list(jieba.cut(Translate(merged), cut_all=False))
# Select the most informative feature words by chi-square statistic.
def Jieba_feature(sub_word_dict, number, min_count=17, max_count=1506):
    """Return the *number* words with the highest chi-square scores.

    Parameters
    ----------
    sub_word_dict : dict mapping subject -> list of segmented words.
    number : how many top-scoring words to keep.
    min_count, max_count : global-frequency window; words outside
        [min_count, max_count] get score 0 (defaults preserve the
        previously hard-coded thresholds 17 and 1506).

    Returns a ``{word: True}`` dict usable as an NLTK feature set.
    """
    # Global frequency of every word across all subjects.
    word_fd = {}
    # Per-subject frequency distribution.
    con_word_fd = ConditionalFreqDist()
    # Total word tokens per subject.
    con_word_count = {}
    for sub in sub_word_dict.keys():
        for word in sub_word_dict[sub]:
            word_fd[word] = word_fd.get(word, 0) + 1
            con_word_fd[sub][word] += 1
        # BUGFIX: take the subject's token total AFTER counting the
        # whole subject.  The old `con_word_count.get(sub, temp_num)`
        # froze the value at the count seen on the first word, so
        # total_word_count collapsed to roughly the number of subjects
        # and distorted every chi-square score.
        con_word_count[sub] = con_word_fd[sub].N()
    total_word_count = sum(con_word_count.values())
    # Kept from the original: iterate words in descending global
    # frequency (affects only iteration order, not the scores).
    word_fd = dict(sorted(word_fd.items(), key=lambda x: x[1], reverse=True))
    word_scores = {}  # word -> summed chi-square information
    for word, freq in word_fd.items():
        word_scores[word] = 0
        # Ignore words outside the useful frequency band.
        if min_count > freq or freq > max_count:
            continue
        for sub in sub_word_dict.keys():
            word_scores[word] += BigramAssocMeasures.chi_sq(
                con_word_fd[sub][word],
                (freq, con_word_fd[sub].N()),
                total_word_count)
    # Keep the `number` words carrying the most information.
    best_vals = sorted(word_scores.items(),
                       key=lambda item: item[1],
                       reverse=True)[:number]
    return {word: True for word, score in best_vals}
# Concatenate every sentiment word into one string.  str() is applied
# to each entry exactly as before (so NaN becomes the text 'nan').
temp_str = ''.join(str(w) for w in list(df['sentiment_word']))
temp_word = set(jieba.cut(Translate(temp_str), cut_all=False))
# Feature vocabulary: the 200 most informative words by chi-square.
word_list = list(set(Jieba_feature(sub_word_dict, 200)))
#word_list = list(temp_word)
# Convert the dataframe into (feature-dict, label) training pairs.
def GetData(df, word_list, sub_list):
    """Build NLTK-style training samples from *df*.

    Each row's content is cleaned, segmented with jieba, and turned
    into a feature dict containing the tokens found in *word_list*.

    Parameters
    ----------
    df : DataFrame with 'content' and 'subject' columns.
    word_list : iterable of feature words.
    sub_list : accepted for interface compatibility; unused here.

    Returns a list of ``[feature_dict, subject]`` pairs.
    """
    # Hoist membership into a set: O(1) per token instead of scanning
    # the word list for every segmented word.
    word_set = set(word_list)
    train_list = []
    for index in range(df.shape[0]):
        word_vec = {}
        content = df['content'][index]
        subject = df['subject'][index]
        for word in jieba.cut(Translate(content), cut_all=False):
            if word in word_set:
                # NOTE: the string 'True' (not the bool) is preserved
                # for backward compatibility with the pipeline.
                word_vec[word] = 'True'
        train_list.append([word_vec, subject])
    return train_list
# Relabel the feature pairs with sentiment_value instead of subject.
def GetValueData(data, df):
    """Return a copy of *data* labeled with ``df['sentiment_value']``.

    BUGFIX: the original used ``data.copy()`` — a shallow copy — so
    assigning ``data2[i][1]`` mutated the inner lists shared with the
    caller's *data*.  Each row is now copied, leaving the input intact.
    """
    data2 = [row[:] for row in data]
    for i in range(df.shape[0]):
        data2[i][1] = df['sentiment_value'][i]
    return data2
#Build the trainable dataset: a list of [feature_dict, subject] pairs.
data = GetData(df, word_list, sub_list)
#The task predicts two targets (a sentiment word and a sentiment value);
#uncomment below to relabel `data` with sentiment_value instead:
#for i in range(df.shape[0]):
# data[i][1] = df['sentiment_value'][i]
import sklearn
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV,RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Train *classifier* on a random split and return its test accuracy.
def Score(classifier, data, train_size=6947):
    """Shuffle *data* in place, train on the first *train_size* samples
    and return the accuracy on the remainder.

    Parameters
    ----------
    classifier : an unfitted scikit-learn estimator.
    data : list of [feature_dict, label] pairs.
    train_size : number of training samples; the default preserves the
        previously hard-coded 6947.
    """
    shuffle(data)  # NOTE: mutates the caller's list
    train_data = data[:train_size]
    test_data = data[train_size:]
    test_x, test_y = zip(*test_data)
    # SklearnClassifier adapts a scikit-learn estimator to NLTK's
    # feature-dict interface.
    wrapped = SklearnClassifier(classifier)
    wrapped.train(train_data)
    pred = np.array(wrapped.classify_many(test_x))
    test_y = np.array(test_y)
    return sum(pred == test_y) / len(test_y)
# XGBoost multi-class classifier for the subject task; 'multi:softmax'
# yields one hard class label per sample.  The first three arguments
# look tuned; the remainder restate common xgboost defaults explicitly.
# (Currently unused: the Score(bst_subject, ...) call below is
# commented out.)
bst_subject = XGBClassifier(max_depth=3,
learning_rate=0.36,
n_estimators=100,
silent=0,
objective='multi:softmax',
booster='gbtree',
n_jobs=1,
nthread=None,
gamma=0,
min_child_weight=1,
max_delta_step=0,
subsample=1,
colsample_bytree=1,
colsample_bylevel=1,
reg_alpha=0,
reg_lambda=1,
scale_pos_weight=1,
base_score=0.5,
random_state=0)
# Evaluate each candidate classifier on the subject-prediction task.
print('--------------------------subject-----------------------')
print('BernoulliNB`s accuracy is %f' %Score(BernoulliNB(), data))
# BUGFIX: label previously read "MultinomiaNB" (typo).
print('MultinomialNB`s accuracy is %f' %Score(MultinomialNB(), data))
#print('XGBClassifier1s accuracy is %f' %Score(bst_subject, data))
print('RidgeClassifier`s accuracy is %f' %Score(RidgeClassifier(), data))
# BUGFIX: the next two labels both said "LogisticRegression", although
# the first one actually runs LogisticRegressionCV.
print('LogisticRegressionCV`s accuracy is %f' %Score(LogisticRegressionCV(), data))
print('LogisticRegression`s accuracy is %f' %Score(LogisticRegression(), data))
#print('SVC`s accuracy is %f' %Score(SVC(), data))
print('LinearSVC`s accuracy is %f' %Score(LinearSVC(), data))
# Observed so far: logistic regression yields the highest accuracy.
--------------------------subject-----------------------
BernoulliNB`s accuracy is 0.718667
MultinomiaNB`s accuracy is 0.707000
RidgeClassifier`s accuracy is 0.739333
LogisticRegression`s accuracy is 0.739000
LogisticRegression`s accuracy is 0.721333
LinearSVC`s accuracy is 0.720333