This post uses the HIT (哈工大) LTP segmenter for text preprocessing and a neural network with two hidden layers.
# -*- coding: utf-8 -*-
# @brief: Chinese sentiment analysis with TensorFlow
import numpy as np
import tensorflow as tf
import random
from sklearn.feature_extraction.text import CountVectorizer
import os
import traceback
# Directory containing this file,
# e.g. D:\PycharmProjects2020\tensor1\gensiom_word2vec
real_dir_path = os.path.split(os.path.realpath(__file__))[0]
# Joined data file paths, e.g. D:\PycharmProjects2020\tensor1\gensiom_word2vec\data/pos_bak.txt
pos_file = os.path.join(real_dir_path, 'data/pos_bak.txt')
neg_file = os.path.join(real_dir_path, 'data/neg_bak.txt')
# Use HIT LTP for word segmentation and POS tagging
from pyltp import Segmentor, Postagger
# Load the segmentation model
seg = Segmentor()
seg.load(r'F:\modelmodel\ltp_data_v3.4.0\cws.model')
# Load the POS tagging model
poser = Postagger()
poser.load(r'F:\modelmodel\ltp_data_v3.4.0\pos.model')
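# Optional sanity check (a hypothetical demo, assuming the model paths above are valid):
# words = seg.segment('今天去钓鱼了')
# print(list(words))                 # e.g. ['今天', '去', '钓鱼', '了']
# print(list(poser.postag(words)))   # e.g. ['nt', 'v', 'v', 'u']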
# Path to the stopword list
stop_words_file = os.path.join(real_dir_path, '../util/stopwords.txt')
# POS tags to keep. LTP uses the 863 POS tag set; see the LTP POS tag set documentation for details.
allow_pos_ltp = ('a', 'i', 'j', 'n', 'nh', 'ni', 'nl', 'ns', 'nt', 'nz', 'v', 'ws')
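# Roughly: a=adjective, i=idiom, j=abbreviation, n=general noun, nh=person name,
# ni=organization name, nl=location noun, ns=geographical name, nt=temporal noun,
# nz=other proper noun, v=verb, ws=foreign words (see the LTP 863 tag set docs).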
# Segment, remove stopwords, and filter by POS.
# e.g. this turns s = "今 天去 钓 鱼了 你 去么" into ['今天', '钓鱼']
def cut_stopword_pos(s):
    # s.split() turns "今 天去 钓 鱼了 你 去么" into "今天去钓鱼了你去么"
    # so that segment() can tokenize it properly
    words = seg.segment(''.join(s.split()))
    # POS-tag the words, e.g. 今天|去|钓鱼|了|你|去|么
    #                     ->  nt  |v |v   |u |r |v |u
    poses = poser.postag(words)
    # Build the stopword lookup, e.g. {',': None, '?': None, '、': None, '。': None, ...}
    stopwords = {}.fromkeys([line.rstrip() for line in open(stop_words_file, encoding='UTF-8')])
    sentence = []
    # enumerate(poses) walks over (index, tag) pairs, e.g.
    # 0 nt
    # 1 wp
    # 2 n
    # 3 v
    for i, pos in enumerate(poses):
        if (pos in allow_pos_ltp) and (words[i] not in stopwords):
            sentence.append(words[i])
    return sentence
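# Quick demo, reproducing the example in the comment above the function
# (the exact output depends on the segmentation model and stopword list):
# print(cut_stopword_pos("今 天去 钓 鱼了 你 去么"))   # roughly ['今天', '钓鱼']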
# Read both files and build a vocabulary like dict_keys(['心得', '勘误', '疑点', '兴趣', '朋友', '访问', '网站', '交流', '切磋', 'www', ...])
def create_vocab(pos_file, neg_file):
    def process_file(file_path):
        with open(file_path, 'r', encoding='UTF-8') as f:
            v = []
            lines = f.readlines()
            for line in lines:
                sentence = cut_stopword_pos(line)
                v.append(' '.join(sentence))
            return v
    sent = process_file(pos_file)
    sent += process_file(neg_file)
    # max_df=0.9 drops tokens that appear in more than 90% of the reviews
    tf_v = CountVectorizer(max_df=0.9, min_df=1)
    # fitting builds the vocabulary; the count matrix itself is not needed here
    tf_matrix = tf_v.fit_transform(sent)
    # print(tf_v.vocabulary_)
    return tf_v.vocabulary_.keys()
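# Note: CountVectorizer.vocabulary_ maps each token to its column index,
# e.g. {'心得': 12, '朋友': 7, ...}, so .keys() yields the token set.
# Tiny self-contained illustration (toy corpus, not the review data):
# cv = CountVectorizer()
# print(cv.fit(['今天 钓鱼', '朋友 钓鱼']).vocabulary_)   # e.g. {'今天': 0, '朋友': 1, '钓鱼': 2}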
# Build the vocabulary
vocab = create_vocab(pos_file, neg_file)
# Convert each review into a vector based on the vocabulary
def normalize_dataset(vocab):
    dataset = []
    # vocab: the vocabulary; review: one review; clf: the review's label,
    # [0, 1] for a negative review, [1, 0] for a positive one
    def string_to_vector(vocab, review, clf):
        words = cut_stopword_pos(review)  # list of str