# 总流程 (overall pipeline)
# TextCNN
# 京东评论数据 (JD.com review data)
# 读取数据 (read the data)
import numpy as np
from sklearn.utils import shuffle
import os
import pandas as pd
import matplotlib.pyplot as plt
import jieba
corpus_neg_dir = 'neg'
corpus_pos_dir = 'pos'
dataset = './data/datasets/'
apple_data_dir = 'corpus/'
def get_file_content(path, type):
    """Read every CSV file directly inside *path* into one DataFrame.

    Parameters
    ----------
    path : str
        Directory containing the corpus CSV files (not searched recursively).
    type : int
        Sentiment label attached to every row (0 = negative, 1 = positive).

    Returns
    -------
    pandas.DataFrame
        All rows from all CSVs (as strings), plus a 'type' label column.
        Empty DataFrame if the directory holds no files.
    """
    frames = []
    for name in os.listdir(path):
        full = os.path.join(path, name)
        if os.path.isfile(full):
            # gb18030 is a superset of GBK/GB2312, covering legacy Chinese CSVs.
            one = pd.read_csv(full, encoding='gb18030').astype(str)
            # BUG FIX: the original `pd_one.type = type` set an instance
            # attribute rather than creating a column, so the label was
            # silently dropped when frames were combined.
            one['type'] = type
            frames.append(one)
    if not frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0 and was O(n^2) in a loop;
    # a single concat over the collected frames is the supported form.
    return pd.concat(frames, ignore_index=True)
negative = get_file_content(corpus_neg_dir, 0)
positive = get_file_content(corpus_pos_dir, 1)
def get_balance_corpus(corpus_size, corpus_pos, corpus_neg, random_state=None):
    """Build a class-balanced corpus of roughly *corpus_size* rows.

    Draws corpus_size // 2 rows from each corpus.  When a corpus holds
    fewer rows than its quota, sampling falls back to drawing with
    replacement so the output stays balanced.

    Parameters
    ----------
    corpus_size : int
        Desired total row count (halved per class; odd values floor).
    corpus_pos, corpus_neg : pandas.DataFrame
        Positive and negative review corpora.
    random_state : int, optional
        Seed forwarded to DataFrame.sample for reproducible draws.
        Default None preserves the original non-deterministic behaviour.

    Returns
    -------
    pandas.DataFrame
        Positive sample followed by negative sample.
    """
    half = corpus_size // 2
    samples = [
        corpus.sample(half,
                      replace=corpus.shape[0] < half,
                      random_state=random_state)
        for corpus in (corpus_pos, corpus_neg)  # pos first, as before
    ]
    return pd.concat(samples)
ChnSentiCorp_fruit_40000 = get_balance_corpus(20000, positive, negative)
data = ChnSentiCorp_fruit_40000
data = shuffle(data, random_state=1)
review = data.content
label = data.type
stopword = []
with open("stopword.txt","r",encoding="utf-8") as f:
for w in f.read().splitlines():
stopword.append(w)
with open("cut_all_data.txt", "a", encoding="utf-8") as f:
for line in review:
text_cut = list(jieba.cut(line))
filter_word = [w for w in text_cut if w not in stopword]
for fw in filter_word:
f.write(str(fw.strip()))
f.write(str(" "))
f.write("\n")
with open("all_label.txt", "a", encoding="utf-8") as f:
for la in label:
f.write(str(la))
f.write("\n")
with open("all_data.txt", "a", encoding="utf-8") as f:
for line in review:
text_cut = list(jieba.cut(line))
filter_word = [w for w in text_c