# -*- coding: utf-8 -*-
import re
import string
import jieba
import copy
import time
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score,confusion_matrix
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
from torchtext import data
from torchtext.vocab import Vectors
# read the full dataset; 'all_data' avoids shadowing the torchtext.data module imported above
all_data = pd.read_csv('all_data.csv', header=0, encoding='utf-8',
                       on_bad_lines='skip')  # error_bad_lines was removed in pandas 2.0
# shuffle, then split 70% / 15% / 15% into train / validation / test;
# drop=True keeps the stray 'index' column out, and .copy() avoids
# SettingWithCopyWarning when the 'cutword' column is added later
shuffle_df = all_data.sample(frac=1).reset_index(drop=True)
train_df = shuffle_df.iloc[0:int(len(shuffle_df) * 0.7), :].copy()
val_df = shuffle_df.iloc[int(len(shuffle_df) * 0.7):int(len(shuffle_df) * 0.85), :].copy()
test_df = shuffle_df.iloc[int(len(shuffle_df) * 0.85):, :].copy()
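# A quick sanity check (optional sketch): the three slices should be disjoint
# and together cover the whole shuffled dataset.
assert len(train_df) + len(val_df) + len(test_df) == len(shuffle_df)
print(f"train: {len(train_df)}, val: {len(val_df)}, test: {len(test_df)}")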
# load the stop-word list, one word per line
stop_word_list = []
with open("stopwords.txt", encoding="utf-8") as f:  # assumes a UTF-8 stopword file
    for line in f:
        stop_word_list.append(line.strip())
stop_word = set(stop_word_list)
def chinese_pre(text_data):
    # lowercase letters and strip digits
    text_data = text_data.lower()
    text_data = re.sub(r"\d+", "", text_data)
    # segment with jieba in accurate (non-full) mode
    text_data = list(jieba.cut(text_data, cut_all=False))
    # drop stop words and whitespace-only tokens
    text_data = [word.strip() for word in text_data if word not in stop_word and word.strip()]
    # join the remaining tokens into a space-separated string
    text_data = " ".join(text_data)
    return text_data
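# Quick smoke test on a hypothetical sentence (not taken from the dataset);
# the exact tokens depend on jieba's dictionary and on stopwords.txt.
print(chinese_pre("2023年的深度学习模型发展得很快"))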
# segment every training text ('print(i)' is a crude progress indicator)
train_text_list = list(train_df['text'])
train_cutword_list = []
for i in range(len(train_text_list)):
    print(i)
    train_cutword_list.append(chinese_pre(train_text_list[i]))
val_text_list = list(val_df['text'])
val_cutword_list = []
for i in range(len(val_text_list)):
    print(i)
    val_cutword_list.append(chinese_pre(val_text_list[i]))
test_text_list = list(test_df['text'])
test_cutword_list = []
for i in range(len(test_text_list)):
    print(i)
    test_cutword_list.append(chinese_pre(test_text_list[i]))
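# A more compact alternative (a sketch, equivalent to the three loops above
# minus the per-row progress printing): map chinese_pre over each column.
# train_cutword_list = train_df['text'].apply(chinese_pre).tolist()
# val_cutword_list = val_df['text'].apply(chinese_pre).tolist()
# test_cutword_list = test_df['text'].apply(chinese_pre).tolist()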
# attach the segmented text; plain list assignment follows row order and
# avoids the index-alignment pitfalls of assigning a new DataFrame
train_df['cutword'] = train_cutword_list
val_df['cutword'] = val_cutword_list
test_df['cutword'] = test_cutword_list
val_df