THUCNews
数据下载
中文数据集:THUCNews
THUCNews数据子集:https://pan.baidu.com/s/1hugrfRu 密码:qfud
读取数据与分词
以测试集为例:
import pandas as pd
import numpy as np
# Relative paths to the three splits of the cnews data.
train_file = 'cnews/cnews.train.txt'
val_file = 'cnews/cnews.val.txt'
test_file = 'cnews/cnews.test.txt'

# Test split: read as tab-separated text with no header row; the two columns
# are named 'label' and 'content'.
test_data = pd.read_csv(
    test_file,
    sep='\t',
    engine='python',
    names=['label', 'content'],
    encoding='UTF-8',
)
print(test_data.shape)
# Bare expression: shown as cell output in a notebook, discarded when this
# runs as a plain script.
test_data.tail()
from multiprocessing import Pool, cpu_count
import re
import pkuseg
# Character class matching any single whitespace character, digit, or one of
# the listed punctuation/symbol characters.
# NOTE(review): the name suggests matches are stripped from text before
# segmentation — confirm against the code that uses it.
# Written as a raw string so `\s` and `\d` reach the regex engine verbatim
# instead of relying on Python's invalid-string-escape fallback, which emits
# a warning on current interpreters.
remove = re.compile(r'[\s\d,。?!~:“”;,.:?"!~$%^&@#¥#*()()、|/]')