@[TOC]基于fastText的新闻文本分类算法
基于fastText的新闻文本分类算法实例
本项目的目的是对数据庞大、杂乱无章的意见箱意见实现分类,使得每种意见都有相应的类型(使用fastText模型)。
代码如下:
项目用到的数据集:
csv文件数据格式:theme表示意见主题,details表示意见内容
一、读数据
import jieba
import pandas as pd
import random
import fasttext
# Read the raw data.
# one_build.csv       : urban/rural construction opinions
# one_education.csv   : education / culture / sports opinions
# one_environment.csv : environmental protection opinions
# NOTE(review): the original comments referred to "*1.csv" filenames; the code
# below actually reads the names without the "1" suffix — confirm which is right.
df_one_build = pd.read_csv("./data/one_build.csv", encoding='utf-8')
df_one_education = pd.read_csv("./data/one_education.csv", encoding='utf-8')
df_one_environment = pd.read_csv("./data/one_environment.csv", encoding='utf-8')
二、对数据进行预处理(过滤无效数据)
# Preprocessing: strip noise, merge theme + details, drop empty rows.
def _clean(df):
    """Clean one opinion DataFrame in place and return it.

    Removes all ASCII characters (letters, digits, punctuation) from the
    'theme' and 'details' columns, prepends the theme to the details so a
    single text field carries both, then drops rows with missing values.
    """
    for col in ('theme', 'details'):
        # \x00-\x7F matches every ASCII char; only CJK text is kept.
        df[col].replace({r'[\x00-\x7F]+': ''}, regex=True, inplace=True)
    df['details'] = df['theme'].map(str) + " " + df['details'].map(str)
    return df.dropna()

df_one_build = _clean(df_one_build)
df_one_education = _clean(df_one_education)
df_one_environment = _clean(df_one_environment)

# Convert the cleaned text column to plain lists for segmentation.
one_build = df_one_build.details.values.tolist()
one_education = df_one_education.details.values.tolist()
one_environment = df_one_environment.details.values.tolist()

# fix: the first key was 'df_one_build', inconsistent with the other two
# (the dict maps category name -> numeric id; currently unused downstream).
cate_dic = {'one_build': 1, 'one_education': 2, 'one_environment': 3}

# Stopword list: one word per line, tab-separated file, no quoting.
stopwords = pd.read_csv("data/stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'])
stopwords = stopwords['stopword'].values
三、分词
# Word segmentation.
def preprocess_text(content_lines, sentences, category):
    """Segment each raw text line and append fastText training samples.

    Each output line has the shape ``__label__<category> , tok1 tok2 ...``.

    Args:
        content_lines: iterable of raw text strings.
        sentences: output list of formatted samples; mutated in place.
        category: label value appended after the ``__label__`` prefix.
    """
    for line in content_lines:
        try:
            segs = jieba.lcut(line)
            # Keep tokens longer than one character that are not stopwords.
            segs = [w for w in segs if len(w) > 1 and w not in stopwords]
            sentences.append("__label__" + str(category) + " , " + " ".join(segs))
        except Exception:
            # fix: was `except BaseException`, which also swallows
            # KeyboardInterrupt/SystemExit; log the offending line and move on.
            print(line)
            continue
四、生成数据集并训练模型
# Build the training set: one labelled, segmented sample list per category.
sentences = []
preprocess_text(one_build, sentences, 'one_build')
preprocess_text(one_education, sentences, 'one_education')
preprocess_text(one_environment, sentences, 'one_environment')
# Shuffle so the label order does not bias training.
random.shuffle(sentences)

# Write the samples to file, one per line.
# fix: the original wrote 'train_data1.txt' while training read
# 'train_data.txt' — the model never saw this data. Use the same name.
# fix: use `with` so the file is flushed and closed, and set the
# encoding explicitly for the Chinese text.
with open('train_data.txt', 'w', encoding='utf-8') as out:
    out.writelines(str(sentence) + "\n" for sentence in sentences)
# Train the fastText supervised classifier on the file written above.
hyperparams = {
    'lr': 0.1,            # learning rate
    'epoch': 14,          # training passes over the data
    'wordNgrams': 2,      # use bigram features
    'loss': 'softmax',    # plain softmax loss
    'label': '__label__', # label prefix used in the training file
}
model = fasttext.train_supervised(input='train_data.txt', **hyperparams)

# Evaluate on the training file itself — this is training accuracy,
# not held-out performance.
result = model.test('train_data.txt')
print('result= ', result)
五、结果
感觉效果还可以!