项目数据集源码在这里:
https://github.com/yingdajun/-fastnlp-Demo
这里是通过借鉴这篇博客:
https://www.jianshu.com/p/70c37deaac26
然后在此基础上进行改进。
1.导入包+初始化
#导入Pytorch包
import torch
import torch.nn as nn
from fastNLP.io.loader import CSVLoader
# Loaders for the tab-separated data files. Training rows carry the raw
# text plus its label; the held-out test file has the text column only.
dataset_loader = CSVLoader(headers=('raw_words', 'target'), sep='\t')
testset_loader = CSVLoader(headers=('raw_words',), sep='\t')
2. 数据读取
# 表示将CSV文件中每一行的第一项将填入'raw_words' field,第二项填入'target' field。
# 其中项之间由'\t'分割开来
# Tab-separated input files: train has text + label, test has text only.
train_path=r'train_shuffle.txt'
test_path=r'new_test_handout.txt'
# NOTE(review): `_load` is a private fastNLP method that returns a DataSet
# directly (the public `load` returns a DataBundle) — confirm it is stable
# across fastNLP versions before upgrading.
dataset = dataset_loader._load(train_path)
testset = testset_loader._load(test_path)
3.数据处理
# 将句子分成单词形式, 详见DataSet.apply()方法
import jieba
from itertools import chain
print(jieba.__version__)
def get_tokenized(data, words=True):
    """Segment Chinese text with jieba (precise mode).

    @params:
        data: a single text string when ``words=True``, or an iterable of
            text strings when ``words=False``.
        words: True  -> treat ``data`` as one sentence; return its tokens.
               False -> treat ``data`` as a batch; return one token list
                        per element.
    @return: a list of tokens, or a list of token lists (batch mode).
    """
    def tokenizer(text):
        # cut_all=False selects jieba's precise (non-exhaustive) mode.
        return list(jieba.cut(text, cut_all=False))
    if words:
        return tokenizer(data)
    # Batch mode: segment each review separately.
    return [tokenizer(review) for review in data]
# Derive the model-input fields from the raw text on both datasets.
dataset.apply(lambda ins: get_tokenized(ins['raw_words']), new_field_name='words', is_input=True)
dataset.apply(lambda ins: len(ins['words']), new_field_name='seq_len', is_input=True)
# The label column is read from CSV as a string; cast it to int for the loss.
dataset.apply(lambda x: int(x['target']), new_field_name='target', is_target=True)
testset.apply(lambda ins: get_tokenized(ins['raw_words']), new_field_name='words', is_input=True)
testset.apply(lambda ins: len(ins['words']), new_field_name='seq_len', is_input=True)
###
from fastNLP import Vocabulary
# Split the DataSet by `ratio`: the first returned DataSet holds
# (1 - ratio) of the rows, the second holds `ratio` of them —
# so train_data gets 90% and dev_data 10% (shuffling disabled).
train_data, dev_data = dataset.split(0.1, shuffle=False)
# Build the vocabulary over the full training dataset (tokens seen fewer
# than 2 times map to <unk>), then replace each token with its index in
# the train, dev and test sets in place.
vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')
vocab.index_dataset(train_data, dev_data, testset, field_name='words', new_field_name='words')
4. 模型搭建
# 预处理模型
from fastNLP.embeddings import StaticEmbedding,StackEmbedding
# Pre-trained 100-d Chinese embeddings shipped with fastNLP; min_freq=2
# mirrors the vocabulary cutoff above.
# NOTE(review): 'cn-char-fastnlp-100d' is a character-level embedding but
# the vocab holds jieba *word* tokens — most words will fall back to the
# unknown vector; confirm this pairing is intended.
fastnlp_embed = StaticEmbedding(vocab, model_dir_or_name='cn-char-fastnlp-100d',min_freq=2)
from fastNLP.models import CNNText
# Binary text classifier: CNN over the embedded token sequence.
model_CNN = CNNText(fastnlp_embed, num_classes=2,dropout=0.1)
print(model_CNN)
5.进行训练
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric,BCELoss
# CrossEntropyLoss matches the int 'target' field; dev-set accuracy is the
# model-selection metric. All other Trainer settings use fastNLP defaults.
trainer_CNN = Trainer(model=model_CNN, train_data=train_data, dev_data=dev_data,loss=CrossEntropyLoss(), metrics=AccuracyMetric())
trainer_CNN.train()
6.导出结果
# Raw per-row predictions, kept at module level for ad-hoc inspection.
demo = []

# Batch prediction over a DataSet.
import pandas as pd
import torch


def batch_predict(model, data):
    """Run `model` over every row of `data` and collect the predictions.

    @params:
        model: a trained fastNLP model exposing ``predict``; the returned
            dict's 'pred' entry is assumed to be a tensor of class indices.
        data: a DataSet-like object with ``__len__`` and an indexable
            ``words`` field of token-id lists.
    @return: a DataFrame with one int 'Prediction' column, one row per
        input row.
    """
    predictions = []
    for i in range(len(data)):
        token_ids = torch.tensor(data.words[i])
        # The model expects a batch dimension: shape (1, seq_len).
        output = model.predict(token_ids.view(1, -1))
        label = output['pred'].numpy()[0]
        demo.append(label)
        predictions.append(float(label))
    # Build the frame once instead of appending row by row:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
    # and per-row appends are quadratic.
    submission = pd.DataFrame({'Prediction': predictions})
    submission['Prediction'] = submission.Prediction.astype(int)
    return submission
# Run inference on the test set and write the submission file to upload.
# summission_path = r'data\Comments9120'
submission = batch_predict(model_CNN,testset)
# NOTE(review): the DataFrame index column is written too; most submission
# formats expect index=False (as in the commented-out variant) — confirm.
submission.to_csv('fastnlpDemo.csv',encoding='utf-8')
# submission.to_csv(summission_path+'\submission-CNN-20200229-words.csv', index=False)