小黑fastNLP成长日记1:DataSet构建

DataSet的构建

字典构建

from fastNLP import DataSet

# Build a DataSet from a dict: every key becomes a field (column) and every
# value is a list holding one entry per instance (row).
data = {
    'raw_words': [
        "This is the first instance .",
        "Second instance .",
        "Third instance .",
    ],
    'words': [
        ['this', 'is', 'the', 'first', 'instance', '.'],
        ['Second', 'instance', '.'],
        ['Third', 'instance', '.'],
    ],
    'seq_len': [6, 3, 3],
}
dataset = DataSet(data)
print(dataset)

+--------------------------+--------------------------+---------+
| raw_words                | words                    | seq_len |
+--------------------------+--------------------------+---------+
| This is the first ins... | ['this', 'is', 'the',... | 6       |
| Second instance .        | ['Second', 'instance'... | 3       |
| Third instance .         | ['Third', 'instance',... | 3       |
+--------------------------+--------------------------+---------+

使用append向DataSet中增加数据

from fastNLP import Instance

# An Instance is a single row; its keyword arguments are the row's fields.
# raw_words ends with " ." so that it agrees with the six tokens in `words`
# (and with seq_len=6), like the other raw sentences in this tutorial.
instance = Instance(
    raw_words="This is the fourth instance .",
    words=['this', 'is', 'the', 'fourth', 'instance', '.'],
    seq_len=6,
)
# append() adds the row to the existing dataset (no reassignment needed).
dataset.append(instance)
print(dataset)

+--------------------------+--------------------------+---------+
| raw_words                | words                    | seq_len |
+--------------------------+--------------------------+---------+
| This is the first ins... | ['this', 'is', 'the',... | 6       |
| Second instance .        | ['Second', 'instance'... | 3       |
| Third instance .         | ['Third', 'instance',... | 3       |
| This is the fourth in... | ['this', 'is', 'the',... | 6       |
+--------------------------+--------------------------+---------+

Instance方式构建DataSet

from fastNLP import DataSet
from fastNLP import Instance

# Build a DataSet from a list of Instance objects, one Instance per row.
# The first raw sentence ends with " ." so that it agrees with its six-token
# `words` list (seq_len=6), matching the second instance's convention.
dataset = DataSet([
    Instance(
        raw_words="This is the first instance .",
        words=['this', 'is', 'the', 'first', 'instance', '.'],
        seq_len=6,
    ),
    Instance(
        raw_words="Second instance .",
        words=['Second', 'instance', '.'],
        seq_len=3,
    ),
])
print(dataset)

+--------------------------+--------------------------+---------+
| raw_words                | words                    | seq_len |
+--------------------------+--------------------------+---------+
| This is the first ins... | ['this', 'is', 'the',... | 6       |
| Second instance .        | ['Second', 'instance'... | 3       |
+--------------------------+--------------------------+---------+

dataset的删除

from fastNLP import DataSet

dataset = DataSet({'a': range(-5, 5), 'c': [0] * 10})

# With inplace=False, drop() leaves `dataset` unchanged and returns a new
# DataSet without the instances that satisfy the condition.
dropped_dataset = dataset.drop(
    lambda instance: instance['a'] < 0,
    inplace=False,
)
print('条件删除a<0:', dropped_dataset)

# delete_instance(1) removes the row at index 1 (the second row) from
# `dataset` itself; the printed result no longer contains a == -4.
after_delete = dataset.delete_instance(1)
print('删除第2个元素:', after_delete)

# has_field() checks whether a field exists.
# (A field can be removed with dataset.delete_field('a').)
a_exists = dataset.has_field('a')
print('a列存在嘛?', a_exists)

renamed = dataset.rename_field('c', 'b')
print('将c列名称改为b:', renamed)

row_count = len(dataset)
print('dataset的长度:', row_count)

条件删除a<0: +---+---+
| a | c |
+---+---+
| 0 | 0 |
| 1 | 0 |
| 2 | 0 |
| 3 | 0 |
| 4 | 0 |
+---+---+
删除第2个元素: +----+---+
| a  | c |
+----+---+
| -5 | 0 |
| -3 | 0 |
| -2 | 0 |
| -1 | 0 |
| 0  | 0 |
| 1  | 0 |
| 2  | 0 |
| 3  | 0 |
| 4  | 0 |
+----+---+
a列存在嘛? True
将c列名称改为b: +----+---+
| a  | b |
+----+---+
| -5 | 0 |
| -3 | 0 |
| -2 | 0 |
| -1 | 0 |
| 0  | 0 |
| 1  | 0 |
| 2  | 0 |
| 3  | 0 |
| 4  | 0 |
+----+---+
dataset的长度: 9

简单数据预处理

from fastNLP import DataSet

data = {
    'raw_words': [
        "This is the first instance .",
        "Second instance .",
        "Third instance .",
    ],
}
dataset = DataSet(data)

# apply(): the callable receives a whole instance; its result is stored in
# the new 'words' field.
dataset.apply(
    lambda instance: instance['raw_words'].split(),
    new_field_name='words',
)

# apply_field(): the callable receives only the value of `field_name`.
dataset.apply_field(
    lambda sentence: sentence.split(),
    field_name='raw_words',
    new_field_name='new_words',
)
# Define a named function that produces the values for a new field.
def get_words(instance):
    """Return the whitespace-separated tokens of ``instance['raw_words']``."""
    return instance['raw_words'].split()
# Pass the named function to apply(); its per-instance return value is stored
# in the new 'func_words' field.
dataset.apply(get_words, new_field_name='func_words')
# Bare expression: presumably evaluated in a notebook/REPL, where it displays
# the dataset (in a plain script this line prints nothing).
dataset

+---------------------+---------------------+---------------------+---------------------+
| raw_words           | words               | new_words           | func_words          |
+---------------------+---------------------+---------------------+---------------------+
| This is the firs... | ['This', 'is', '... | ['This', 'is', '... | ['This', 'is', '... |
| Second instance ... | ['Second', 'inst... | ['Second', 'inst... | ['Second', 'inst... |
| Third instance ...  | ['Third', 'insta... | ['Third', 'insta... | ['Third', 'insta... |
+---------------------+---------------------+---------------------+---------------------+

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值