实现word2sequence

27 篇文章 1 订阅
4 篇文章 0 订阅

# coding=utf-8

"""
author: lei
function: 构建词典,实现把句子转换为数字序列的方法,并支持将数字序列反向转换回句子
"""

class Word2Sequence(object):
    """Vocabulary that maps words to integer ids and back again.

    Typical usage: call :meth:`fit` once per tokenised sentence to
    accumulate word frequencies, call :meth:`build_vocab` to assign ids,
    then use :meth:`transform` / :meth:`inverse_transform` to convert
    between word lists and id lists.

    Instance attributes:
        dict: word -> id mapping; always contains ``UNK_TAG`` and ``PAD_TAG``.
        count: word -> frequency accumulated by ``fit`` (filtered in place
            by ``build_vocab``).
        inverse_dict: id -> word mapping, refreshed by ``build_vocab``.
    """

    UNK_TAG = "UNK"  # stands in for out-of-vocabulary words
    PAD_TAG = "PAD"  # used to pad sentences shorter than ``max_len``

    UNK = 0
    PAD = 1

    def __init__(self):
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD,
        }
        self.count = {}  # word -> number of occurrences seen by fit()
        # Available immediately so inverse_transform() works (for the two
        # special tags) even before build_vocab() has been called.
        self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def fit(self, sentence):
        """Add the words of a single sentence to the frequency table.

        :param sentence: iterable of words, e.g. ``[word1, word2, ...]``
        :return: None
        """
        for word in sentence:
            # Existing words get +1; unseen words start from 0 + 1.
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self, min=5, max=None, max_features=None):
        """Build the word <-> id mappings from the collected frequencies.

        ``self.count`` is filtered in place by the limits below; every word
        that survives and is not yet in ``self.dict`` receives the next
        free id. Ids already assigned (including UNK/PAD) are never changed,
        so calling this method repeatedly keeps the ids unique.

        Note: ``min`` and ``max`` shadow the builtins inside this method;
        the names are kept because they are part of the public signature.

        :param min: minimum number of occurrences (inclusive); ``None``
            disables the lower bound
        :param max: maximum number of occurrences (inclusive); ``None``
            disables the upper bound
        :param max_features: keep at most this many words, most frequent
            first; ``None`` keeps all
        :return: None
        """
        # Drop words that occur fewer than ``min`` times.
        if min is not None:
            self.count = {
                word: freq for word, freq in self.count.items() if freq >= min
            }
        # Drop words that occur more than ``max`` times.
        if max is not None:
            self.count = {
                word: freq for word, freq in self.count.items() if freq <= max
            }
        # Keep only the ``max_features`` most frequent words. The sort is
        # stable, so ties keep their first-seen order.
        if max_features is not None:
            ranked = sorted(
                self.count.items(), key=lambda item: item[-1], reverse=True
            )
            self.count = dict(ranked[:max_features])

        for word in self.count:
            # Skip words that already own an id: re-assigning would create
            # duplicate ids on a second call and could clobber UNK/PAD.
            if word not in self.dict:
                self.dict[word] = len(self.dict)

        # Reverse mapping id -> word.
        self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def transform(self, sentence, max_len=None):
        """Convert a sentence into a list of ids.

        Words missing from the vocabulary map to ``UNK``.

        :param sentence: sequence of words, e.g. ``[word1, word2, ...]``
        :param max_len: if given, pad with ``PAD`` or truncate so the
            result has exactly this length
        :return: list of int ids
        """
        # Work on a list copy so tuples/other iterables are accepted and
        # the caller's object is never modified.
        words = list(sentence)

        if max_len is not None:
            if max_len > len(words):
                # Pad
                words = words + [self.PAD_TAG] * (max_len - len(words))
            elif max_len < len(words):
                # Truncate
                words = words[:max_len]

        return [self.dict.get(word, self.UNK) for word in words]

    def inverse_transform(self, indices):
        """Convert a list of ids back into words.

        :param indices: iterable of ids, e.g. ``[1, 2, 3, 4, ...]``
        :return: list of words; an id with no vocabulary entry yields ``None``
        """
        return [self.inverse_dict.get(idx) for idx in indices]

    def __len__(self):
        """Return the vocabulary size, including the UNK and PAD entries."""
        return len(self.dict)

if __name__ == "__main__":
    # Small demo: collect word counts from two sentences, build the
    # vocabulary, then round-trip a sentence containing unseen words.
    ws = Word2Sequence()
    ws.fit(["我", "是", "谁"])
    ws.fit(["我", "是", "我"])
    ws.build_vocab(min=0)
    print(ws.dict)

    # Unknown words become UNK and the result is padded to length 10.
    ret = ws.transform(["我", "爱", "北京"], max_len=10)
    print(ret)
    ret = ws.inverse_transform(ret)
    print(ret)
  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值