[Code Study] DeepSC--preprocess_text.py

Deep Learning Enabled Semantic Communication Systems

Implemented functionality:

1. Normalize the input string into a clean, well-formed text string (handle whitespace and special symbols);
2. Filter the text, keeping only sentences within a specified length range;
3. Add start and end tokens;
4. Keep or remove selected punctuation marks;
5. Build the vocabulary and count how often each word appears across the sentences.

import unicodedata
import re
from w3lib.html import remove_tags
import pickle
import argparse
import os
import json
from tqdm import tqdm
from parameters import para_config

# parser = argparse.ArgumentParser()
# parser.add_argument('--input-data-dir', default='europarl/en', type=str)
# parser.add_argument('--output-train-dir', default='europarl/train_data.pkl', type=str)
# parser.add_argument('--output-test-dir', default='europarl/test_data.pkl', type=str)
# parser.add_argument('--output-vocab', default='europarl/vocab.json', type=str)

SPECIAL_TOKENS = {
  '<PAD>': 0,
  '<START>': 1,
  '<END>': 2,
  '<UNK>': 3,
}
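# Note (not part of the original script): <PAD> pads sentences to a common length,
# <START>/<END> mark sentence boundaries, and <UNK> stands for out-of-vocabulary words.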

def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

# Normalize the input string into a clean, well-formed text string
def normalize_string(s):
    # normalize unicode characters
    s = unicode_to_ascii(s)
    # remove the XML-tags
    s = remove_tags(s)
    # add white space before !.? so the punctuation is separated from the preceding word
    s = re.sub(r'([!.?])', r' \1', s)
    # replace every character other than letters, '.', '!' and '?' with a space
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    # collapse runs of whitespace (\s+) into a single space
    s = re.sub(r'\s+', r' ', s)
    # convert the string to lower case
    s = s.lower()
    return s
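# Illustrative example (not part of the original script):
#   normalize_string('Hello, <i>World</i>!')  ->  'hello world !'
# the tag and the comma are dropped, '!' gets a leading space, and the text is lower-cased.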
# Filter the cleaned sentences by length, dropping sentences that are too short or too long for later processing
def cutted_data(cleaned, MIN_LENGTH=4, MAX_LENGTH=30):
    # list collecting the sentences that pass the length filter
    cutted_lines = list()
    # iterate over every cleaned sentence
    for line in cleaned:
        # number of whitespace-separated words in the sentence
        length = len(line.split())
        # keep sentences whose word count is strictly between MIN_LENGTH and MAX_LENGTH (5-29 words with the defaults)
        if length > MIN_LENGTH and length < MAX_LENGTH:
            # split the sentence into its words
            line = [word for word in line.split()]
            # re-join the words with single spaces and store the sentence
            cutted_lines.append(' '.join(line))
    return cutted_lines
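# Illustrative example (not part of the original script): with the default thresholds,
# 'the sitting is closed .' (5 tokens) is kept, while 'the sitting closed .' (4 tokens) is dropped.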

def save_clean_sentences(sentence, save_path):
    pickle.dump(sentence, open(save_path, 'wb'))
    print('Saved: %s' % save_path)

def process(text_path):
    fop = open(text_path, 'r', encoding='utf8')
    raw_data = fop.read()
    # strip leading/trailing whitespace and split the raw text on newlines,
    # giving one sentence per line
    sentences = raw_data.strip().split('\n')
    # normalize every sentence (remove XML tags, pad punctuation with spaces, lower-case, ...)
    raw_data_input = [normalize_string(data) for data in sentences]
    # keep only sentences whose length falls in the allowed range
    raw_data_input = cutted_data(raw_data_input)
    fop.close()

    return raw_data_input

# Split the string s into a list of (string) tokens; optionally keep or remove certain punctuation and add start/end tokens
def tokenize(s, delim=' ',  add_start_token=True, add_end_token=True,
             punct_to_keep=None, punct_to_remove=None):
    """
    Tokenize a sequence, converting a string s into a list of (string) tokens by
    splitting on the specified delimiter. Optionally keep or remove certain
    punctuation marks and add start and end tokens.
    """
    if punct_to_keep is not None:
        for p in punct_to_keep:
            s = s.replace(p, '%s%s' % (delim, p))

    if punct_to_remove is not None:
        for p in punct_to_remove:
            s = s.replace(p, '')

    tokens = s.split(delim)
    if add_start_token:
        tokens.insert(0, '<START>')
    if add_end_token:
        tokens.append('<END>')
    return tokens
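# Illustrative example (not part of the original script):
#   tokenize('the debate is closed .', punct_to_remove=['?', '.'])
#   ->  ['<START>', 'the', 'debate', 'is', 'closed', '', '<END>']
# stripping the final '.' leaves a trailing delimiter, so an empty-string token ''
# appears; this is why '' ends up in the vocabulary (index 4 in vocab.json below).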

# Build the vocabulary (token-to-index mapping)
def build_vocab(sequences, token_to_idx = { }, min_token_count=1, delim=' ',
                punct_to_keep=None, punct_to_remove=None, ):
    # occurrence count for every token
    token_to_count = {}
    # iterate over every sentence in sequences
    for seq in sequences:
      # tokenize the sentence (without start/end tokens)
      seq_tokens = tokenize(seq, delim=delim, punct_to_keep=punct_to_keep,
                      punct_to_remove=punct_to_remove,
                      add_start_token=False, add_end_token=False)
      # count how often each token appears across all sentences
      for token in seq_tokens:
        if token not in token_to_count:
          token_to_count[token] = 0
        token_to_count[token] += 1

    # only tokens appearing at least min_token_count times are added to token_to_idx,
    # each receiving the next available index
    for token, count in sorted(token_to_count.items()):
      if count >= min_token_count:
        token_to_idx[token] = len(token_to_idx)
    # e.g. {'<PAD>': 0, '<START>': 1, '<END>': 2, '<UNK>': 3, '': 4, 'a': 5,
    #       'abstentions': 6, 'accordance': 7, 'add': 8, 'adopted': 9, ...}
    return token_to_idx
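# Note (not part of the original script): token_to_idx={} is a mutable default argument,
# and the dict passed in (SPECIAL_TOKENS in main) is updated in place; the special tokens
# keep indices 0-3 and the remaining tokens are appended in sorted (alphabetical) order.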


def encode(seq_tokens, token_to_idx, allow_unk=False):
    seq_idx = []
    for token in seq_tokens:
      if token not in token_to_idx:
        if allow_unk:
          token = '<UNK>'
        else:
          raise KeyError('Token "%s" not in vocab' % token)
      seq_idx.append(token_to_idx[token])
    return seq_idx


def decode(seq_idx, idx_to_token, delim=None, stop_at_end=True):
    tokens = []
    for idx in seq_idx:
      tokens.append(idx_to_token[idx])
      if stop_at_end and tokens[-1] == '<END>':
        break
    if delim is None:
      return tokens
    else:
      return delim.join(tokens)
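# Illustrative example (not part of the original script):
#   vocab = {'<START>': 1, '<END>': 2, 'hello': 4, 'world': 5}
#   idx_to_token = {i: t for t, i in vocab.items()}
#   encode(['<START>', 'hello', 'world', '<END>'], vocab)   ->  [1, 4, 5, 2]
#   decode([1, 4, 5, 2], idx_to_token, delim=' ')           ->  '<START> hello world <END>'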


def main(args):
    data_dir = '/home/hx301/data/'
    # args.input_data_dir = args.input_data_dir
    # args.output_train_dir = args.output_train_dir
    # args.output_test_dir = args.output_test_dir
    # args.output_vocab = args.output_vocab

    print(args.input_data_dir)
    sentences = []
    print('Preprocess Raw Text')
    for fn in tqdm(os.listdir(
            args.input_data_dir)):
        if not fn.endswith('.txt'): continue  # skip files that do not end with .txt
        process_sentences = process(os.path.join(args.input_data_dir, fn))
        # extend sentences with the sentences extracted from this file
        sentences += process_sentences

    # remove duplicate sentences: count occurrences with a dict and keep the unique ones
    # (dict insertion order preserves the first occurrence in Python 3.7+)
    a = {}
    for set in sentences:
        if set not in a:
            a[set] = 0
        a[set] += 1
    sentences = list(a.keys())
    print('Number of sentences: {}'.format(len(sentences)))
    
    print('Build Vocab')
    token_to_idx = build_vocab(
        sentences, SPECIAL_TOKENS,
        punct_to_keep=[';', ','], punct_to_remove=['?', '.']
    )

    vocab = {'token_to_idx': token_to_idx}
    print('Number of words in Vocab: {}'.format(len(token_to_idx)))

    # save the vocab
    if args.output_vocab != '':
        with open(args.output_vocab, 'w') as f:
            json.dump(vocab, f)

    print('Start encoding txt')
    results = []
    count_len = []
    # re-tokenize every sentence and, using token_to_idx built above, convert it into a list of indices
    for seq in tqdm(sentences):
        words = tokenize(seq, punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        tokens = [token_to_idx[word] for word in words]
        count_len.append(len(tokens))
        results.append(tokens)


    print('Writing Data')
    train_data = results[: round(len(results) * 0.9)]
    test_data = results[round(len(results) * 0.9):]
    # train : test = 9 : 1
    with open(args.output_train_dir, 'wb') as f:
        pickle.dump(train_data, f)
    with open(args.output_test_dir, 'wb') as f:
        pickle.dump(test_data, f)

if __name__ == '__main__':
    # Set Parameters
    args = para_config()
    main(args)

The resulting vocab.json file:

{"token_to_idx": {"<PAD>": 0, "<START>": 1, "<END>": 2, "<UNK>": 3, "": 4, "a": 5, "abstentions": 6, "accordance": 7, "add": 8, "adopted": 9, "advertising": 10, "advisers": 11, "against": 12, "agenda": 13, "agriculture": 14, "all": 15, "allowances": 16, "already": 17, "always": 18, "amended": 19, "amendment": 20, "amendments": 21, "among": 22, "and": 23, "any": 24, "applause": 25, "appreciation": 26, "are": 27, "as": 28, "at": 29, "be": 30, "been": 31, "behalf": 32, "being": 33, "business": 34, "but": 35, "by": 36, "can": 37, "capital": 38, "card": 39, "cards": 40, "case": 41, "clearly": 42, "closed": 43, "closely": 44, "cohesion": 45, "coming": 46, "commandment": 47, "commend": 48, "commission": 49, "commissioner": 50, "committee": 51, "compliments": 52, "conclusions": 53, "continue": 54, "coordination": 55, "counted": 56, "creation": 57, "dangerous": 58, "de": 59, "debate": 60, "declared": 61, "depth": 62, "development": 63, "do": 64, "economic": 65, "electronically": 66, "elements": 67, "entitled": 68, "especially": 69, "european": 70, "event": 71, "facts": 72, "familiar": 73, "favour": 74, "few": 75, "finally": 76, "financing": 77, "first": 78, "for": 79, "forget": 80, "forgotten": 81, "from": 82, "fund": 83, "funds": 84, "give": 85, "gladly": 86, "goods": 87, "group": 88, "has": 89, "have": 90, "hear": 91, "her": 92, "his": 93, "house": 94, "i": 95, "if": 96, "in": 97, "include": 98, "included": 99, "indeed": 100, "into": 101, "is": 102, "it": 103, "its": 104, "job": 105, "just": 106, "keeping": 107, "koch": 108, "last": 109, "least": 110, "let": 111, "letter": 112, "like": 113, "link": 114, "logical": 115, "look": 116, "m": 117, "madam": 118, "made": 119, "main": 120, "make": 121, "mandate": 122, "member": 123, "members": 124, "mention": 125, "meticulous": 126, "minute": 127, "more": 128, "mr": 129, "mrs": 130, "much": 131, "my": 132, "necessary": 133, "no": 134, "not": 135, "noted": 136, "now": 137, "objectives": 138, "observed": 139, "of": 140, "on": 141, "oral": 142, "order": 143, "other": 144, "p": 145, "parliament": 146, "party": 147, "perfectly": 148, "period": 149, "piece": 150, "place": 151, "pleased": 152, "poettering": 153, "point": 154, "political": 155, "positions": 156, "ppe": 157, "presented": 158, "presently": 159, "presidency": 160, "president": 161, "principles": 162, "proceed": 163, "proposal": 164, "propose": 165, "pse": 166, "question": 167, "quite": 168, "rapporteur": 169, "reasonable": 170, "received": 171, "regarding": 172, "regions": 173, "reinstated": 174, "reiterate": 175, "rejected": 176, "remain": 177, "repeat": 178, "report": 179, "request": 180, "requests": 181, "result": 182, "road": 183, "room": 184, "rose": 185, "rural": 186, "s": 187, "safety": 188, "schroedter": 189, "segni": 190, "shall": 191, "she": 192, "should": 193, "silence": 194, "since": 195, "sitting": 196, "situation": 197, "so": 198, "social": 199, "socialists": 200, "speak": 201, "speakers": 202, "speaking": 203, "starting": 204, "statement": 205, "strategic": 206, "structural": 207, "substantive": 208, "suggestions": 209, "support": 210, "tabled": 211, "take": 212, "tax": 213, "thank": 214, "that": 215, "the": 216, "their": 217, "themselves": 218, "then": 219, "there": 220, "therefore": 221, "this": 222, "thursday": 223, "thus": 224, "time": 225, "to": 226, "tomorrow": 227, "too": 228, "topical": 229, "transport": 230, "two": 231, "unable": 232, "understood": 233, "union": 234, "upheld": 235, "urgent": 236, "very": 237, "vote": 238, "votes": 239, "voting": 240, "was": 241, "we": 242, 
"when": 243, "which": 244, "who": 245, "wholehearted": 246, "whose": 247, "will": 248, "willing": 249, "wishes": 250, "with": 251, "withdrawn": 252, "work": 253, "would": 254, "wurtz": 255, "yes": 256, "you": 257, "your": 258}}