# Report sentence pairs (token lists) in which either side contains more than
# two '.' tokens, then print how many such pairs were found.
# NOTE(review): `en_sents` / `gloss_sents` are assumed to be parallel lists of
# token sequences defined earlier in the session — not visible in this file.
num = 0
for idx, (en_sent, gloss_sent) in enumerate(zip(en_sents, gloss_sents)):
    # list.count('.') is equivalent to Counter(sent)['.'] but avoids the
    # Counter dependency — collections.Counter is never imported in this file,
    # so the original line would raise NameError.  `idx` also avoids shadowing
    # the builtin `id`.
    if en_sent.count('.') > 2 or gloss_sent.count('.') > 2:
        print(idx)
        print(' '.join(en_sent))
        print(' '.join(gloss_sent))
        num += 1
print('{} sents'.format(num))
# Extract the occurrences of a particular token across the whole corpus.
import numpy as np
import re
# Hard-coded corpus location; the .3 suffix presumably marks a cleaning pass
# of the training data — TODO confirm against the preprocessing scripts.
path = r'D:\NMT_Code\nematus-local-attention\data\iwslt_UN_zh-en\train'
en_path = path + r'\train.zh-en.en.3'
zh_path = path + r'\train.zh-en.zh.3'
# Compiling a regular expression into an object up front improves efficiency
# somewhat when the pattern is used repeatedly.
# NOTE(review): despite the plural name, this is a single character (U+2500
# box-drawing horizontal line), matched by substring containment below.
special_tokens = '─'
# extract_regex = re.compile(r'[\。。]')
# Stream the English and Chinese files in lockstep and print every sentence
# pair (with its 0-based index) in which either side contains the special
# token.  Iterating the file objects directly via zip() keeps memory flat —
# the original called en.readlines(), which loaded the entire English file
# into memory while the Chinese side was already being streamed line by line.
# (zip stops at the shorter file; the two sides are assumed parallel.)
with open(en_path, encoding='utf-8') as en, open(zh_path, encoding='utf-8') as zh:
    for idx, (en_line, zh_line) in enumerate(zip(en, zh)):
        if (special_tokens in en_line) or (special_tokens in zh_line):
            # "Chinese-English sentence pair #{idx}"
            print('第{}个中-英句子对'.format(idx))
            print(en_line)
            print(zh_line)