Corpus Acquisition
Corpora needed for sentence pair generation: a low-resource-language corpus (da), a Chinese corpus (zh), and an aligned title file (titles.txt)
Download the corpora from the Wikimedia dumps
https://dumps.wikimedia.org/**wiki/
**: the ISO 639-1 code of the corpus language; da (Danish) is used as the example throughout
*-pages-articles.xml.bz2
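For the Danish side with the 20190301 dump used in the commands below (that dump date is simply the one this walkthrough happened to use; a newer dump will have a different date), the article file would be
https://dumps.wikimedia.org/dawiki/20190301/dawiki-20190301-pages-articles.xml.bz2
while the Chinese side uses the latest dump:
https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2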
Use WikiExtractor to extract the article bodies and titles: https://github.com/attardi/wikiextractor
python WikiExtractor.py -b 10G dawiki-20190301-pages-articles.xml.bz2
(python.exe WikiExtractor.py -b 10G zhwiki-latest-pages-articles.xml.bz2)
-b: sets the maximum size of each output split; 10G is chosen so that each corpus ends up in a single output file
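WikiExtractor writes each article as plain text wrapped in doc tags, and the alignment code below relies on this layout: a <doc ...> line carrying title="...", the title repeated as the first text line, the body with one paragraph per line, and a closing </doc>. A sketch of one extracted article (the id and url values here are made up):
<doc id="12345" url="https://da.wikipedia.org/wiki?curid=12345" title="København">
København

Article body, one paragraph per line ...
</doc>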
*-page.sql.gz
*-langlinks.sql.gz
Use wikipedia-parallel-titles-master to generate the aligned title file
./build-corpus.sh zh dawiki-20190301 > titles.txt
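Each line of titles.txt holds one aligned title pair separated by |||; the alignment code below splits on this marker and assumes the low-resource-language (da) title comes first and the Chinese title second, e.g. (illustrative pair):
København ||| 哥本哈根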
Sentence Pair Generation
Define the stop punctuation used for sentence splitting; the code below also needs codecs and re:
import codecs
import re

zh_stops = (u'。', u'？', u'！')  # Chinese full-width sentence-final punctuation
da_stops = ('.', '?', '!')
Document-level alignment is performed by using the already aligned titles:
def alignment(da_file, zh_file, output_file):
    """
    Generate parallel sentence pairs from the aligned title list, the
    low-resource-language corpus and the Chinese corpus
    :param da_file: low-resource-language corpus file
    :param zh_file: Chinese corpus file
    :param output_file: output file
    :return:
    """
    titles_rf = codecs.open('titles.txt', 'r', 'utf-8')
    titles_dict = {}
    line = titles_rf.readline()
    count = 0
    while line:
        items = line.split("|||")
        da = items[0].strip()
        zh = items[1].strip()
        titles_dict[zh] = da
        count += 1
        line = titles_rf.readline()
    titles_rf.close()
    print count, "parallel titles"
    # Build a dictionary from the aligned title list: key is the Chinese title,
    # value is the low-resource-language title
    da_rf = codecs.open(da_file, 'r', 'utf-8')
    da_dict = {}
    line = da_rf.readline()
    count = 0
    while line:
        if re.search('<doc.*?>', line):
            # Start of an article
            body = u""
            res = re.search('title=".*?"', line)
            title = res.group()
            title = title.replace('title=', '')
            title = title.replace('"', '')
            title = title.strip()
            # Extract the title
            while True:
                line = da_rf.readline()
                line = line.strip()
                if line == "" or line == title:
                    # Skip blank lines and the repeated title line
                    continue
                if re.search('</doc>', line):
                    # End of the article
                    da_dict[title] = body
                    break
                body = body + line
            count += 1
        line = da_rf.readline()
    da_rf.close()
    print count, "da articles"
    # Build the low-resource-language article dictionary: key is the article
    # title, value is the article body
    wf = codecs.open(output_file, 'w', 'utf-8')
    # Holds the generated parallel sentence pairs
    zh_rf = codecs.open(zh_file, 'r', 'utf-8')
    log = codecs.open('log.txt', 'w', 'utf-8')
    # Error log that records the article titles that could not be aligned
    line = zh_rf.readline()
    count = 0
    res_count = 0
    while line:
        if re.search('<doc.*?>', line):
            # Start of an article
            body = u""
            flag = True
            res = re.search('title=".*?"', line)
            title = res.group()
            title = title.replace('title=', '')
            title = title.replace('"', '')
            title = title.strip()
            # Extract the title
            if title not in titles_dict:
                flag = False
                log.write("Lack aligned titles: " + title + "\n")
                # The aligned title list has no entry for this Chinese title,
                # i.e. no corresponding low-resource-language article title is
                # known; skip this article and move on to the next Chinese one
            while flag:
                line = zh_rf.readline()
                line = line.strip()
                if line == "" or line == title:
                    continue
                if re.search('</doc>', line):
                    # End of the article
                    da_title = titles_dict[title]
                    if da_title not in da_dict:
                        # The low-resource-language corpus contains no article
                        # titled da_title
                        log.write("Lack aligned articles: " + title + "\n")
                        break
                    da_body = da_dict[da_title]
                    res = sentences_alignment(da_body, body)
                    # Sentence-align the two document-aligned article bodies
                    # to generate parallel sentence pairs
                    if res[1] == 0:
                        break
                    wf.write(res[0])
                    wf.write("\n")
                    wf.flush()
                    res_count += res[1]
                    break
                body = body + line
            count += 1
            print count
        line = zh_rf.readline()
    print count, "zh articles"
    print res_count, "alignment sentences"
    zh_rf.close()
    wf.close()
    log.close()
When writing the output file, flush the buffer promptly (the wf.flush() call above); otherwise buffered output may be lost if the run is interrupted.
Sentence-level alignment uses the stop punctuation and sentence lengths to align sentences.
Three approaches are considered (personal ideas).
Approach 1: when document-level alignment accuracy is high and the corpus should not be discarded lightly.
def sentences_alignment1(da, zh):
    """
    Sentence alignment, approach 1 (when an article cannot be sentence-aligned,
    fall back to returning the document-level pair)
    :param da: document-aligned low-resource-language text
    :param zh: document-aligned Chinese text
    :return: alignment result, number of aligned pairs (1 when the
             document-level pair is returned)
    """
    count = 0
    da_len = len(da)
    zh_len = len(zh)
    da_tag = ""
    da_sent = []
    zh_tag = ""
    zh_sent = []
    tmp = ""
    for c in da:
        tmp += c
        if c == da_stops[0]:
            da_tag += "F"
            da_sent.append(tmp)
            tmp = ""
        elif c == da_stops[1]:
            da_tag += "Q"
            da_sent.append(tmp)
            tmp = ""
        elif c == da_stops[2]:
            da_tag += "E"
            da_sent.append(tmp)
            tmp = ""
    tmp = ""
    for c in zh:
        tmp += c
        if c == zh_stops[0]:
            zh_tag += "F"
            zh_sent.append(tmp)
            tmp = ""
        elif c == zh_stops[1]:
            zh_tag += "Q"
            zh_sent.append(tmp)
            tmp = ""
        elif c == zh_stops[2]:
            zh_tag += "E"
            zh_sent.append(tmp)
            tmp = ""
    res = ""
    if da_tag == zh_tag and len(da_tag) > 0:
        for i in range(len(da_sent)):
            da_tmp = da_sent[i]
            zh_tmp = zh_sent[i]
            da_ratio = float(len(da_tmp)) / da_len
            zh_ratio = float(len(zh_tmp)) / zh_len
            if abs(zh_ratio - da_ratio) <= 0.05:
                res += da_tmp + "\t" + zh_tmp + "\n"
                count += 1
            else:
                res = ""
                break
    if res == "":
        # Sentence-level alignment failed; fall back to the document-level pair
        count = 1
        res = da + "\t" + zh + "\n"
    return res, count
Approach 2: when document-level alignment accuracy is high but many sentences are missing.
def sentences_alignment2(da, zh):
    """
    Sentence alignment, approach 2 (drop the sentences that cannot be aligned
    and keep the aligned prefix)
    :param da: document-aligned low-resource-language text
    :param zh: document-aligned Chinese text
    :return: aligned parallel sentence pairs, number of pairs generated
    """
    count = 0
    da_len = len(da)
    zh_len = len(zh)
    da_tag = ""
    da_sent = []
    zh_tag = ""
    zh_sent = []
    tmp = ""
    for c in da:
        tmp += c
        if c == da_stops[0]:
            da_tag += "F"
            # F: full stop
            da_sent.append(tmp)
            tmp = ""
        elif c == da_stops[1]:
            da_tag += "Q"
            # Q: question mark
            da_sent.append(tmp)
            tmp = ""
        elif c == da_stops[2]:
            da_tag += "E"
            # E: exclamation mark
            da_sent.append(tmp)
            tmp = ""
    tmp = ""
    for c in zh:
        tmp += c
        if c == zh_stops[0]:
            zh_tag += "F"
            zh_sent.append(tmp)
            tmp = ""
        elif c == zh_stops[1]:
            zh_tag += "Q"
            zh_sent.append(tmp)
            tmp = ""
        elif c == zh_stops[2]:
            zh_tag += "E"
            zh_sent.append(tmp)
            tmp = ""
    res = ""
    for i in range(min(len(da_tag), len(zh_tag))):
        if da_tag[i] == zh_tag[i]:
            # Align sentences by stop punctuation
            da_tmp = da_sent[i].strip()
            zh_tmp = zh_sent[i].strip()
            da_ratio = float(len(da_tmp)) / da_len
            zh_ratio = float(len(zh_tmp)) / zh_len
            if abs(zh_ratio - da_ratio) <= 0.05:
                # Align sentences by sentence length
                res += da_tmp + "\t" + zh_tmp + "\n"
                count += 1
            else:
                break
        else:
            break
    return res, count
Approach 3: when the correctness of the parallel pairs matters more than their quantity.
def sentences_alignment(da, zh):
    """
    Sentence alignment, approach 3 (discard the whole article if any sentence
    pair fails the checks); this is the version called by alignment()
    :param da: document-aligned low-resource-language text
    :param zh: document-aligned Chinese text
    :return: aligned parallel sentence pairs, number of pairs generated
    """
    count = 0
    da_len = len(da)
    zh_len = len(zh)
    da_tag = ""
    da_sent = []
    zh_tag = ""
    zh_sent = []
    tmp = ""
    for c in da:
        tmp += c
        if c == da_stops[0]:
            da_tag += "F"
            # F: full stop
            da_sent.append(tmp)
            tmp = ""
        elif c == da_stops[1]:
            da_tag += "Q"
            # Q: question mark
            da_sent.append(tmp)
            tmp = ""
        elif c == da_stops[2]:
            da_tag += "E"
            # E: exclamation mark
            da_sent.append(tmp)
            tmp = ""
    tmp = ""
    for c in zh:
        tmp += c
        if c == zh_stops[0]:
            zh_tag += "F"
            zh_sent.append(tmp)
            tmp = ""
        elif c == zh_stops[1]:
            zh_tag += "Q"
            zh_sent.append(tmp)
            tmp = ""
        elif c == zh_stops[2]:
            zh_tag += "E"
            zh_sent.append(tmp)
            tmp = ""
    res = ""
    if da_tag == zh_tag and len(da_tag) > 0:
        # Align by stop punctuation: the punctuation sequences of the two
        # articles must match exactly
        for i in range(len(da_sent)):
            da_tmp = da_sent[i]
            zh_tmp = zh_sent[i]
            da_ratio = float(len(da_tmp)) / da_len
            zh_ratio = float(len(zh_tmp)) / zh_len
            if abs(zh_ratio - da_ratio) <= 0.05:
                # Align by sentence length: the relative lengths must be close
                res += da_tmp + "\t" + zh_tmp + "\n"
                count += 1
            else:
                res = ""
                count = 0
                break
    return res, count
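A quick sanity check of approach 3 on a toy document pair (the two strings below are made-up examples, not taken from the corpus): both texts have the punctuation pattern "FF", and each sentence takes up roughly half of its text, so both length-ratio checks pass.

pairs, n = sentences_alignment(u"Det er godt. Det er fint.", u"这很好。这不错。")
print n      # 2 aligned pairs
print pairs  # two tab-separated da/zh sentence pairs, one per line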
Since the ultimate goal is to generate parallel sentence pairs, the third approach is the better choice (it is the one defined as sentences_alignment above, which is what alignment() calls).
Finally, run the following, where da and zh are the corpus files extracted by WikiExtractor:
alignment('da', 'zh', 'da-zh-parallel-sentences.txt')
and the parallel sentence pairs built from the Wikipedia corpora are generated.