最近因为在做NER相关的东西,所以需要对数据进行各种各样的预处理。这里先码出一小部分。
这里的代码主要是因为BERT的位置编码最大长度默认为512,超过这个长度的输入无法直接处理,因此需要对数据长度进行修剪。
delete_char():检查文本中每一行的字数,并删除字符数达到或超过500的行,最后生成txt文本。(空格不算,具体的阈值可以自己改)
审核问题。这块数据就不展示了,总之就是按字符分开的。
delete_word():功能和上面是一样的,不过在NLP领域做分词后,每个词的长度并不固定,因此这里按字符数(不含空格)而不是按词数来计量。直接上效果。
审核问题。这块数据就不展示了,总之就是分词之后的。
import codecs
import sys
def delete_char(input_path, output_path, threshold=500):
    """Copy the short lines of a whitespace-tokenised file to a new file.

    Every input line is split on whitespace.  A line is kept only when it
    has strictly fewer than ``threshold`` tokens.  A kept line is written
    with each token followed by one space and then a newline.  A blank
    input line has zero tokens, so it is kept as an empty output line.

    For every kept line, its 1-based position in the input and its token
    count are printed; "SUCCESS" is printed once the whole file is done.

    Args:
        input_path: Path of the UTF-8 text file to read.
        output_path: Path of the UTF-8 text file to create or overwrite.
        threshold: Exclusive upper bound on the token count of a kept line.
            Defaults to 500.
    """
    # The context managers close (and therefore flush) both files even when
    # processing raises part-way through.
    with codecs.open(input_path, 'r', 'utf-8') as input_data, \
            codecs.open(output_path, 'w', 'utf-8') as output_data:
        for line_no, line in enumerate(input_data, start=1):
            tokens = line.split()
            token_count = len(tokens)
            if token_count < threshold:
                print("##################THE LINE IS:", line_no)
                print("##################THE wordCount IS:", token_count)
                # Keep the established output format: a space after every
                # token, including the last one.
                output_data.write("".join(token + " " for token in tokens))
                output_data.write('\n')
    print("SUCCESS")
def check_word(file_name):
    """Print word and character statistics for each non-blank line of a file.

    The file is read as UTF-8.  Lines that are empty after stripping
    whitespace are skipped and do not advance the reported line number.
    For every remaining line three rows are printed: its running number,
    the number of whitespace-separated words, and the number of characters
    other than the space and newline characters.

    Args:
        file_name: Path of the text file to inspect.
    """
    with open(file_name, 'r', encoding='utf-8') as f:
        # Numbering is over non-blank lines only, so filter before counting.
        non_blank_lines = (text for text in f if text.strip() != '')
        for line_no, text in enumerate(non_blank_lines, start=1):
            print("###########第", line_no, "行########")
            print("句子中词语的个数是(不包括空格):", len(text.split()))
            # Only ' ' and '\n' are excluded; any other character counts.
            visible_chars = sum(1 for ch in text if ch not in (" ", "\n"))
            print("句子中字符的个数是(不包括空格):", visible_chars)
def delete_word(input_path, output_path, threshold=500):
    """Copy the short non-blank lines of a text file to a new file.

    A line's length is the number of its characters other than the space
    and newline characters.  Lines that are empty after stripping
    whitespace are skipped entirely.  A remaining line is kept only when
    its length is strictly less than ``threshold``; kept lines are written
    stripped of leading/trailing whitespace and followed by a newline.

    For every kept line, its 1-based position among the non-blank input
    lines and its character count are printed.

    Args:
        input_path: Path of the UTF-8 text file to read.
        output_path: Path of the UTF-8 text file to create or overwrite.
        threshold: Exclusive upper bound on the character count of a kept
            line.  Defaults to 500.
    """
    # Both files are managed by ``with`` so the output is always flushed and
    # closed, even when processing raises part-way through.
    with open(input_path, 'r', encoding='utf-8') as f, \
            codecs.open(output_path, 'w', 'utf-8') as output_data:
        line_count = 0  # counts non-blank lines only
        for line in f:
            if line.strip() == '':
                continue
            line_count += 1
            # Only ' ' and '\n' are excluded from the count.
            character_count = sum(
                1 for ch in line if ch != " " and ch != "\n"
            )
            if character_count < threshold:
                output_data.write(line.strip() + '\n')
                print("##################THE LINE IS:", line_count)
                print("句子中字符的个数是(不包括空格):", character_count)
if __name__ == "__main__":
    # Script entry point: filter "test.txt" by character count and write the
    # surviving lines to "delete_word_output.txt".
    #
    # Other runs that were used during development (enable as needed):
    #   delete_char("E://PycharmCode//data_processing//MSRA//train_char.txt",
    #               "E://PycharmCode//data_processing//output.txt")
    #   delete_char("E://PycharmCode//data_processing//MSRA//train_bioattr.txt",
    #               "E://PycharmCode//data_processing//output2.txt")
    #   delete_char("E://PycharmCode//data_processing//test.txt",
    #               "E://PycharmCode//data_processing//output3.txt")
    #   delete_char("E://PycharmCode//data_processing//test.txt",
    #               "E://PycharmCode//data_processing//output.txt")
    #   check_word("test.txt")
    delete_word(input_path="test.txt", output_path="delete_word_output.txt")