!wget https://raw.githubusercontent.com/zhangbo2008/data_ner/main/aomanyupianjian -O aaaa
!pip install datasets
# ======= Load our own dataset =======
# File format is CoNLL-style: one "token tag" pair per line,
# sentences separated by a blank line.
with open('aaaa') as f:
    t = f.readlines()

save1 = []  # one space-joined token string per sentence
save2 = []  # one space-joined tag string per sentence (parallel to save1)
tokens = []
tags = []
for line in t:
    if line == '\n':
        # Blank line terminates the current sentence. Only flush when we
        # actually collected something, so runs of blank lines no longer
        # push empty strings into save1/save2 (the original did, and the
        # downstream loop had to filter them out again).
        if tokens:
            save1.append(' '.join(tokens))
            save2.append(' '.join(tags))
            tokens = []
            tags = []
    else:
        parts = line.split(' ')
        tokens.append(parts[0])
        tags.append(parts[1].replace('\n', ''))
# BUGFIX: flush the final sentence when the file does not end with a blank
# line — the original silently dropped it.
if tokens:
    save1.append(' '.join(tokens))
    save2.append(' '.join(tags))
print(3)
all2 = []  # list of {'tokens': [...], 'ner_tags': [...]} training examples
import copy
# Canonical label inventory; a tag is encoded as its index in this list.
aaa = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
label2id = {label: idx for idx, label in enumerate(aaa)}  # O(1) lookup instead of a nested loop
for i, j in zip(save1, save2):
    if not i:
        continue  # skip empty sentences
    # Build a fresh dict per sentence — no need for the original's
    # shallow-copy-then-overwrite dance on a shared `tmp`.
    tmp = {'tokens': i.split(' '), 'ner_tags': j.split(' ')}
    # Detach a trailing '.' or ',' from the last token into its own
    # token tagged 'O', so end-of-sentence punctuation is independent.
    # BUGFIX: the original tested `'.' in tokens[-1]`, which also chopped
    # the last character off tokens such as "U.S" or "3.5" where the '.'
    # is NOT trailing; endswith() only strips genuine trailing punctuation.
    for punct in ('.', ','):
        last = tmp['tokens'][-1]
        if last.endswith(punct) and last != punct:
            tmp['tokens'][-1] = last[:-1]
            tmp['tokens'].append(punct)
            tmp['ner_tags'].append('O')
    # Normalize alternative label spellings to the canonical inventory.
    tmp['ner_tags'] = [
        tag.replace('B-Person', 'B-PER').replace('I-Person', 'I-PER')
           .replace('B-Location', 'B-LOC').replace('I-Location', 'I-LOC')
           .replace('B-Misc', 'B-MISC').replace('I-Misc', 'I-MISC')
        for tag in tmp['ner_tags']
    ]
    # Encode each tag as its integer id; an unknown tag is left as a string,
    # matching the original's lenient behavior.
    tmp['ner_tags'] = [label2id.get(tag, tag) for tag in tmp['ner_tags']]
    all2.append(tmp)
print(111)
from datasets import Dataset

# Wrap the prepared examples in a Hugging Face Dataset.
ds = Dataset.from_list(all2)
# =========== tag encoding reference:
# 'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'
ds[0]  # peek at the first example (notebook-style display; a no-op in a plain script)
# 70/30 train/test split; downstream code should use ds2.
ds2 = ds.train_test_split(test_size=0.3)
ds2  # notebook-style display of the resulting DatasetDict
# 利用 HF datasets 库构建自己的数据集 (build your own dataset with the HF `datasets` library)
# 最新推荐文章于 2024-11-05 15:28:12 发布 (scraped blog footer — "latest recommended article published 2024-11-05 15:28:12")