Python 手动拆分数据集

def data(fileName, fileOut, suffix='|2'):
    """Copy a text file, appending a tag to every line that starts with '>'.

    Each input line is stripped of leading and trailing whitespace. Lines
    that begin with '>' (e.g. FASTA-style headers) get ``suffix`` appended;
    every other line, including blank ones, is copied as-is. Every output
    line is terminated with a single newline.

    Args:
        fileName: Path of the input text file.
        fileOut: Path of the output file; it is created or overwritten.
        suffix: Text appended to each '>' line. Defaults to '|2'.
    """
    # Context managers guarantee both handles are closed even when reading
    # or writing raises part-way through.
    with open(fileName) as src, open(fileOut, 'w') as dst:
        # Stream line by line rather than loading the whole file first.
        for line in src:
            line = line.strip()
            if line.startswith(">"):
                line += suffix
            dst.write(line + "\n")
import random
def data1(fileName, fileOut1, fileOut2, valid_size=2838):
    """Randomly split a two-lines-per-record file into two output files.

    The input is read as consecutive (header, sequence) line pairs. The
    pairs are shuffled with the module-level ``random`` generator; the first
    ``valid_size`` shuffled pairs are written to ``fileOut1`` (validation)
    and the remaining pairs to ``fileOut2`` (training), each pair again on
    two lines. The sizes of both subsets are printed.

    Args:
        fileName: Path of the input file.
        fileOut1: Path of the validation output file (created/overwritten).
        fileOut2: Path of the training output file (created/overwritten).
        valid_size: Number of shuffled records written to ``fileOut1``.
            Defaults to 2838, the value that used to be hard-coded.

    Raises:
        ValueError: If ``valid_size`` is negative.
    """
    if valid_size < 0:
        raise ValueError("valid_size must be non-negative")

    with open(fileName) as src:
        doc = src.readlines()

    # Derive the record count from the file itself instead of a fixed line
    # count, so shorter files no longer raise IndexError and longer files are
    # not silently truncated. A trailing unpaired line is ignored.
    records = [
        (doc[i].strip('\n'), doc[i + 1].strip())
        for i in range(0, len(doc) - 1, 2)
    ]
    random.shuffle(records)

    validlist = records[:valid_size]
    trainlist = records[valid_size:]
    print(len(validlist))
    print(len(trainlist))

    # Context managers close both outputs even if a write fails.
    with open(fileOut1, 'w') as wvalid, open(fileOut2, 'w') as wtrain:
        for out, subset in ((wvalid, validlist), (wtrain, trainlist)):
            for header, sequence in subset:
                out.write(header + '\n')
                out.write(sequence + '\n')

# Script entry point: split 'PDB_Pronghe.txt', writing the validation subset
# to 'PDB_valid.txt' and the training subset to 'PDB_train.txt'.
data1('PDB_Pronghe.txt','PDB_valid.txt','PDB_train.txt')

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值