Datasets often contain duplicate rows that need to be removed; the code below shows how:
#### Concatenate two .csv files vertically and save the result as a new .csv.
df1 = pd.read_csv(r'/Ce/NO.1/CeNegative.csv', header=None)  # first input file
df2 = pd.read_csv(r'/Ce/NO.1/CePositive.csv', header=None)  # second input file
frames = [df1, df2]
# Stack the two frames row-wise (axis=0, pandas default).
combined = pd.concat(frames)
# index=False / header=False: the `index` and `header` parameters of to_csv are
# documented as booleans; 0 / None are ambiguous and deprecated forms.
combined.to_csv("/data/proteinall.csv", index=False, header=False, sep=',')
##### Remove duplicate lines from the concatenated file.
# `with` guarantees both files are closed even if an error occurs mid-copy.
with open('/data/proteinall.csv', 'r') as inFile, \
        open('/data/protein1.csv', 'w') as outFile:  # final deduplicated .csv
    # A set gives O(1) membership tests; the original list made this O(n^2).
    seen = set()
    for line in inFile:
        if line not in seen:
            outFile.write(line)
            seen.add(line)
The following reads a .txt file, removes duplicate lines, and saves the result as a .csv:
####加载.txt,去掉重复序列,保存为.csv
import numpy as np
import pandas as pd
import copy
def read_traingingData(file_name):
    """Read sample lines from *file_name*, returning them with trailing
    newlines stripped.

    Parameters
    ----------
    file_name : str
        Path of the text file to read; one sample per line.

    Returns
    -------
    list[str]
        The file's lines in order, without trailing '\n' characters.
    """
    # rstrip('\n') is the idiomatic equivalent of the original
    # line.split('\n')[0] for lines produced by file iteration.
    with open(file_name, 'r') as fp:
        return [line.rstrip('\n') for line in fp]
# Load the .txt samples, drop duplicate sequences, and save as .csv.
file_1 = '/Ce/Ce.AC_N1.txt'
protein_A = read_traingingData(file_1)
df_protein = pd.DataFrame(protein_A)
# drop_duplicates keeps the first occurrence of each sequence.
seq_protein = df_protein.drop_duplicates()
# index=False / header=False: to_csv's `index` and `header` parameters are
# documented as booleans; 0 / None are ambiguous and deprecated forms.
seq_protein.to_csv('/home/aita/4444/LX/Ziqi/protein.csv',
                   index=False, header=False, encoding='gbk')