import argparse
import glob
import json

import numpy as np
import pandas as pd
# Build a k-mer count matrix from per-sample k-mer count files.
#
# Inputs (command line):
#   totalkmer  tab-separated file whose first column lists every k-mer used
#              as a feature (one k-mer per line)
#   chrommers  glob pattern matching the chromosome sample k-mer files
#   plasmers   glob pattern matching the plasmid sample k-mer files
#
# Outputs (written to the current working directory):
#   allmatrix_com.csv      count matrix, one row per retained sample
#   target_com             labels, 0 for chromosome and 1 for plasmid
#   kmerlist_index.json    k-mer -> column index mapping; read it back with
#                          json.load
#   matrix_sample_id.json  sample ids, in the same order as the matrix rows

# Number of leading path characters dropped to turn a matched path into a
# sample id (value kept from the original script).
# NOTE(review): presumably strips a 6-character directory prefix such as
# "chrom/" -- confirm against the real input layout.
SAMPLE_ID_PREFIX_LEN = 6

# Samples whose counts do not add up to this total are discarded:
# 4995 for 5k fragments, 1995 for 2k fragments.
DEFAULT_EXPECTED_TOTAL = 4995


def build_parser():
    """Return the command-line parser of the script."""
    parser = argparse.ArgumentParser(description='Make kmer matirx from kmer files of each kmer files using directoruy')
    # File listing every k-mer (the feature list).
    parser.add_argument("totalkmer", help="input the total kmerfile as features")
    # Pattern matching the chromosome sample k-mer files.
    parser.add_argument("chrommers", help="input chrommer filesfloder")
    # Pattern matching the plasmid sample k-mer files.
    parser.add_argument("plasmers", help="input plasmer files folder")
    parser.add_argument(
        "--expected-total",
        type=int,
        default=DEFAULT_EXPECTED_TOTAL,
        help="keep only samples whose counts sum to this value "
             "(4995 for 5k fragments, 1995 for 2k fragments)",
    )
    return parser


def read_kmer_index(path):
    """Map every k-mer listed in *path* to its column index.

    Only the first tab-separated field of each line is used. Blank lines are
    skipped and a repeated k-mer keeps the index of its first occurrence, so
    the indices are always contiguous (0 .. number of distinct k-mers - 1).

    Args:
        path: path of the total k-mer file.

    Returns:
        dict mapping k-mer string to column index.
    """
    index = {}
    with open(path) as handle:
        for line in handle:
            kmer = line.strip().split("\t")[0]
            if not kmer:
                continue
            if kmer not in index:
                index[kmer] = len(index)
    return index


def read_count_matrix(paths, kmer_index):
    """Read one row of k-mer counts per file in *paths*.

    Each non-blank line of a file must hold a k-mer and its integer count,
    separated by a tab. K-mers absent from a file keep a count of 0.

    Args:
        paths: sequence of k-mer count file paths; row order follows it.
        kmer_index: mapping from k-mer to column index.

    Returns:
        int32 array of shape (len(paths), len(kmer_index)).

    Raises:
        KeyError: a file contains a k-mer that is not in *kmer_index*.
        IndexError: a non-blank line has no count column.
        ValueError: a count is not an integer.
    """
    # Pre-allocating keeps the result two-dimensional even when no file
    # matched, so the row-wise sum used for filtering still works.
    matrix = np.zeros((len(paths), len(kmer_index)), dtype="int32")
    for row, path in zip(matrix, paths):
        with open(path) as handle:
            for line in handle:
                fields = line.strip().split("\t")
                if not fields[0]:
                    continue
                row[kmer_index[fields[0]]] = int(fields[1])
    return matrix


def select_complete_samples(matrix, target, sample_ids, expected_total):
    """Keep the samples whose counts sum to exactly *expected_total*.

    Args:
        matrix: 2-D count array, one row per sample.
        target: 1-D label array aligned with the rows of *matrix*.
        sample_ids: list of sample ids aligned with the rows of *matrix*.
        expected_total: required row sum.

    Returns:
        Tuple (matrix, target, sample_ids) restricted to the kept samples,
        all three still aligned with each other.
    """
    keep = matrix.sum(axis=1) == expected_total
    kept_ids = [sid for sid, flag in zip(sample_ids, keep) if flag]
    return matrix[keep], target[keep], kept_ids


def main(argv=None):
    """Run the whole pipeline.

    Args:
        argv: argument list; None means the process command line.
    """
    args = build_parser().parse_args(argv)

    # Read the total k-mer list.
    kmerlist = read_kmer_index(args.totalkmer)

    # Chromosome samples first, then plasmid samples.
    chromlist = glob.glob(args.chrommers)
    plaslist = glob.glob(args.plasmers)
    allpaths = chromlist + plaslist
    allmatrix = read_count_matrix(allpaths, kmerlist)

    # Labels: 0 for chromosome rows, 1 for plasmid rows.
    target = np.hstack((np.zeros(len(chromlist)), np.ones(len(plaslist))))
    sampleid = [path[SAMPLE_ID_PREFIX_LEN:] for path in allpaths]

    # Drop samples whose total count differs from the expected one.
    allmatrix_com, target_com, sampleid_com = select_complete_samples(
        allmatrix, target, sampleid, args.expected_total
    )

    # Save matrix and target.
    pd.DataFrame(allmatrix_com).to_csv('allmatrix_com.csv')
    np.savetxt("target_com", target_com)

    # Save the k-mer -> column index mapping as a JSON file.
    with open('kmerlist_index.json', 'w') as f:
        json.dump(kmerlist, f)

    # Save the ids of the retained samples so that they line up with the
    # rows of the saved matrix.
    with open('matrix_sample_id.json', 'w') as f:
        json.dump(sampleid_com, f)


if __name__ == "__main__":
    main()