机器学习python处理文本数据 代码


#!/usr/bin/python
# -*- coding: UTF-8 -*-  
import re
from itertools import islice

# Append fields 1-3 (0-based, whitespace-separated) of the first 20 input
# lines to the output file, tab-separated, one output line per input line.
# Both files are opened in a single `with` so the handles are always closed,
# even if reading or writing raises part-way through.
with open('/Users/tangchao/Desktop/GD462.GeneQuantRPKM.50FN.samplename.resk10.txt') as fin, \
        open('/Users/tangchao/Desktop/snp1.txt', 'a+') as fout:
    for _ in range(2):
        # Take the next (up to) ten lines from the input.
        batch = list(islice(fin, 10))
        # Keep fields 1..3 of each line and write the whole batch in one call;
        # joining avoids rebuilding a growing string on every line.
        fout.write(
            "".join("\t".join(line.split()[1:4]) + "\n" for line in batch)
        )

fin 输入文件的格式

提取文件的指定列:上面的脚本把每行按空白分隔后,取第 2 至第 4 个字段(下标 1 到 3)。

###################################################

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
from itertools import islice

#############main code###############
#step 1 
#####################################
#get oneline to save the index

##########################illustration####################################
### file2 contains all of the "hgxxx" tags, while file1 has redundant tags

#FPKM file, tag"hg000" is row
#####################################
# step 1: find the header columns of the FPKM file that are absent from the
# tag file
#####################################
# file1: FPKM file; only its first line (the tab-separated header carrying the
#        "hgxxx" tags) is read here.
# file2: SNP tag file, per the original note a single column of "hgxxx" tags.
# `with` guarantees both handles are closed even if reading raises.
with open('/Users/tangchao/Desktop/GD462.GeneQuantRPKM.50FN.samplename.resk10.txt', 'r') as file1, \
        open('/Users/Tangchao/Desktop/1KG_GD462_indiv_ROW_1.txt') as file2:
    fstdText1 = file1.readline()
    # Normalise every line of file2: collapse whitespace runs to single tabs
    # and drop the trailing newline.
    list2 = ["\t".join(line.split()) for line in file2]

print(fstdText1)

# Convert the header string to a list of column names. The line terminator is
# stripped first; otherwise the last name keeps its "\n", never matches a tag
# from file2, and the last column is always removed.
list1 = re.split("\t", fstdText1.rstrip("\r\n"))

# Positions of header names that do not occur in file2. Entries of file2 that
# are missing from the header have no column position, so they play no part.
# NOTE(review): non-sample leading columns (e.g. an ID column) are also absent
# from file2 and are therefore removed as well -- confirm this is intended.
known_tags = set(list2)
index = {pos for pos, name in enumerate(list1) if name not in known_tags}

print(list1)


#####################################
# step 2: rewrite the FPKM file without the columns collected in `index`
#####################################

# number of ten-line batches to process (at most iteration * 10 lines)
iteration = 100

# Re-open the FPKM file from the top (the header row is filtered and written
# too) and overwrite the output file with the remaining columns.
with open('/Users/tangchao/Desktop/GD462.GeneQuantRPKM.50FN.samplename.resk10.txt', 'r') as fin, \
        open("/Users/tangchao/Desktop/snp1.txt", 'w+') as fout:
    for _ in range(iteration):
        # get the next (up to) ten lines
        islice_lines = list(islice(fin, 10))
        if not islice_lines:
            # input exhausted: the remaining iterations would write nothing
            break

        filtered_rows = []
        for element in islice_lines:
            # Split the row into fields, without the line terminator so that
            # a kept last field does not produce a doubled newline.
            eleList = re.split("\t", element.rstrip("\r\n"))
            # keep only the fields whose column position is not in `index`
            new_eleList = [field for k, field in enumerate(eleList)
                           if k not in index]
            filtered_rows.append("\t".join(new_eleList) + "\n")

        # write the whole batch with a single call
        fout.write("".join(filtered_rows))
文件格式

找出两个文件中不一致的表头所对应的列索引(index),然后从 FPKM 文件中删除这些列。


  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值