前言:
最近帮某三甲医院做了一些医学数据处理的程序
记录一下
一 病人样本去重
需求:
表格的第三列(_PATIENT)是病人的编号,第一列(sample)中包含病人的数据类型。
统计的时候,同一个病人可能出现多种不同的数据类型,需要按照优先级
(BM 的 CD138pos > PB 的 CD138pos > 其它类型)只保留其中一类。
sample | OS | _PATIENT | OS.time |
MMRF_2226_1_PB_Whole | 0 | MMRF_2226 | 1 |
MMRF_2226_1_BM_CD138pos | 0 | MMRF_2226 | 1 |
MMRF_2187_1_PB_CD138pos | 0 | MMRF_2187 | 1 |
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 20 14:10:43 2020
@author: chengxf2
去掉内容重复的
https://www.jb51.net/article/42635.htm
"""
import os
from openpyxl import load_workbook
import numpy as np
from xlrd import open_workbook
import csv
import sys
class RemoveRep:
    """De-duplicate patient samples from an Excel sheet and save them as CSV.

    Each data row is expected to hold a sample name in its first column whose
    ``_``-separated fields end with ``<tag2>_<tag1>`` at positions 3 and 4
    (for example ``MMRF_2226_1_BM_CD138pos``), and the patient id in its
    third column. When one patient owns several rows, only the row with the
    highest priority (see ``GetPre``) is kept.
    """

    def __init__(self):
        # m, n and keyList are not used by any method of this class; they are
        # kept so code that reads these attributes keeps working.
        self.m = 0
        self.n = 0
        # patient id -> list of [tag1, tag2, row index] entries (see Readxls)
        self.DictItem = {}
        self.keyList = []

    def GetPath(self):
        """Find an Excel workbook in the current working directory.

        A file qualifies when its extension starts with ``.xls`` (case
        insensitive). Candidates are checked in sorted order so the choice
        does not depend on the order ``os.listdir`` happens to return.

        Returns:
            tuple: ``(path, fileName)`` - the absolute path of the workbook
            and its file name without the extension.

        Raises:
            FileNotFoundError: no workbook exists in the working directory.
        """
        root = os.getcwd()
        for file in sorted(os.listdir(root)):
            fileName, ext = os.path.splitext(file)
            # Test the real extension instead of searching for the substring
            # "xls", which also matched names such as "myxls_notes.txt".
            if not ext.lower().startswith(".xls"):
                continue
            # "~$name.xlsx" files are lock files Excel creates while a
            # workbook is open; they are not readable workbooks.
            if file.startswith("~$"):
                continue
            path = os.path.abspath(file)
            if not os.path.isfile(path):
                continue
            print("\n path ",path)
            return path, fileName
        raise FileNotFoundError(
            "no Excel workbook (*.xls*) found in {}".format(root)
        )

    def GetPre(self, tag1, tag2):
        """Return the priority of a sample type; 3 is the highest.

        Args:
            tag1: cell-type tag, e.g. ``CD138pos``, ``Whole``, ``WBC``.
            tag2: tissue tag, e.g. ``PB`` or ``BM``.

        Returns:
            int: 3 for CD138pos/BM, 2 for CD138pos/PB, 1 for anything else.
        """
        if tag1 == "CD138pos":
            if tag2 == "BM":
                return 3
            if tag2 == "PB":
                return 2
        return 1

    def GetIndex(self, Items):
        """Pick the row index of the highest-priority entry of one patient.

        Args:
            Items: entries shaped like
                ``[['CD138pos', 'PB', 3], ['CD3pos', 'PB', 4]]``, i.e.
                ``[tag1, tag2, row index]``.

        Returns:
            int: row index of the best entry. On equal priority the earliest
            entry wins. ``-1`` when ``Items`` is empty.
        """
        maxPri = -1
        maxIndex = -1
        for item in Items:
            pri = self.GetPre(item[0], item[1])
            # Strict comparison keeps the first entry among equal priorities.
            if pri > maxPri:
                maxPri = pri
                maxIndex = item[2]
        return maxIndex

    def Merge(self, rows, head):
        """Build the de-duplicated table: the header plus one row per patient.

        Args:
            rows: data rows, addressed by the row indices stored in
                ``self.DictItem``.
            head: header row, placed first in the result.

        Returns:
            list: ``head`` followed by the selected row of every patient, in
            the iteration order of ``self.DictItem``.
        """
        retRow = [head]
        for items in self.DictItem.values():
            if len(items) > 1:
                # Several samples for this patient: keep only the best one.
                index = self.GetIndex(items)
            else:
                index = items[0][2]
            retRow.append(rows[index])
        return retRow

    def _index_row(self, row_data, index):
        """Record one data row under its patient id in ``self.DictItem``."""
        sample = row_data[0]
        parts = sample.split("_")
        if len(parts) < 5:
            # Same exception type the plain list lookup would raise, but with
            # enough context to locate the offending row.
            raise IndexError(
                "sample name {!r} (data row {}) does not have the expected "
                "5 '_'-separated fields".format(sample, index)
            )
        tag1 = parts[4]  # e.g. CD138pos
        tag2 = parts[3]  # e.g. PB, BM
        patient = row_data[2]
        self.DictItem.setdefault(patient, []).append([tag1, tag2, index])

    def Readxls(self, path):
        """Read the first sheet of a workbook and index its rows by patient.

        ``self.DictItem`` is rebuilt on every call, so indices left over from
        a previously read sheet cannot leak into the result.

        Args:
            path: path of the workbook to open.

        Returns:
            tuple: ``(rows, head)`` - the data rows (header excluded) and the
            header row.

        Raises:
            IndexError: a sample name has fewer than five ``_``-separated
                fields, or a row has fewer than three columns.
        """
        self.DictItem = {}
        rows = []
        workbook = open_workbook(path)
        sh = workbook.sheet_by_index(0)
        nrows = sh.nrows
        # An empty sheet has no header row to read.
        head = sh.row_values(0) if nrows else []
        for i in range(1, nrows):
            row_data = sh.row_values(i)
            # The header is not stored in rows, so sheet row i is rows[i - 1].
            self._index_row(row_data, i - 1)
            rows.append(row_data)
        return rows, head

    def WriteCSV(self, rows, name):
        """Write ``rows`` to ``<name>.csv``.

        Args:
            rows: table content, one list per line (header included), e.g.
                ``[['ID', 'score'], ['a', 80]]``.
            name: output file name without the ``.csv`` extension.
        """
        print("\n *****保存文件 ********\n ")
        fileName = name+".csv"
        # The context manager closes the file even if writing fails.
        with open(fileName, 'w', newline='') as f:
            csv.writer(f).writerows(rows)
        print("\n *****保存退出 ********\n ")

    def Remove(self):
        """Run the whole pipeline: locate the workbook, de-duplicate, save.

        The CSV is written next to the current working directory under the
        workbook's base name.
        """
        path, name = self.GetPath()
        print("\n 读取表格 \n ")
        sheetRows, head = self.Readxls(path)
        print("\n 合并表格 \n ")
        rows = self.Merge(sheetRows, head)
        self.WriteCSV(rows, name)
# Script entry: create the de-duplicator and run it on the workbook found in
# the current working directory (Remove calls GetPath, Readxls, Merge and
# WriteCSV in that order).
# NOTE(review): these statements also execute when the module is imported;
# consider an `if __name__ == "__main__":` guard - confirm nothing relies on
# import-time execution first.
move = RemoveRep()
move.Remove()