# -*- coding: utf-8 -*-
"""
Created on Thu Dec 24 14:00:01 2020
@author: s
"""
import re
import openpyxl
import pandas as pd
from openpyxl.styles import Font,PatternFill,Side,Alignment,Border
from datetime import datetime
def AE_dup(filename):
import re
import openpyxl
import pandas as pd
from openpyxl.styles import Font,PatternFill,Side,Alignment,Border
from datetime import datetime
flag1=[] #原始数据集行号记录
line1=[] #AE记录编号
subject1=[] #受试者编号
AEname1=[] #不良事件名称
stdat1=[] #开始日期
endat1=[] #结束日期
CTCAE1=[] #AE严重程度
line2=[] #输出的AE记录编号
subject2=[] #输出的受试者编号
AEname2=[] #输出的不良事件名称
stdat2=[] #输出的开始日期
endat2=[] #输出的结束日期
flag2=[] #输出的对应的flag
CTCAE2=[]
sheet='AE'
data=pd.read_excel(filename,sheet) #读取AE sheet
for i in range(len(data)):
flag1.append(i) #记录flag
line1.append(data['记录编号'][i]) #记录AE记录行号
subject1.append(data['受试者编号'][i]) #记录受试者号
AEname1.append(data['不良事件名称'][i]) #记录不良事件名称
CTCAE1.append(data['严重程度(CTCAE5.0分级)'][i])
if len(str(data['开始日期'][i]))==10 and data['开始日期'][i][-1] !='N' and data['开始日期'][i][-1] !='K': #将开始日期为完整日期的转换为日期格式放入stdat1列表
stdat1.append(datetime.strptime(data['开始日期'][i],'%Y-%m-%d'))
elif str(data['开始日期'][i])[-1] =='N': #将含有UN的同一格式后放入stdat1列表
stdat1.append(re.sub('UN','',data['开始日期'][i]))
else: #日期为空的统一设置为1900-01-01
stdat1.append(datetime.strptime('1900-01-01','%Y-%m-%d'))
for i in range(len(data)):
if len(str(data['结束日期'][i]))==10 and str(data['结束日期'][i])[-1] !='N' and str(data['结束日期'][i])[-1] !='K': #将完整结束日期转换为日期格式后放入endat1列表
endat1.append(datetime.strptime(data['结束日期'][i],'%Y-%m-%d'))
elif str(data['结束日期'][i])[-1] =='N': #将含有UN的结束日期统一格式后放入结束日期列表
endat1.append(re.sub('UN','',data['结束日期'][i]))
else: #将其余的日期为空的统一设置为2100-01-01
endat1.append(datetime.strptime('2100-01-01','%Y-%m-%d'))
for i in range(len(subject1)):
for n in range(len(subject1)): #遍历每一条AE
if i!=n and subject1[i]==subject1[n] and AEname1[i] ==AEname1[n]: #如果比较的两者不是同一条AE且受试者号相同不良事件名称相同,进入下面程序
if len(str(stdat1[n]))>8 and len(str(endat1[i]))>8 and len(str(stdat1[i]))>8 and stdat1[n]>=stdat1[i] and stdat1[n]<endat1[i]:# and endat1[i] !=endat1[n]: #如果长度大于8,则证明是完整日期,AE1的开始日期大于等于AE2的开始日期且小于AE2的结束日期,则记录flag号
flag2.append(i)
flag2.append(n)
if len(str(endat1[n]))<8 and int(stdat1[n])>=int(stdat1[i][0:4]) and int(stdat1[n])<=int(endat1[i][0:4]): #如果长度小于8,则证明是只有年已知,比较年份
print(endat1[n])
flag2.append(i)
flag2.append(n)
if (len(str(endat1[n]))==8 or len(str(stdat1[n]))==8) and int(str(stdat1[n])[5:7])>=(int(str(stdat1[i])[5:7])+((int(str(stdat1[i])[0:4])-int(str(stdat1[n])[0:4]))*12)) and (int(str(stdat1[n])[5:7])<=int(str(endat1[i])[5:7])+((int(str(endat1[i])[0:4])-int(str(stdat1[n])[0:4]))*12)):#若长度等于8则年月已知,比较年月
print(endat1[n],int(str(stdat1[n])[5:7]))
flag2.append(i)
flag2.append(n)
if len(str(stdat1[n]))>8 and len(str(endat1[i]))>8 and len(str(stdat1[i]))>8 and stdat1[n]==endat1[i] and CTCAE1[n]==CTCAE1[i]:
flag2.append(i)
flag2.append(n)
print(endat1)
for i in flag2:
subject2.append(data['受试者编号'][i])
line2.append(data['记录编号'][i])
AEname2.append(data['不良事件名称'][i])
stdat2.append(data['开始日期'][i])
endat2.append(data['结束日期'][i])
CTCAE2.append(data['严重程度(CTCAE5.0分级)'][i])
data1=pd.DataFrame({'受试者编号':subject2,'记录编号':line2,'不良事件名称':AEname2,'严重程度':CTCAE2,'开始日期':stdat2,'结束日期':endat2})
data1.to_excel(r'C:\Users\s\Desktop\I10A-B301\B301人工核查\AE日期重复\SCT-I10A-B301 Data Listing_output.xlsx',sheet_name='AE日期重复',index=False)
#---------------------------------------------------------------------------------------------------------------------------------------------------------
#设置单元格格式
def reset_color(filename):
wb=openpyxl.load_workbook(filename)
fill=PatternFill(
fill_type='solid',
start_color='99ccff')
border=Border(top=Side(border_style='thin',color='000000'),
bottom=Side(border_style='thin',color='000000'),
left=Side(border_style='thin',color='000000'),
right=Side(border_style='thin',color='000000'))
for i in wb.sheetnames:
ws=wb[i]
ws.sheet_view.showGridLines=False #隐藏默认网线
for c in range(1,ws.max_column+1):
for r in range(1,ws.max_row+1):
bordercell=ws.cell(r,c)
bordercell.border=border
for end in range(1,ws.max_column+1):
fillcell=ws.cell(1,end)
fillcell.fill=fill
wb.save(filename)
def reset_col(filename):
wb=openpyxl.load_workbook(filename)
for sheet in wb.sheetnames:
ws=wb[sheet]
df=pd.read_excel(filename,sheet).fillna('-')
df.loc[len(df)]=list(df.columns)
for col in df.columns:
index=list(df.columns).index(col)
letter=openpyxl.utils.get_column_letter(index+1)
collen=df[col].apply(lambda x:len(str(x).encode())).max()
ws.column_dimensions[letter].width=collen+4
wb.save(filename)
#调用函数
filename1=r'C:\Users\s\Desktop\I10A-B301\B301人工核查\AE日期重复\SCT-I10A-B301 Data Listing.xlsx'
AE_dup(filename1)
filename=r'C:\Users\s\Desktop\I10A-B301\B301人工核查\AE日期重复\SCT-I10A-B301 Data Listing_output.xlsx'
reset_col(filename)
reset_color(filename)
python用于临床数据管理_AE查重脚本
于 2021-05-06 15:29:39 首次发布