Original code:
Code for extracting clinical notes from MIMIC-III
from scipy import stats
import os
import pandas as pd
"""
Preprocess PubMed abstracts or MIMIC-III reports
"""
import re
import json
from nltk import sent_tokenize, word_tokenize  # requires the 'punkt' tokenizer data: nltk.download('punkt')
SECTION_TITLES = re.compile(
r'('
r'ABDOMEN AND PELVIS|CLINICAL HISTORY|CLINICAL INDICATION|COMPARISON|COMPARISON STUDY DATE'
r'|EXAM|EXAMINATION|FINDINGS|HISTORY|IMPRESSION|INDICATION'
r'|MEDICAL CONDITION|PROCEDURE|REASON FOR EXAM|REASON FOR STUDY|REASON FOR THIS EXAMINATION'
r'|TECHNIQUE'
r'):|FINAL REPORT',
re.I | re.M)
def pattern_repl(matchobj):
"""
Return a replacement string to be used for match object
"""
return ' '.rjust(len(matchobj.group(0)))
def find_end(text):
"""Find the end of the report."""
ends = [len(text)]
patterns = [
re.compile(r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
        re.compile(r'\n {3,}DR\.', re.I),  # escape the dot so it matches "DR." literally
re.compile(r'[ ]{1,}RADLINE ', re.I),
re.compile(r'.*electronically signed on', re.I),
re.compile(r'M\[0KM\[0KM')
]
for pattern in patterns:
matchobj = pattern.search(text)
if matchobj:
ends.append(matchobj.start())
return min(ends)
def split_heading(text):
"""Split the report into sections"""
start = 0
for matcher in SECTION_TITLES.finditer(text):
# add last
end = matcher.start()
if end != start:
section = text[start:end].strip()
if section:
yield section
# add title
start = end
end = matcher.end()
if end != start:
section = text[start:end].strip()
if section:
yield section
start = end
# add last piece
end = len(text)
if start < end:
section = text[start:end].strip()
if section:
yield section
def clean_text(text):
"""
Clean text
"""
# Replace [**Patterns**] with spaces.
text = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
# Replace `_` with spaces.
text = re.sub(r'_', ' ', text)
start = 0
end = find_end(text)
new_text = ''
if start > 0:
new_text += ' ' * start
    new_text += text[start:end]  # append, so the leading padding (if any) is kept
# make sure the new text has the same length of old text.
if len(text) - end > 0:
new_text += ' ' * (len(text) - end)
return new_text
def preprocess_mimic(text):
"""
Preprocess reports in MIMIC-III.
1. remove [**Patterns**] and signature
2. split the report into sections
3. tokenize sentences and words
4. lowercase
"""
for sec in split_heading(clean_text(text)):
for sent in sent_tokenize(sec):
text = ' '.join(word_tokenize(sent))
yield text.lower()
df = pd.read_csv('scripts/NOTEEVENTS.csv', low_memory=False)  # forward slashes work on all platforms
df.CHARTDATE = pd.to_datetime(df.CHARTDATE)
df.CHARTTIME = pd.to_datetime(df.CHARTTIME)
df.STORETIME = pd.to_datetime(df.STORETIME)
df2 = df[df.SUBJECT_ID.notnull()]
df2 = df2[df2.HADM_ID.notnull()]
df2 = df2[df2.CHARTTIME.notnull()]
df2 = df2[df2.TEXT.notnull()]
df2 = df2[['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'TEXT']]
del df
def filter_for_first_hrs(dataframe, _days=2):
min_time = dataframe.CHARTTIME.min()
return dataframe[dataframe.CHARTTIME < min_time + pd.Timedelta(days=_days)]
def getText(t):
return " ".join(list(preprocess_mimic(t)))
def getSentences(t):
return list(preprocess_mimic(t))
# df_filtered = df2.groupby('HADM_ID').apply(
# lambda x: filter_for_first_hrs(x, 2))
# print(df_filtered.shape)
print(df2.groupby('HADM_ID').count().describe())
'''
count 55926.000000 55926.000000 55926.000000
mean 28.957283 28.957283 28.957283
std 59.891679 59.891679 59.891679
min 1.000000 1.000000 1.000000
25% 5.000000 5.000000 5.000000
50% 11.000000 11.000000 11.000000
75% 27.000000 27.000000 27.000000
max 1214.000000 1214.000000 1214.000000
'''
dataset_path = r'scripts/'
all_files = os.listdir(dataset_path)
all_folders = list(filter(lambda x: x.isdigit(), all_files))
output_folder = r'scripts/'
succeed = 0
failed = 0
failed_exception = 0
hadm_id2index = {}
for folder in all_folders:
    try:
        patient_id = int(folder)
        sliced = df2[df2.SUBJECT_ID == patient_id]
        if sliced.shape[0] == 0:
            print("No notes for PATIENT_ID : {}".format(patient_id))
            failed += 1
            continue
        # sort_values returns a new DataFrame, so assign the result back.
        sliced = sliced.sort_values(by='CHARTTIME')
        # get the HADM_IDs from the stays.csv.
        stays_path = os.path.join(dataset_path, folder, 'stays.csv')
        stays_df = pd.read_csv(stays_path)
        hadm_ids = list(stays_df.HADM_ID.values)
        for ind, hid in enumerate(hadm_ids):
            hadm_id2index[str(hid)] = str(ind)
            # Filter into a new variable instead of overwriting `sliced`,
            # so notes for the patient's later admissions are not lost.
            hadm_notes = sliced[sliced.HADM_ID == hid]
            data_json = {}
            for index, row in hadm_notes.iterrows():
                data_json["{}".format(row['CHARTTIME'])] = getSentences(row['TEXT'])
            with open(os.path.join(output_folder, folder + '_' + str(ind + 1)), 'w') as f:
                json.dump(data_json, f)
        succeed += 1
    except Exception:
        import traceback
        traceback.print_exc()
        print("Failed with Exception FOR Patient ID: %s" % folder)
        failed_exception += 1
print("Successfully Completed: %d/%d" % (succeed, len(all_folders)))
print("No Notes for Patients: %d/%d" % (failed, len(all_folders)))
print("Failed with Exception: %d/%d" % (failed_exception, len(all_folders)))
with open(os.path.join(output_folder, 'test_hadm_id2index'), 'w') as f:
    json.dump(hadm_id2index, f)
Code analysis
What each part does
Import the required modules and libraries:
scipy.stats: statistical functions (imported here but never actually used in this script).
os: file and directory handling.
pandas: data loading and manipulation.
re: regular expressions, used for text matching and replacement.
json: reading and writing JSON files.
nltk: natural language processing; here, sentence and word tokenization.
from scipy import stats
import os
import pandas as pd
"""
Preprocess PubMed abstracts or MIMIC-III reports
"""
import re
import json
from nltk import sent_tokenize, word_tokenize
Define the regular-expression pattern:
SECTION_TITLES: matches the section headings that appear in reports (e.g. FINDINGS:, IMPRESSION:), as well as the standalone FINAL REPORT marker.
SECTION_TITLES = re.compile(
r'('
r'ABDOMEN AND PELVIS|CLINICAL HISTORY|CLINICAL INDICATION|COMPARISON|COMPARISON STUDY DATE'
r'|EXAM|EXAMINATION|FINDINGS|HISTORY|IMPRESSION|INDICATION'
r'|MEDICAL CONDITION|PROCEDURE|REASON FOR EXAM|REASON FOR STUDY|REASON FOR THIS EXAMINATION'
r'|TECHNIQUE'
r'):|FINAL REPORT',
re.I | re.M)
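To see what this pattern captures, you can run it over a small made-up fragment (illustrative text, not real MIMIC-III data):
sample = "FINAL REPORT\nINDICATION: Chest pain.\nFINDINGS: No acute process."
for m in SECTION_TITLES.finditer(sample):
    print(repr(m.group(0)))
# prints 'FINAL REPORT', 'INDICATION:', 'FINDINGS:' in document order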
Define helper functions:
pattern_repl(matchobj): returns a run of spaces the same length as the match, so replacements preserve character offsets.
find_end(text): finds where the report body ends, i.e. the position of the first signature/footer marker.
split_heading(text): splits the report into sections at the matched headings.
clean_text(text): blanks out de-identification placeholders and the trailing signature while keeping the overall text length unchanged.
preprocess_mimic(text): preprocesses a MIMIC-III report end to end: removes placeholders and signatures, splits it into sections, tokenizes sentences and words, and lowercases.
def pattern_repl(matchobj):
"""
Return a replacement string to be used for match object
"""
return ' '.rjust(len(matchobj.group(0)))
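A one-line check of the length-preserving replacement, on a hypothetical placeholder:
print(repr(re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, 'DOB [**2100-1-1**] noted')))
# 'DOB                noted' -- the 14-character placeholder becomes 14 spaces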
def find_end(text):
"""Find the end of the report."""
ends = [len(text)]
patterns = [
re.compile(r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
        re.compile(r'\n {3,}DR\.', re.I),  # escape the dot so it matches "DR." literally
re.compile(r'[ ]{1,}RADLINE ', re.I),
re.compile(r'.*electronically signed on', re.I),
re.compile(r'M\[0KM\[0KM')
]
for pattern in patterns:
matchobj = pattern.search(text)
if matchobj:
ends.append(matchobj.start())
return min(ends)
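For example, with a fabricated footer marker appended (not real report text), find_end returns the offset where the footer begins:
report = "IMPRESSION: Normal study.   RADLINE 1234; signed."
print(find_end(report), len(report))
# the first number is where the '   RADLINE ' footer starts;
# with no marker present, find_end would return len(report)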
def split_heading(text):
"""Split the report into sections"""
start = 0
for matcher in SECTION_TITLES.finditer(text):
# add last
end = matcher.start()
if end != start:
section = text[start:end].strip()
if section:
yield section
# add title
start = end
end = matcher.end()
if end != start:
section = text[start:end].strip()
if section:
yield section
start = end
# add last piece
end = len(text)
if start < end:
section = text[start:end].strip()
if section:
yield section
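Combined with SECTION_TITLES, the generator alternates headings and their contents; a minimal sketch on made-up text:
print(list(split_heading("INDICATION: Chest pain. FINDINGS: No acute process.")))
# ['INDICATION:', 'Chest pain.', 'FINDINGS:', 'No acute process.']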
def clean_text(text):
"""
Clean text
"""
# Replace [**Patterns**] with spaces.
text = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
# Replace `_` with spaces.
text = re.sub(r'_', ' ', text)
start = 0
end = find_end(text)
new_text = ''
if start > 0:
new_text += ' ' * start
    new_text += text[start:end]  # append, so the leading padding (if any) is kept
# make sure the new text has the same length of old text.
if len(text) - end > 0:
new_text += ' ' * (len(text) - end)
return new_text
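Because both replacement steps pad with spaces, character offsets into the original string stay valid. A sketch on fabricated text:
note = "Pt is [**Name**] stable.\nelectronically signed on 1/1"
cleaned = clean_text(note)
print(len(cleaned) == len(note))  # True: padding preserves the length
print(repr(cleaned))  # placeholder and the signature line are blanked to spaces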
def preprocess_mimic(text):
"""
Preprocess reports in MIMIC-III.
1. remove [**Patterns**] and signature
2. split the report into sections
3. tokenize sentences and words
4. lowercase
"""
for sec in split_heading(clean_text(text)):
for sent in sent_tokenize(sec):
text = ' '.join(word_tokenize(sent))
yield text.lower()
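End to end, on a fabricated report. This assumes the NLTK tokenizer models are installed, e.g. via nltk.download('punkt') (recent NLTK versions may also need 'punkt_tab'):
sample = "FINDINGS: No acute process. IMPRESSION: [**Name**] is stable."
print(list(preprocess_mimic(sample)))
# ['findings :', 'no acute process .', 'impression :', 'is stable .']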
Read the MIMIC-III dataset file:
df = pd.read_csv('scripts/NOTEEVENTS.csv', low_memory=False):
reads NOTEEVENTS.csv and stores it as the DataFrame df.
Data preprocessing:
Filter df to keep only rows where SUBJECT_ID, HADM_ID, CHARTTIME, and TEXT are all non-null.
Select the SUBJECT_ID, HADM_ID, CHARTTIME, and TEXT columns into a new DataFrame df2, then delete df to free memory.
df = pd.read_csv('scripts/NOTEEVENTS.csv', low_memory=False)  # forward slashes work on all platforms
df.CHARTDATE = pd.to_datetime(df.CHARTDATE)
df.CHARTTIME = pd.to_datetime(df.CHARTTIME)
df.STORETIME = pd.to_datetime(df.STORETIME)
df2 = df[df.SUBJECT_ID.notnull()]
df2 = df2[df2.HADM_ID.notnull()]
df2 = df2[df2.CHARTTIME.notnull()]
df2 = df2[df2.TEXT.notnull()]
df2 = df2[['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'TEXT']]
del df
Define helper functions:
filter_for_first_hrs(dataframe, _days=2): keeps only notes charted within _days days of the group's earliest note (despite the "hrs" in the name, the cutoff is measured in days).
getText(t): preprocesses a report and returns it as one space-joined string.
getSentences(t): preprocesses a report and returns the list of preprocessed sentences.
def filter_for_first_hrs(dataframe, _days=2):
min_time = dataframe.CHARTTIME.min()
return dataframe[dataframe.CHARTTIME < min_time + pd.Timedelta(days=_days)]
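A quick sketch of the cutoff on a hand-made DataFrame (illustrative dates only):
demo = pd.DataFrame({'CHARTTIME': pd.to_datetime(['2100-01-01', '2100-01-02', '2100-01-05'])})
print(filter_for_first_hrs(demo, _days=2))
# keeps only the first two rows, charted before 2100-01-01 + 2 days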
def getText(t):
return " ".join(list(preprocess_mimic(t)))
def getSentences(t):
return list(preprocess_mimic(t))
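The two wrappers differ only in how they join the generator's output (same NLTK assumption as above):
sample = "IMPRESSION: Pt is stable."
print(getSentences(sample))  # ['impression :', 'pt is stable .']
print(getText(sample))       # 'impression : pt is stable .'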
Group by HADM_ID and apply filter_for_first_hrs to keep only the earliest notes (this part of the code is commented out).
Count the number of notes per HADM_ID and print descriptive statistics; the block below shows the output (a median of 11 and a maximum of 1214 notes per admission).
# df_filtered = df2.groupby('HADM_ID').apply(
# lambda x: filter_for_first_hrs(x, 2))
# print(df_filtered.shape)
print(df2.groupby('HADM_ID').count().describe())
'''
count 55926.000000 55926.000000 55926.000000
mean 28.957283 28.957283 28.957283
std 59.891679 59.891679 59.891679
min 1.000000 1.000000 1.000000
25% 5.000000 5.000000 5.000000
50% 11.000000 11.000000 11.000000
75% 27.000000 27.000000 27.000000
max 1214.000000 1214.000000 1214.000000
'''
Set the dataset path and the output folder path; patient folders are identified as the directory entries whose names consist only of digits.
dataset_path = r'scripts/'
all_files = os.listdir(dataset_path)
all_folders = list(filter(lambda x: x.isdigit(), all_files))
output_folder = r'scripts/'
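Only names made up entirely of digits pass the filter; a hypothetical listing:
print(list(filter(lambda x: x.isdigit(), ['10011', 'stays', '10026.txt'])))
# ['10011']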
Define counters for patients processed successfully, patients with no notes, and patients that raised an exception.
succeed = 0
failed = 0
failed_exception = 0
Iterate over the folders in the dataset:
a. Try to convert the folder name to an integer; this is the patient's SUBJECT_ID.
b. Select that patient's notes. If there are none, print a message, increment the failure counter, and continue with the next folder.
c. Sort the selected notes by CHARTTIME.
d. Read the list of HADM_IDs from the patient's stays.csv.
e. For each HADM_ID and its index:
- record the HADM_ID-to-index mapping in the hadm_id2index dictionary;
- select the notes belonging to the current HADM_ID;
- iterate over those notes, storing each note's CHARTTIME and preprocessed sentence list in the data_json dictionary;
- write the data_json dictionary to a file in JSON format.
f. Increment the success counter.
g. On an exception, print the traceback and increment the exception counter.
for folder in all_folders:
    try:
        patient_id = int(folder)
        sliced = df2[df2.SUBJECT_ID == patient_id]
        if sliced.shape[0] == 0:
            print("No notes for PATIENT_ID : {}".format(patient_id))
            failed += 1
            continue
        # sort_values returns a new DataFrame, so assign the result back.
        sliced = sliced.sort_values(by='CHARTTIME')
        # get the HADM_IDs from the stays.csv.
        stays_path = os.path.join(dataset_path, folder, 'stays.csv')
        stays_df = pd.read_csv(stays_path)
        hadm_ids = list(stays_df.HADM_ID.values)
        for ind, hid in enumerate(hadm_ids):
            hadm_id2index[str(hid)] = str(ind)
            # Filter into a new variable instead of overwriting `sliced`,
            # so notes for the patient's later admissions are not lost.
            hadm_notes = sliced[sliced.HADM_ID == hid]
            data_json = {}
            for index, row in hadm_notes.iterrows():
                data_json["{}".format(row['CHARTTIME'])] = getSentences(row['TEXT'])
            with open(os.path.join(output_folder, folder + '_' + str(ind + 1)), 'w') as f:
                json.dump(data_json, f)
        succeed += 1
    except Exception:
        import traceback
        traceback.print_exc()
        print("Failed with Exception FOR Patient ID: %s" % folder)
        failed_exception += 1
print("Successfully Completed: %d/%d" % (succeed, len(all_folders)))
print("No Notes for Patients: %d/%d" % (failed, len(all_folders)))
print("Failed with Exception: %d/%d" % (failed_exception, len(all_folders)))
with open(os.path.join(output_folder, 'test_hadm_id2index'), 'w') as f:
    json.dump(hadm_id2index, f)