Original code:
Code for extracting clinical notes from MIMIC-III
from scipy import stats
import os
import pandas as pd
"""
Preprocess PubMed abstracts or MIMIC-III reports
"""
import re
import json
from nltk import sent_tokenize, word_tokenize  # requires the 'punkt' tokenizer data: nltk.download('punkt')
SECTION_TITLES = re.compile(
r'('
r'ABDOMEN AND PELVIS|CLINICAL HISTORY|CLINICAL INDICATION|COMPARISON|COMPARISON STUDY DATE'
r'|EXAM|EXAMINATION|FINDINGS|HISTORY|IMPRESSION|INDICATION'
r'|MEDICAL CONDITION|PROCEDURE|REASON FOR EXAM|REASON FOR STUDY|REASON FOR THIS EXAMINATION'
r'|TECHNIQUE'
r'):|FINAL REPORT',
re.I | re.M)
def pattern_repl(matchobj):
"""
Return a replacement string to be used for match object
"""
return ' '.rjust(len(matchobj.group(0)))
def find_end(text):
"""Find the end of the report."""
ends = [len(text)]
patterns = [
re.compile(r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
        re.compile(r'\n {3,}DR\.', re.I),  # escape the dot so it matches "DR." literally
re.compile(r'[ ]{1,}RADLINE ', re.I),
re.compile(r'.*electronically signed on', re.I),
re.compile(r'M\[0KM\[0KM')
]
for pattern in patterns:
matchobj = pattern.search(text)
if matchobj:
ends.append(matchobj.start())
return min(ends)
def split_heading(text):
"""Split the report into sections"""
start = 0
for matcher in SECTION_TITLES.finditer(text):
# add last
end = matcher.start()
if end != start:
section = text[start:end].strip()
if section:
yield section
# add title
start = end
end = matcher.end()
if end != start:
section = text[start:end].strip()
if section:
yield section
start = end
# add last piece
end = len(text)
if start < end:
section = text[start:end].strip()
if section:
yield section
def clean_text(text):
"""
Clean text
"""
# Replace [**Patterns**] with spaces.
text = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
# Replace `_` with spaces.
text = re.sub(r'_', ' ', text)
start = 0
end = find_end(text)
new_text = ''
if start > 0:
new_text += ' ' * start
    new_text += text[start:end]  # append, so the leading padding (if any) is kept
# make sure the new text has the same length of old text.
if len(text) - end > 0:
new_text += ' ' * (len(text) - end)
return new_text
def preprocess_mimic(text):
"""
Preprocess reports in MIMIC-III.
1. remove [**Patterns**] and signature
2. split the report into sections
3. tokenize sentences and words
4. lowercase
"""
for sec in split_heading(clean_text(text)):
for sent in sent_tokenize(sec):
text = ' '.join(word_tokenize(sent))
yield text.lower()
df = pd.read_csv('scripts/NOTEEVENTS.csv', low_memory=False)  # forward slashes work on all platforms
df.CHARTDATE = pd.to_datetime(df.CHARTDATE)
df.CHARTTIME = pd.to_datetime(df.CHARTTIME)
df.STORETIME = pd.to_datetime(df.STORETIME)
df2 = df[df.SUBJECT_ID.notnull()]
df2 = df2[df2.HADM_ID.notnull()]
df2 = df2[df2.CHARTTIME.notnull()]
df2 = df2[df2.TEXT.notnull()]
df2 = df2[['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'TEXT']]
del df
def filter_for_first_hrs(dataframe, _days=2):
min_time = dataframe.CHARTTIME.min()
return dataframe[dataframe.CHARTTIME < min_time + pd.Timedelta(days=_days)]
def getText(t):
return " ".join(list(preprocess_mimic(t)))
def getSentences(t):
return list(preprocess_mimic(t))
# df_filtered = df2.groupby('HADM_ID').apply(
# lambda x: filter_for_first_hrs(x, 2))
# print(df_filtered.shape)
print(df2.groupby('HADM_ID').count().describe())
'''
count 55926.000000 55926.000000 55926.000000
mean 28.957283 28.957283 28.957283
std 59.891679 59.891679 59.891679
min 1.000000 1.000000 1.000000
25% 5.000000 5.000000 5.000000
50% 11.000000 11.000000 11.000000
75% 27.000000 27.000000 27.000000
max 1214.000000 1214.000000 1214.000000
'''
dataset_path = r'scripts/'
all_files = os.listdir(dataset_path)
all_folders = list(filter(lambda x: x.isdigit(), all_files))
output_folder = r'scripts/'
succeed = 0
failed = 0
failed_exception = 0
hadm_id2index = {}
for folder in all_folders:
    try:
        patient_id = int(folder)
        sliced = df2[df2.SUBJECT_ID == patient_id]
        if sliced.shape[0] == 0:
            print("No notes for PATIENT_ID : {}".format(patient_id))
            failed += 1
            continue
        # sort_values returns a new DataFrame, so assign the result back.
        sliced = sliced.sort_values(by='CHARTTIME')
        # get the HADM_IDs from the stays.csv.
        stays_path = os.path.join(dataset_path, folder, 'stays.csv')
        stays_df = pd.read_csv(stays_path)
        hadm_ids = list(stays_df.HADM_ID.values)
        for ind, hid in enumerate(hadm_ids):
            hadm_id2index[str(hid)] = str(ind)
            # Filter into a new variable instead of overwriting `sliced`,
            # so notes for the patient's later admissions are not lost.
            hadm_notes = sliced[sliced.HADM_ID == hid]
            data_json = {}
            for index, row in hadm_notes.iterrows():
                data_json["{}".format(row['CHARTTIME'])] = getSentences(row['TEXT'])
            with open(os.path.join(output_folder, folder + '_' + str(ind + 1)), 'w') as f:
                json.dump(data_json, f)
        succeed += 1
    except Exception:
        import traceback
        traceback.print_exc()
        print("Failed with Exception FOR Patient ID: %s" % folder)
        failed_exception += 1
print("Successfully Completed: %d/%d" % (succeed, len(all_folders)))
print("No Notes for Patients: %d/%d" % (failed, len(all_folders)))
print("Failed with Exception: %d/%d" % (failed_exception, len(all_folders)))
with open(os.path.join(output_folder, 'test_hadm_id2index'), 'w') as f:
    json.dump(hadm_id2index, f)
Code analysis
What each part does
Import the required modules and libraries:
scipy.stats: statistical functions (imported here but never actually used in this script).
os: file and directory handling.
pandas: data loading and manipulation.
re: regular expressions, used for text matching and replacement.
json: reading and writing JSON files.
nltk: natural language processing; here, sentence and word tokenization.
from scipy import stats
import os
import pandas as pd
"""
Preprocess PubMed abstracts or MIMIC-III reports
"""
import re
import json
from nltk import sent_tokenize, word_tokenize
Define the regular-expression pattern:
SECTION_TITLES: matches the section headings that appear in reports (e.g. FINDINGS:, IMPRESSION:), as well as the standalone FINAL REPORT marker.
SECTION_TITLES = re.compile(
r'('
r'ABDOMEN AND PELVIS|CLINICAL HISTORY|CLINICAL INDICATION|COMPARISON|COMPARISON STUDY DATE'
r'|EXAM|EXAMINATION|FINDINGS|HISTORY|IMPRESSION|INDICATION'
r'|MEDICAL CONDITION|PROCEDURE|REASON FOR EXAM|REASON FOR STUDY|REASON FOR THIS EXAMINATION'
r'|TECHNIQUE'
r'):|FINAL REPORT',
re.I | re.M)
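To see what this pattern captures, you can run it over a small made-up fragment (illustrative text, not real MIMIC-III data):
sample = "FINAL REPORT\nINDICATION: Chest pain.\nFINDINGS: No acute process."
for m in SECTION_TITLES.finditer(sample):
    print(repr(m.group(0)))
# prints 'FINAL REPORT', 'INDICATION:', 'FINDINGS:' in document order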
Define helper functions:
pattern_repl(matchobj): returns a run of spaces the same length as the match, so replacements preserve character offsets.
find_end(text): finds where the report body ends, i.e. the position of the first signature/footer marker.
split_heading(text): splits the report into sections at the matched headings.
clean_text(text): blanks out de-identification placeholders and the trailing signature while keeping the overall text length unchanged.
preprocess_mimic(text): preprocesses a MIMIC-III report end to end: removes placeholders and signatures, splits it into sections, tokenizes sentences and words, and lowercases.
def pattern_repl(matchobj):
"""
Return a replacement string to be used for match object
"""
return ' '.rjust(len(matchobj.group(0)))
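A one-line check of the length-preserving replacement, on a hypothetical placeholder:
print(repr(re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, 'DOB [**2100-1-1**] noted')))
# 'DOB                noted' -- the 14-character placeholder becomes 14 spaces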
def find_end(text):
"""Find the end of the report."""
ends = [len(text)]
patterns = [
re.compile(r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
        re.compile(r'\n {3,}DR\.', re.I),  # escape the dot so it matches "DR." literally
re.compile(r'[ ]{1,}RADLINE ', re.I),
re.compile(r'.*electronically signed on', re.I),
re.compile(r'M\[0KM\[0KM')
]
for pattern in patterns:
matchobj = pattern.search(text)
if matchobj:
ends.append(matchobj.start())
return min(ends)
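For example, with a fabricated footer marker appended (not real report text), find_end returns the offset where the footer begins:
report = "IMPRESSION: Normal study.   RADLINE 1234; signed."
print(find_end(report), len(report))
# the first number is where the '   RADLINE ' footer starts;
# with no marker present, find_end would return len(report)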
def split_heading(text):
"""Split the report into sections"""
start = 0
for matcher in SECTION_TITLES.finditer(text):
# add last
end = matcher.start()
if end != start:
section = text[start:end].strip()
if section:
yield section
# add title
start = end
end = matcher.end()
if end != start:
section = text[start:end].strip()
if section:
yield section
start = end
# add last piece
end = len(text)
if start < end:
section = text[start:end].strip()
if section:
yield section
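Combined with SECTION_TITLES, the generator alternates headings and their contents; a minimal sketch on made-up text:
print(list(split_heading("INDICATION: Chest pain. FINDINGS: No acute process.")))
# ['INDICATION:', 'Chest pain.', 'FINDINGS:', 'No acute process.']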
def clean_text(text):
"""
Clean text
"""
# Replace [**Patterns**] with spaces.
text = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
# Replace `_` with spaces.
text = re.sub(r'_', ' ', text)
start = 0
end = find_end(text)
new_text = ''
if start > 0:
new_text += ' ' * start
    new_text += text[start:end]  # append, so the leading padding (if any) is kept
# make sure the new text has the same length of old text.
if len(text) - end > 0:
new_text += ' ' * (len(text) - end)
return new_text
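Because both replacement steps pad with spaces, character offsets into the original string stay valid. A sketch on fabricated text:
note = "Pt is [**Name**] stable.\nelectronically signed on 1/1"
cleaned = clean_text(note)
print(len(cleaned) == len(note))  # True: padding preserves the length
print(repr(cleaned))  # placeholder and the signature line are blanked to spaces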
def preprocess_mimic(text):
"""
Preprocess reports in MIMIC-III.
1. remove [**Patterns**] and signature
2. split the report into sections
3. tokenize sentences and words
4. lowercase
"""
for sec in split_heading(clean_text(text)):
for sent in sent_tokenize(sec):
text = ' '.join(word_tokenize(sent))
yield text.lower()
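End to end, on a fabricated report. This assumes the NLTK tokenizer models are installed, e.g. via nltk.download('punkt') (recent NLTK versions may also need 'punkt_tab'):
sample = "FINDINGS: No acute process. IMPRESSION: [**Name**] is stable."
print(list(preprocess_mimic(sample)))
# ['findings :', 'no acute process .', 'impression :', 'is stable .']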
Read the MIMIC-III dataset file:
df = pd.read_csv('scripts/NOTEEVENTS.csv', low_memory=False):
reads NOTEEVENTS.csv and stores it as the DataFrame df.
Data preprocessing:
Filter df to keep only rows where SUBJECT_ID, HADM_ID, CHARTTIME, and TEXT are all non-null.
Select the SUBJECT_ID, HADM_ID, CHARTTIME, and TEXT columns into a new DataFrame df2, then delete df to free memory.
df = pd.read_csv('scripts/NOTEEVENTS.csv', low_memory=False)  # forward slashes work on all platforms
df.CHARTDATE = pd.to_datetime(df.CHARTDATE)
df.CHARTTIME = pd.to_datetime(df.CHARTTIME)
df.STORETIME = pd.to_datetime(df.STORETIME)
df2 = df[df.SUBJECT_ID.notnull()]
df2 = df2[df2.HADM_ID.notnull()]
df2 = df2[df2.CHARTTIME.notnull()]
df2 = df2[df2.TEXT.notnull()]
df2 = df2[['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'TEXT']]
del df
Define helper functions:
filter_for_first_hrs(dataframe, _days=2): keeps only notes charted within _days days of the group's earliest note (despite the "hrs" in the name, the cutoff is measured in days).
getText(t): preprocesses a report and returns it as one space-joined string.
getSentences(t): preprocesses a report and returns the list of preprocessed sentences.
def filter_for_first_hrs(dataframe, _days=2):
min_time = dataframe.CHARTTIME.min()
return dataframe[dataframe.CHARTTIME < min_time + pd.Timedelta(days=_days)]
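A quick sketch of the cutoff on a hand-made DataFrame (illustrative dates only):
demo = pd.DataFrame({'CHARTTIME': pd.to_datetime(['2100-01-01', '2100-01-02', '2100-01-05'])})
print(filter_for_first_hrs(demo, _days=2))
# keeps only the first two rows, charted before 2100-01-01 + 2 days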
def getText(t):
return " ".join(list(preprocess_mimic(t)))
def getSentences(t):
return list(preprocess_mimic(t))
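The two wrappers differ only in how they join the generator's output (same NLTK assumption as above):
sample = "IMPRESSION: Pt is stable."
print(getSentences(sample))  # ['impression :', 'pt is stable .']
print(getText(sample))       # 'impression : pt is stable .'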
Group by HADM_ID and apply filter_for_first_hrs to keep only the earliest notes (this part of the code is commented out).
Count the number of notes per HADM_ID and print descriptive statistics; the block below shows the output (a median of 11 and a maximum of 1214 notes per admission).
# df_filtered = df2.groupby('HADM_ID').apply(
# lambda x: filter_for_first_hrs(x, 2))
# print(df_filtered.shape)
print(df2.groupby('HADM_ID').count().describe())
'''
count 55926.000000 55926.000000 55926.000000
mean 28.957283 28.957283 28.957283
std 59.891679 59.891679 59.891679
min 1.000000 1.000000 1.000000
25% 5.000000 5.000000 5.000000
50% 11.000000 11.000000 11.000000
75% 27.000000 27.000000 27.000000
max 1214.000000 1214.000000 1214.000000
'''
Set the dataset path and the output folder path; patient folders are identified as the directory entries whose names consist only of digits.
dataset_path = r'scripts/'
all_files = os.listdir(dataset_path)
all_folders = list(filter(lambda x: x.isdigit(), all_files))
output_folder = r'scripts/'
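Only names made up entirely of digits pass the filter; a hypothetical listing:
print(list(filter(lambda x: x.isdigit(), ['10011', 'stays', '10026.txt'])))
# ['10011']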
Define counters for patients processed successfully, patients with no notes, and patients that raised an exception.
succeed = 0
failed = 0
failed_exception = 0
Iterate over the folders in the dataset:
a. Try to convert the folder name to an integer; this is the patient's SUBJECT_ID.
b. Select that patient's notes. If there are none, print a message, increment the failure counter, and continue with the next folder.
c. Sort the selected notes by CHARTTIME.
d. Read the list of HADM_IDs from the patient's stays.csv.
e. For each HADM_ID and its index:
- record the HADM_ID-to-index mapping in the hadm_id2index dictionary;
- select the notes belonging to the current HADM_ID;
- iterate over those notes, storing each note's CHARTTIME and preprocessed sentence list in the data_json dictionary;
- write the data_json dictionary to a file in JSON format.
f. Increment the success counter.
g. On an exception, print the traceback and increment the exception counter.
for folder in all_folders:
    try:
        patient_id = int(folder)
        sliced = df2[df2.SUBJECT_ID == patient_id]
        if sliced.shape[0] == 0:
            print("No notes for PATIENT_ID : {}".format(patient_id))
            failed += 1
            continue
        # sort_values returns a new DataFrame, so assign the result back.
        sliced = sliced.sort_values(by='CHARTTIME')
        # get the HADM_IDs from the stays.csv.
        stays_path = os.path.join(dataset_path, folder, 'stays.csv')
        stays_df = pd.read_csv(stays_path)
        hadm_ids = list(stays_df.HADM_ID.values)
        for ind, hid in enumerate(hadm_ids):
            hadm_id2index[str(hid)] = str(ind)
            # Filter into a new variable instead of overwriting `sliced`,
            # so notes for the patient's later admissions are not lost.
            hadm_notes = sliced[sliced.HADM_ID == hid]
            data_json = {}
            for index, row in hadm_notes.iterrows():
                data_json["{}".format(row['CHARTTIME'])] = getSentences(row['TEXT'])
            with open(os.path.join(output_folder, folder + '_' + str(ind + 1)), 'w') as f:
                json.dump(data_json, f)
        succeed += 1
    except Exception:
        import traceback
        traceback.print_exc()
        print("Failed with Exception FOR Patient ID: %s" % folder)
        failed_exception += 1
print("Successfully Completed: %d/%d" % (succeed, len(all_folders)))
print("No Notes for Patients: %d/%d" % (failed, len(all_folders)))
print("Failed with Exception: %d/%d" % (failed_exception, len(all_folders)))
with open(os.path.join(output_folder, 'test_hadm_id2index'), 'w') as f:
    json.dump(hadm_id2index, f)