全扫描病理图片对应的xml格式文件的标注转为mask图像

最新推荐文章于 2024-04-18 15:29:03 发布

七白学长

最新推荐文章于 2024-04-18 15:29:03 发布

阅读量1.4k

点赞数 4

文章标签： xml python 人工智能

本文链接：https://blog.csdn.net/wangshuhuan1/article/details/125980415

版权

提示：文章写完后，目录可以自动生成，如何生成可参考右边的帮助文档

文章目录

1、xml to region
2、轮廓
3、标注边缘提取
4、xml文件的对应解析
5、调用
总结

1、xml to region

解析xml标注文件，提取每个Region的顶点坐标列表及其类型(Type属性)。
代码如下（示例）：

def xml_to_region(xml_file, line_colors=('65280', '255', '65535')):
    """
    Parse an XML annotation file and collect the vertices of every region.

    :param xml_file: XML annotation file, handed straight to ``ET.parse``
    :param line_colors: ``LineColor`` values of the ``Annotation`` elements to keep
                    ('65280' is green, '255' is red); pass ``None`` to keep every
                    annotation regardless of its color
    :return: region list, region_class
             region list  -- one list per region holding the attribute mapping of
                             each ``Vertex`` element (its 'X' and 'Y' entries)
             region_class -- the ``Type`` attribute of each region, in the same order
    """

    tree = ET.parse(xml_file)
    region_list = []
    region_class = []
    for annotation in tree.findall('.//Annotation'):
        # .get() rather than [] so that an annotation without a LineColor
        # attribute is simply filtered out instead of raising KeyError.
        if line_colors is not None and annotation.attrib.get('LineColor') not in line_colors:
            continue
        for region in annotation.findall('./Regions/Region'):
            region_class.append(region.attrib.get('Type'))
            region_list.append([vertex.attrib for vertex in region.findall('.//Vertices/Vertex')])

    return region_list, region_class

2、轮廓

将标注区域的顶点坐标按下采样倍率缩放后，在图像上绘制成轮廓线。原先按坐标极差过滤小轮廓的逻辑已被注释掉，以免小轮廓被过滤而不显示。


def region_handler(im, region_list, region_class, level_downsample):
    """
    Scale the labelled region vertices to the image resolution and draw each
    region outline onto the image.

    :param im: the image the region outlines are drawn on (modified in place)
    :param region_list: region list, region point,
                    eg : [[{'X': '27381.168113', 'Y': '37358.653791'}], [{'X': '27381.168113', 'Y': '37358.653791'}]]
    :param region_class : list holding the value of region.attrib.get('Type') for
                    each element of region list; only '0' and '3' are drawn
    :param level_downsample: slide level down sample; every coordinate is divided by it
    :return: ``im`` itself, with the outlines drawn on it
    """

    canvas = ImageDraw.Draw(im)
    for idx, vertices in enumerate(region_list):
        kind = region_class[idx]
        if kind not in ('0', '3'):
            continue

        scaled = [
            (int(float(vertex['X']) / level_downsample), int(float(vertex['Y']) / level_downsample))
            for vertex in vertices
        ]

        # An earlier bounding-box check (x or y extent < 50) used to skip regions
        # here; it was disabled because it hid small outlines.
        # -- by Bohrium Kwong 20201125

        if kind == '3':
            # type '3': the scaled points are passed to arc() as its bounding box
            canvas.arc(scaled, 0, 360, fill='#000000', width=12)
        else:
            canvas.line(scaled, fill="#000000", width=12)

    return im

3、标注边缘提取

代码如下（示例）：

def _stitch_open_regions(regions, join_threshold):
    """Join open outlines whose end points lie within join_threshold of each other."""
    consumed = set()  # indices of outlines already merged into another outline
    stitched = []     # resulting outlines
    for idx, points in enumerate(regions):
        if idx in consumed:
            continue
        if dist(points[0], points[-1]) < join_threshold:
            # First and last point are close together: treat the outline as
            # closed and keep it unchanged.
            stitched.append(points)
            continue
        # Otherwise (distance >= join_threshold) the outline is open: try to
        # extend it with every other outline that has not been consumed yet.
        for other_idx, other in enumerate(regions):
            if other_idx == idx or other_idx in consumed:
                continue
            if dist(points[-1], other[0]) < join_threshold:
                points = points + other          # tail -> head
            elif dist(points[0], other[-1]) < join_threshold:
                points = other + points          # head <- tail
            elif dist(points[-1], other[-1]) < join_threshold:
                points = points + other[::-1]    # tail -> tail, other reversed
            elif dist(points[0], other[0]) < join_threshold:
                points = other[::-1] + points    # head <- head, other reversed
            else:
                continue
            # Mark the merged outline so later iterations do not use it again.
            consumed.add(other_idx)
        stitched.append(points)
    return stitched


def region_binary_image(tile, region_list, region_class, level_downsample, label_correction=True,
                        join_threshold=50):
    """
    Convert the regions labelled by the doctor into a binary mask.

    Only regions whose class is '0' are used; regions without any vertex are ignored.

    :param tile: image whose ``size`` determines the size of the mask
                 (a return image of the Slide class in 'utils.openslide_utils')
    :param region_list: region list, region point,
                    eg : [[{'X': '27381.168113', 'Y': '37358.653791'}], [{'X': '27381.168113', 'Y': '37358.653791'}]]
    :param region_class : list holding the value of region.attrib.get('Type') for
                    each element of region list
    :param level_downsample: slide level down sample; every coordinate is divided by it
    :param label_correction: when True, open outlines are stitched together and the
                    mask is cleaned with a morphological open followed by a close.
                    XML files generated from segmentation results can contain very
                    small regions, so the correction can be switched off.
    :param join_threshold: ``dist`` value below which two end points count as
                    connected (default 50, the value previously hard-coded)
    :return: numpy uint8 array obtained from the mode "1" mask image
    """
    # NOTE(review): ``dist`` is not defined in this block and must be available at
    # module level; the original comments call it a Manhattan distance -- confirm.
    im = Image.new(mode="1", size=tile.size)
    dr = ImageDraw.Draw(im)
    regions_list = []
    for idx, region in enumerate(region_list):
        if region_class[idx] != '0':
            continue
        point_list = [
            (int(float(point['X']) / level_downsample), int(float(point['Y']) / level_downsample))
            for point in region
        ]
        # An empty outline has no end points to compare and nothing to fill.
        if point_list:
            regions_list.append(point_list)

    if label_correction:
        outlines = _stitch_open_regions(regions_list, join_threshold)
    else:
        outlines = regions_list

    for points in outlines:
        dr.polygon(points, fill="#ffffff")
    filter_matrix = np.array(im).astype(np.uint8)

    if label_correction:
        # Besides open outlines, the doctors' labels sometimes contain superfluous
        # strokes. There is no complete approach for those yet, so the mask is
        # patched with an OpenCV open + close combination.
        kernel = np.ones((15, 15), np.uint8)
        filter_matrix = cv2.morphologyEx(filter_matrix, cv2.MORPH_OPEN, kernel)
        filter_matrix = cv2.morphologyEx(filter_matrix, cv2.MORPH_CLOSE, kernel)

    return filter_matrix

4、xml文件的对应解析

代码如下（示例）：

def contours_to_xml(savepath, contours, if_add=False, level_downsample=16, mpp="0.252100",
                    linecolor="16711680", contour_area_threshold=2000):
    """
    Write the contours of a mask as an xml format label file for an svs file.

    The mask is expected to have the size of the svs file's level_dimensions at
    level 2; contour coordinates are multiplied by ``level_downsample`` before
    they are written.

    :param savepath : file path the xml format label file is written to
    :param contours : contours list returned by cv2.findContours of the mask
    :param if_add : when True and ``savepath`` already exists, the Annotation
                    elements of the existing file are kept and the new
                    Annotation is added after them; default False
    :param level_downsample : the value of slide.level_downsamples[2]
    :param mpp : the value of MicronsPerPixel in slide.properties['openslide.mpp-x']
    :param linecolor : decimal color code used to draw the contours in the label
                    file (the default is described as blue); replaced by
                    "13382297" when an existing Annotation already uses it
    :param contour_area_threshold : contours whose cv2.contourArea is not larger
                    than this value are dropped
    :return: None
    """
    ann_begin_tag = 1
    Annotations = ET.Element('Annotations', {'MicronsPerPixel': mpp})
    origin_color_list = []
    if if_add and os.path.exists(savepath):
        origin = ET.parse(savepath)
        existing = origin.findall('.//Annotation')
        ann_begin_tag = len(existing) + 1
        for ann in existing:
            # .get() so an existing annotation without LineColor does not abort the export
            origin_color_list.append(ann.attrib.get('LineColor'))
            Annotations.append(ann)

    # Avoid reusing a color that is already present in the existing label file.
    if linecolor in origin_color_list:
        linecolor = "13382297"
    Annotation = ET.SubElement(Annotations, 'Annotation',
                               {'Id': str(ann_begin_tag), 'Name': '', 'ReadOnly': '0', 'NameReadOnly': '0',
                                'LineColorReadOnly': '0', 'Incremental': '0', 'Type': '4',
                                'LineColor': linecolor, 'Visible': '1', 'Selected': '1',
                                'MarkupImagePath': '', 'MacroName': ''})
    Attributes = ET.SubElement(Annotation, 'Attributes')
    ET.SubElement(Attributes, 'Attribute', {'Name': '', 'Id': '0', 'Value': ''})
    Regions = ET.SubElement(Annotation, 'Regions')
    RegionAttributeHeaders = ET.SubElement(Regions, 'RegionAttributeHeaders')
    for header_id, header_name in (("9999", 'Region'), ("9997", 'Length'), ("9996", 'Area'),
                                   ("9998", 'Text'), ("1", 'Description')):
        ET.SubElement(RegionAttributeHeaders, 'AttributeHeader',
                      {'Id': header_id, 'Name': header_name, 'ColumnWidth': '-1'})

    region_id = 1
    for cnt in contours:
        contour_area = cv2.contourArea(cnt)
        if contour_area <= contour_area_threshold:
            continue
        # reshape instead of squeeze: a one-point contour must stay two-dimensional
        points = np.asarray(cnt).reshape(-1, 2)
        Region = ET.SubElement(Regions, 'Region',
                               {'Id': str(region_id), 'Type': '0', 'Zoom': '0.011', 'Selected': '0',
                                'ImageLocation': '', 'ImageFocus': '-1', 'Length': str(points.shape[0]),
                                'Area': str(level_downsample**2*contour_area),
                                'LengthMicrons': '0', 'AreaMicrons': '0', 'Text': '', 'NegativeROA': '0',
                                'InputRegionId': '0', 'Analyze': '1', 'DisplayId': str(region_id)})
        ET.SubElement(Region, 'Attributes')
        Vertices = ET.SubElement(Region, 'Vertices')
        for x, y in points:
            ET.SubElement(Vertices, 'Vertex', {'X': str(x*level_downsample), 'Y': str(y*level_downsample)})
        region_id += 1
    ET.SubElement(Annotation, 'Plots')
    doc = ET.ElementTree(Annotations)
    # Close the output file deterministically instead of leaking the handle.
    # NOTE(review): pretty_print is an lxml keyword, so ET is presumably lxml.etree -- confirm.
    with open(savepath, "wb") as xml_out:
        doc.write(xml_out, pretty_print=True)

5、调用

if __name__ == '__main__':
    import sys

    sys.path.append('../')
    from utils.openslide_utils import Slide
    import matplotlib.pyplot as plt
	#图片文件输入路径
    files = os.listdir("input_path")
    i = 0
    j = 0
for file in files:
    plt.rcParams['figure.figsize'] = 15, 15
    slide = Slide('input_path' + os.path.splitext(file)[0] + '.tif')
    try:
        xml_file = 'xml_path' + os.path.splitext(file)[0] + '.xml'

        tile = slide.get_thumb()  # 获取2级采样下的全片截图
        region_list, region_class = xml_to_region(xml_file)
        # 在这里使用xml_utils的方法进行指定区域提取(最终返回的是个True False矩阵)
        region_process_mask = region_binary_image(tile, region_list, region_class, slide.get_level_downsample())
        #    # 根据上述返回的标注坐标列表生成WSI原图2级采样大小的True False矩阵
        #    region_label = region_handler(tile, region_list, slide.get_level_downsample())
        #    plt.imshow()
        plt.imshow(region_process_mask)
        plt.show()