提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
1、xml to region
提取region部分的相关信息。
代码如下(示例):
def xml_to_region(xml_file, line_colors=('65280', '255', '65535')):
    """
    Parse an XML label file and collect the vertices of every region.

    :param xml_file: XML label file; passed unchanged to ``ET.parse``
    :param line_colors: ``LineColor`` values of the annotations to keep.
        The default keeps '65280' (green), '255' (red) and '65535'.
        Pass ``None`` to keep every annotation regardless of its colour.
    :return: ``(region_list, region_class)``.
        ``region_list`` holds one list per region, each entry being the
        attribute dict of a vertex, e.g. ``{'X': '27381.1', 'Y': '37358.6'}``.
        ``region_class`` holds the ``Type`` attribute of the region at the
        same index (``None`` when the attribute is absent).
    """
    tree = ET.parse(xml_file)
    region_list = []
    region_class = []
    for annotation in tree.findall('.//Annotation'):
        # .get() instead of [...]: an annotation without a LineColor attribute
        # is simply filtered out instead of aborting the parse with KeyError.
        if line_colors is not None and annotation.attrib.get('LineColor') not in line_colors:
            continue
        for region in annotation.findall('./Regions/Region'):
            region_class.append(region.attrib.get('Type'))
            # Copy each attribute mapping into a plain dict so the result does
            # not keep references into the parsed tree.
            region_list.append(
                [dict(vertex.attrib) for vertex in region.findall('.//Vertices/Vertex')]
            )
    return region_list, region_class
2、轮廓
将标注坐标点按采样倍率缩放后绘制成轮廓线;为防止小轮廓被误过滤而不显示,原先按坐标范围过滤的逻辑已被注释掉。
def region_handler(im, region_list, region_class, level_downsample):
    """
    Scale the labelled region points to the working level and draw them on ``im``.

    Regions of type '0' are drawn as a poly-line through their points; regions
    of type '3' are drawn as a full 0-360 degree arc inside the bounding box
    spanned by their points. Every other region type is ignored.

    :param im: image to draw on; it is modified in place
    :param region_list: region list, region point,
        eg : [[{'X': '27381.168113', 'Y': '37358.653791'}], [{'X': '27381.168113', 'Y': '37358.653791'}]]
    :param region_class: list holding the value of region.attrib.get('Type')
        for the element of ``region_list`` at the same index,
        eg : ['0', '0', '0', '1', '2', '3']
    :param level_downsample: slide level down sample; coordinates are divided by it
    :return: the same ``im`` object with the region outlines painted on it
    """
    dr = ImageDraw.Draw(im)
    for r_type, region in zip(region_class, region_list):
        if r_type not in ('0', '3'):
            continue
        point_list = [
            (int(float(point['X']) / level_downsample),
             int(float(point['Y']) / level_downsample))
            for point in region
        ]
        if not point_list:
            # A region without vertices has nothing to draw.
            continue
        # NOTE: an earlier version discarded regions whose x or y extent was
        # below 50 px as "mislabelled". That check hid small contours, so it
        # was disabled (Bohrium Kwong, 2020-11-25) and is intentionally absent.
        if r_type == '3':
            # ImageDraw.arc takes a bounding box [(x0, y0), (x1, y1)] with
            # x1 >= x0 and y1 >= y0, so build it from the coordinate extremes
            # instead of relying on the order the vertices were stored in.
            xs = [x for x, _ in point_list]
            ys = [y for _, y in point_list]
            bbox = [(min(xs), min(ys)), (max(xs), max(ys))]
            dr.arc(bbox, 0, 360, fill='#000000', width=12)
        else:
            dr.line(point_list, fill="#000000", width=12)
    return im
3、标注边缘提取
代码如下(示例):
def _merge_open_regions(regions, merge_distance):
    """Stitch outlines whose end points lie within ``merge_distance`` of each other."""
    used = set()  # indices of outlines already appended onto another outline
    merged = []
    for j, p_list in enumerate(regions):
        if j in used:
            continue
        # An outline whose first and last point are at least merge_distance
        # apart is treated as open (incomplete). Using ">=" here keeps the
        # boundary case, which "< d" / "> d" tests would silently drop.
        if dist(p_list[0], p_list[-1]) >= merge_distance:
            for j_2, p_list_2 in enumerate(regions):
                if j_2 == j or j_2 in used:
                    continue
                if dist(p_list[-1], p_list_2[0]) < merge_distance:
                    # tail -> head: append the other outline as it is
                    p_list = p_list + p_list_2
                elif dist(p_list[0], p_list_2[-1]) < merge_distance:
                    # head <- tail: prepend the other outline as it is
                    p_list = p_list_2 + p_list
                elif dist(p_list[-1], p_list_2[-1]) < merge_distance:
                    # tail -> tail: append the other outline reversed
                    p_list = p_list + p_list_2[::-1]
                elif dist(p_list[0], p_list_2[0]) < merge_distance:
                    # head <- head: prepend the other outline reversed
                    p_list = p_list_2[::-1] + p_list
                else:
                    continue
                # Remember the consumed outline so later iterations skip it.
                used.add(j_2)
        merged.append(p_list)
    return merged


def region_binary_image(tile, region_list, region_class, level_downsample, label_correction=True,
                        merge_distance=50, kernel_size=15):
    """
    Rasterise the labelled type-'0' regions into a binary mask.

    :param tile: image whose ``size`` defines the size of the mask (a return
        image of the Slide class in 'utils.openslide_utils')
    :param region_list: region list, region point,
        eg : [[{'X': '27381.168113', 'Y': '37358.653791'}], [{'X': '27381.168113', 'Y': '37358.653791'}]]
    :param region_class: list holding the value of region.attrib.get('Type')
        for the element of ``region_list`` at the same index,
        eg : ['0', '0', '0', '1', '2', '3']
    :param level_downsample: slide level down sample; coordinates are divided by it
    :param label_correction: whether to correct the labels. When true, open
        outlines are stitched together and the mask is cleaned with a
        morphological open followed by a close. Turn it off for XML files
        produced from segmentation results, which contain very small regions.
    :param merge_distance: threshold, as measured by the module's ``dist``
        helper, below which two end points count as touching
    :param kernel_size: side length of the square kernel used for the
        morphological clean-up
    :return: the mask as a numpy array of dtype uint8
    """
    im = Image.new(mode="1", size=tile.size)
    dr = ImageDraw.Draw(im)

    regions_list = []
    for r_type, region in zip(region_class, region_list):
        if r_type != '0':
            continue
        point_list = [
            (int(float(point['X']) / level_downsample),
             int(float(point['Y']) / level_downsample))
            for point in region
        ]
        # Skip regions without vertices: the stitching step indexes the
        # first and last point of every outline.
        if point_list:
            regions_list.append(point_list)

    if label_correction:
        regions_list = _merge_open_regions(regions_list, merge_distance)

    for points in regions_list:
        dr.polygon(points, fill="#ffffff")
    filter_matrix = np.array(im).astype(np.uint8)

    if label_correction:
        # Besides non-closed outlines, the doctors' labels also contain
        # superfluous strokes. There is no complete approach for those yet,
        # so an open + close pass is used to patch the mask.
        kernel = np.ones((kernel_size, kernel_size), np.uint8)
        filter_matrix = cv2.morphologyEx(filter_matrix, cv2.MORPH_OPEN, kernel)
        filter_matrix = cv2.morphologyEx(filter_matrix, cv2.MORPH_CLOSE, kernel)
    return filter_matrix
4、xml文件的对应生成(将轮廓写回xml标注文件)
代码如下(示例):
def contours_to_xml(savepath, contours, if_add=False, level_downsample=16, mpp="0.252100",
                    linecolor="16711680", contour_area_threshold=2000):
    """
    Write the contours of a mask of an svs file as an xml format label file.

    Make sure the size of the mask equals the svs file's level_dimensions in
    level 2, so that multiplying by ``level_downsample`` yields level-0
    coordinates.

    :param savepath: path the xml format label file is written to
    :param contours: contours list returned from cv2.findContours of the mask
    :param if_add: add the new Annotation to an existing xml format label file
        at ``savepath`` instead of starting from scratch, default False
    :param level_downsample: the value of slide.level_downsamples[2]
    :param mpp: the value of MicronsPerPixel in slide.properties['openslide.mpp-x']
    :param linecolor: decimal colour code used to draw the contours in the xml
        format label file, default colour is blue. If an existing annotation
        already uses it, "13382297" is used instead.
    :param contour_area_threshold: contours whose cv2.contourArea is not
        greater than this value are dropped, which keeps only the big
        contours in the xml format label file
    :return: None
    """
    ann_begin_tag = 1
    # str(): XML attribute values must be strings, so a numeric mpp is accepted too.
    Annotations = ET.Element('Annotations', {'MicronsPerPixel': str(mpp)})
    origin_color_list = []
    if if_add and os.path.exists(savepath):
        existing = ET.parse(savepath).findall('.//Annotation')
        ann_begin_tag = len(existing) + 1
        for ann in existing:
            # .get(): an annotation without LineColor must not abort the export.
            origin_color_list.append(ann.attrib.get('LineColor'))
            Annotations.append(ann)
    if linecolor in origin_color_list:
        # Fall back to another colour so the new annotation stays distinguishable.
        linecolor = "13382297"

    Annotation = ET.SubElement(Annotations, 'Annotation',
                               {'Id': str(ann_begin_tag), 'Name': '', 'ReadOnly': '0', 'NameReadOnly': '0',
                                'LineColorReadOnly': '0', 'Incremental': '0', 'Type': '4',
                                'LineColor': linecolor, 'Visible': '1', 'Selected': '1',
                                'MarkupImagePath': '', 'MacroName': ''})
    Attributes = ET.SubElement(Annotation, 'Attributes')
    ET.SubElement(Attributes, 'Attribute', {'Name': '', 'Id': '0', 'Value': ''})
    Regions = ET.SubElement(Annotation, 'Regions')
    RegionAttributeHeaders = ET.SubElement(Regions, 'RegionAttributeHeaders')
    for header_id, header_name in (("9999", 'Region'), ("9997", 'Length'), ("9996", 'Area'),
                                   ("9998", 'Text'), ("1", 'Description')):
        ET.SubElement(RegionAttributeHeaders, 'AttributeHeader',
                      {'Id': header_id, 'Name': header_name, 'ColumnWidth': '-1'})

    region_id = 1
    for cnt in contours:
        contour_area = cv2.contourArea(cnt)
        if contour_area <= contour_area_threshold:
            continue
        # reshape(-1, 2) instead of squeeze(): squeeze() collapses a
        # single-point contour to shape (2,), which cannot be indexed as
        # [j, 0] / [j, 1].
        points = np.asarray(cnt).reshape(-1, 2)
        Region = ET.SubElement(Regions, 'Region',
                               {'Id': str(region_id), 'Type': '0', 'Zoom': '0.011', 'Selected': '0',
                                'ImageLocation': '', 'ImageFocus': '-1', 'Length': str(len(points)),
                                'Area': str(level_downsample ** 2 * contour_area),
                                'LengthMicrons': '0', 'AreaMicrons': '0', 'Text': '', 'NegativeROA': '0',
                                'InputRegionId': '0', 'Analyze': '1', 'DisplayId': str(region_id)})
        ET.SubElement(Region, 'Attributes')
        Vertices = ET.SubElement(Region, 'Vertices')
        for x, y in points:
            # Scale the mask coordinates back up by the downsample factor.
            ET.SubElement(Vertices, 'Vertex',
                          {'X': str(x * level_downsample), 'Y': str(y * level_downsample)})
        region_id += 1

    ET.SubElement(Annotation, 'Plots')
    doc = ET.ElementTree(Annotations)
    # Context manager so the file handle is flushed and closed deterministically.
    # NOTE(review): pretty_print is an lxml.etree option - this assumes ET is
    # lxml.etree rather than xml.etree.ElementTree; confirm against the imports.
    with open(savepath, "wb") as xml_out:
        doc.write(xml_out, pretty_print=True)
5、调用
if __name__ == '__main__':
    # Demo: for every slide in the input directory, read the matching XML
    # label file, build the binary mask of the labelled regions and show it.
    import sys
    sys.path.append('../')
    from utils.openslide_utils import Slide
    import matplotlib.pyplot as plt

    # Figure size is a global setting, so set it once rather than per file.
    plt.rcParams['figure.figsize'] = 15, 15

    # "input_path" is the directory holding the slide images and "xml_path"
    # the directory holding the label files; replace both placeholders.
    for file in os.listdir("input_path"):
        stem = os.path.splitext(file)[0]
        # os.path.join so a missing trailing separator cannot glue the
        # directory name and the file name together.
        slide = Slide(os.path.join('input_path', stem + '.tif'))
        xml_file = os.path.join('xml_path', stem + '.xml')
        try:
            tile = slide.get_thumb()  # whole-slide thumbnail at level-2 downsampling
            region_list, region_class = xml_to_region(xml_file)
            # Extract the labelled regions as a mask with the same size as the thumbnail.
            region_process_mask = region_binary_image(tile, region_list, region_class,
                                                      slide.get_level_downsample())
            # To draw the outlines on the thumbnail instead, use:
            # region_label = region_handler(tile, region_list, region_class, slide.get_level_downsample())
        except Exception as exc:
            # Top-level script boundary: report the failing file and carry on
            # with the remaining slides instead of aborting the whole run.
            print('failed to process %s: %s' % (file, exc), file=sys.stderr)
            continue
        plt.imshow(region_process_mask)
        plt.show()
总结
提示:本文章适用于全扫描病理组织图片(如svs,tif格式图片)。其他类型图片大同小异,可以自行修改细节。
完整项目代码分享在百度网盘:网盘
提取码:b5ic