'''麦穗数据集分析'''
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the annotation table. Each row is expected to describe one box through
# the columns image_id, x, y, w, h (top-left corner plus box size).
train_data_label = pd.read_csv('./data/wheatData/train_data.csv')
# Preview the first rows. A bare expression is discarded when this runs as a
# script, so the preview is printed explicitly.
print(train_data_label.head())
# Report the number of annotation rows.
print(f'数据集大小:{len(train_data_label)}')

# Distribution of the number of annotation rows (boxes) per image.
counts = train_data_label['image_id'].value_counts()
sns.displot(counts, kde=True, color="g")  # histogram with a KDE overlay
plt.title('数据集框数量')
plt.show()

# 2-D distribution of the boxes' top-left corner.
sns.histplot(data=train_data_label, x='x', y='y', bins=50, pmax=0.9)
plt.xlabel('x')
plt.ylabel('y')
plt.show()

# Box centre = top-left corner + half of the BOX size. The box size lives in
# the 'w'/'h' columns, which is what every other statement in this script uses.
train_data_label['cx'] = train_data_label['x'] + train_data_label['w'] / 2
train_data_label['cy'] = train_data_label['y'] + train_data_label['h'] / 2
# 2-D distribution of the box centres.
sns.histplot(data=train_data_label, x='cx', y='cy', bins=50, pmax=0.9)
plt.xlabel('cx')
plt.ylabel('cy')
plt.show()

# Joint distribution of box width and height (shows the aspect-ratio spread).
sns.histplot(data=train_data_label, x='w', y='h', bins=50, pmax=0.9)
plt.xlabel('w')
plt.ylabel('h')
plt.show()

# Distribution of box areas; useful when choosing anchor scales.
# NOTE: 'aeras' is a misspelling of "areas"; the name is kept because later
# code may reference it.
aeras = train_data_label['w'] * train_data_label['h']
# Series.min()/max() are vectorised, unlike the builtins which loop in Python.
print("目标框最小面积为{}最大面积为{}".format(aeras.min(), aeras.max()))
sns.histplot(aeras, bins=50, kde=False)
plt.show()
def show_images(imgs, num_rows=1, num_cols=2):
    """Draw images on a grid of subplots and return the created axes.

    Args:
        imgs: Sequence of images in a form accepted by ``imshow``.
        num_rows: Number of rows in the subplot grid.
        num_cols: Number of columns in the subplot grid.

    Returns:
        A list with one axes object per displayed image, in input order.
        When fewer images than grid cells are given, the remaining cells are
        left empty; images beyond the grid capacity are ignored.
    """
    fig = plt.figure()
    axes = []
    # Slicing caps the loop at the grid capacity and also tolerates a short
    # ``imgs`` instead of raising IndexError.
    for position, img in enumerate(imgs[:num_rows * num_cols], start=1):
        ax = fig.add_subplot(num_rows, num_cols, position)
        # Draw on the subplot just created rather than on pyplot's implicit
        # "current axes".
        ax.imshow(img)
        ax.axis('off')
        axes.append(ax)
    return axes
def show_bboxes(ax, bboxes, labels=None, colors=None):
    """Draw bounding boxes, and optionally their labels, on an axes.

    Args:
        ax: Axes object to draw on.
        bboxes: Iterable of boxes in corner form ``(x1, y1, x2, y2)``. A box
            may carry a class index as a fifth element, which is used to look
            up its label.
        labels: Optional sequence of label strings indexed by class index.
            Boxes without a fifth element are drawn without a label.
        colors: Optional sequence of edge colours; only the first entry is
            used. Defaults to red.
    """
    if colors is None or len(colors) == 0:
        colors = ['r']
    edge_color = colors[0]
    for bbox in bboxes:
        # Take only the coordinates so a trailing class index does not break
        # the unpacking.
        x1, y1, x2, y2 = bbox[:4]
        rect = plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False,
                             edgecolor=edge_color, linewidth=1)
        ax.add_patch(rect)
        if labels is not None and len(bbox) > 4:
            label = labels[int(bbox[4])]
            # Write on the target axes, not on pyplot's current axes, so the
            # text lands on the right subplot.
            ax.text(x1 + 5, y1 + 15, label, color='w', fontsize=16)
# Preview a few images together with their annotated boxes.
# Grid layout of the preview figure.
num_rows, num_cols = 1, 2
# Pick one image id per grid cell, starting at offset 100 in the unique ids.
ids = train_data_label['image_id'].unique()[100:100 + num_rows * num_cols]
train_data_dir = './data/wheatData/train'
# Load the selected images from disk.
imgs = [plt.imread(f'{train_data_dir}/{n}.jpg') for n in ids]
# Show the images; one axes object is returned per image.
axes = show_images(imgs, num_rows, num_cols)
# Overlay each image's boxes on its own axes.
for ax, image_id in zip(axes, ids):  # 'image_id' avoids shadowing builtin id()
    datas = train_data_label[train_data_label['image_id'] == image_id]
    # Convert (x, y, w, h) to corner form (x1, y1, x2, y2) column-wise instead
    # of iterating row by row.
    left, top = datas['x'], datas['y']
    bboxes = list(zip(left, top, left + datas['w'], top + datas['h']))
    show_bboxes(ax, bboxes, labels=None, colors=['r'])
plt.show()
# Plot results: