对类别型数据进行可视化操作:
def categoryVisualizationFunc(featurename):
    """Draw a bar chart of how often each value of a categorical feature occurs.

    Reads the column ``featurename`` from the module-level ``train`` DataFrame,
    counts the rows per distinct value, prints the number of distinct values,
    and shows a bar chart with the count written above every bar.

    Args:
        featurename: Column name (a string, since it is concatenated into the
            title and the printed message). The original note says the name is
            picked from ``index_category_arr``, which is defined outside this
            block -- confirm against the caller.

    Note:
        The axis limits are computed as ``min - 0.5`` / ``max + 0.5`` of the
        category values, so the values themselves must be numeric.
    """
    numdf = pd.DataFrame(train[featurename])
    # Group once and reuse the result for both the counts and the category values.
    counts = numdf.groupby(featurename).size()
    numfq = counts.tolist()
    categories = counts.index
    numarr = categories.tolist()

    plt.xlabel('value', fontsize=16)
    plt.ylabel('count', fontsize=16)
    plt.title('category feature ' + featurename, fontsize=16)
    plt.xticks(numarr, numarr)
    # Report how many distinct values exist, i.e. how many bars the chart has.
    print("There exist " + str(len(numarr)) + " kinds of values in " + featurename + ".")

    # Pass the bar positions positionally: the keyword was called ``left`` in
    # old matplotlib releases and ``x`` in current ones, so a positional
    # argument is the only spelling that works on both.
    rect = plt.bar(numarr, height=numfq, width=0.6, align="center", alpha=0.5, color='g')

    # Write each bar's count slightly above its top.
    for rect_data in rect:
        height = rect_data.get_height()
        plt.text(rect_data.get_x() + rect_data.get_width() / 2.0, 1.03 * height,
                 '%s' % int(height), fontsize=12)

    # Half a unit of horizontal padding on each side; the y-axis upper limit
    # leaves headroom above the tallest bar for its label.
    plt.axis([categories.min() - 0.5, categories.max() + 0.5,
              0, (counts.max() / 1000 + 1.5) * 1000])
    plt.tick_params(axis='both', which='major', labelsize=12)
    plt.show()
对数值型数据进行可视化操作:
def numericVisualizationFunc(featurename, bins=30):
    """Show a histogram of a numeric feature and print its value range.

    Reads the column ``featurename`` from the module-level ``train`` DataFrame,
    discards NaN entries, prints the maximum and minimum of what remains, and
    displays a histogram of those values.

    Args:
        featurename: Column name (a string, since it is concatenated into the
            title and the printed messages).
        bins: Number of histogram bars, forwarded to ``plt.hist``. This
            function defaults to 30.
    """
    column = train[featurename].values
    # Drop NaN entries first; the original author notes that the code
    # errors out if they are left in.
    nan_mask = np.isnan(column)
    samples = column[np.logical_not(nan_mask)]

    largest = str(samples.max())
    print("The max value of " + featurename + " is: " + largest)
    smallest = str(samples.min())
    print("The min value of " + featurename + " is: " + smallest)

    plt.hist(samples, bins=bins, color='green', alpha=0.5)
    plt.title('numeric feature ' + featurename, fontsize=16)
    plt.xlabel('value', fontsize=16)
    plt.ylabel('count', fontsize=16)
    plt.tick_params(axis='both', which='major', labelsize=12)
    plt.show()