def bubble_plot(df, x, y, z_boolean=None, ordered_x_values=None, ordered_y_values=None, bins_x=10,
                bins_y=10, fontsize=16, figsize=(10, 5), maximal_bubble_size=4000,
                normalization_by_all=False, log=False):
    """
    Draw a bubble plot of the co-occurrence counts of two dataframe columns.

    A new matplotlib figure is opened, the bubbles are drawn by ``plot_with_z``
    (when ``z_boolean`` is given) or ``plot_without_z`` (otherwise), and the
    ticks, axis labels and title are then set on that figure.

    :param df: dataframe
    :param x: name of first numerical/categorical field (string) (for x-axis)
    :param y: name of second numerical/categorical field (string) (for y-axis)
    :param z_boolean: name of categorical field with two categories / boolean field (for coloring)
    :param ordered_x_values: the values we would like to map from the x categorical variable,
        in the order we would like to present them. Passing this disables binning of x,
        even if the column is numeric.
    :param ordered_y_values: the values we would like to map from the y categorical variable,
        in the order we would like to present them. Passing this disables binning of y,
        even if the column is numeric.
    :param bins_x: the bins for x values if x is numeric (forwarded to ``pd.cut`` as ``bins``)
    :param bins_y: the bins for y values if y is numeric (forwarded to ``pd.cut`` as ``bins``)
    :param fontsize: font size of ticks and axis labels; the title uses ``fontsize + 4``
    :param figsize: size of the newly created figure
    :param maximal_bubble_size: if the bubbles are too big or too small this is the parameter
        you should change!
    :param normalization_by_all: forwarded to the plotting helper. Intended meaning:
        True - shows joint distribution p(x,y), False - shows conditional distribution p(y|x)
    :param log: whether to apply log on the count (influences the size of the bubbles).
        Only forwarded to ``plot_without_z``, i.e. it has no effect when ``z_boolean`` is given.
    :return: None. The plot is drawn on a new matplotlib figure; bubble size is
        proportional to the frequency of the bucket :)
    """
    plt.figure(figsize=figsize)

    # A column is binned only when it holds integers/floats AND the caller did
    # not supply an explicit category order for it.
    x_is_numeric = _is_binnable_numeric(df[x]) and ordered_x_values is None
    y_is_numeric = _is_binnable_numeric(df[y]) and ordered_y_values is None

    # Replace numeric columns by their interval bins, keep categorical ones as is.
    x_values = pd.cut(df[x], bins=bins_x) if x_is_numeric else df[x]
    y_values = pd.cut(df[y], bins=bins_y) if y_is_numeric else df[y]
    count_table = pd.concat([x_values, y_values], axis=1)

    # Wide table of counts: one row per x bucket, one column per y bucket.
    count_table = count_table.groupby(x)[y].value_counts().unstack().fillna(0)

    # Default presentation order is whatever order the count table produced.
    if ordered_x_values is None:
        ordered_x_values = count_table.index.values
    if ordered_y_values is None:
        ordered_y_values = count_table.columns

    # The helpers also return a long-format count table as their first value;
    # it is not needed here.
    if z_boolean is not None:
        _, xticks, yticks, xticklabels, yticklabels = plot_with_z(
            df, x, y, z_boolean, bins_x, bins_y, x_is_numeric, y_is_numeric,
            ordered_x_values, ordered_y_values, maximal_bubble_size,
            normalization_by_all=normalization_by_all)
    else:
        _, xticks, yticks, xticklabels, yticklabels = plot_without_z(
            df, x, y, z_boolean, count_table, bins_x, bins_y, x_is_numeric, y_is_numeric,
            ordered_x_values, ordered_y_values,
            normalization_by_all=normalization_by_all, log=log,
            maximal_bubble_size=maximal_bubble_size)

    plt.xticks(xticks, xticklabels, fontsize=fontsize)
    plt.yticks(yticks, yticklabels, fontsize=fontsize)
    plt.xlabel(x, fontsize=fontsize)
    plt.ylabel(y, fontsize=fontsize)

    if z_boolean is None:
        title = "{} vs {} ".format(y, x)
    else:
        title = "{} vs {} and {} (in colors)".format(y, x, z_boolean)
    plt.title(title, fontsize=fontsize + 4)


def _is_binnable_numeric(series):
    """Return True if *series* has an integer or floating dtype of any width.

    Comparing ``series.dtype`` against the builtins ``int``/``float`` only
    matches the platform default 64-bit dtypes, so e.g. ``float32`` or
    ``int32`` columns would wrongly be treated as categorical. Boolean and
    object columns are deliberately not considered numeric.
    """
    return bool(pd.api.types.is_integer_dtype(series)
                or pd.api.types.is_float_dtype(series))