# 基于分位数分箱 (quantile-based binning)
import pandas as pd
import numpy as np
# Build a sample of 100 random integers drawn from [1, 100) and append its
# first 20 entries again so the combined series is certain to contain repeats.
data = pd.Series(np.random.randint(1, 100, size=100))
data_with_duplicates = pd.concat([data, data[:20]])

# Quantile edges are computed on the de-duplicated values so that repeated
# samples do not pull the quartiles toward frequently occurring numbers.
data_unique = data_with_duplicates.drop_duplicates()
quantiles = data_unique.quantile([0.25, 0.5, 0.75])
bins = [
    data_unique.min(),
    quantiles[0.25],
    quantiles[0.5],
    quantiles[0.75],
    data_unique.max(),
]
labels = ['Bin_1', 'Bin_2', 'Bin_3', 'Bin_4']

# pd.cut builds right-closed, left-open intervals by default. Because the
# first edge is the minimum itself, the smallest sample would fall outside
# every interval and be labelled NaN; include_lowest=True closes the first
# interval on the left so the minimum lands in Bin_1.
data_binned_unique = pd.cut(
    data_unique,
    bins=bins,
    labels=labels,
    include_lowest=True,
)

print(f"原始有重复值数据样本数量:{len(data_with_duplicates)}")
print(f"去除重复值后分箱数据样本数量:{len(data_binned_unique)}")
# 等宽分箱 (equal-width binning)
# Split the full value range (duplicates included) into four bins of equal
# width: five evenly spaced edges from the minimum to the maximum.
min_value = data_with_duplicates.min()
max_value = data_with_duplicates.max()
bin_width = (max_value - min_value) / 4
bins = [min_value + i * bin_width for i in range(5)]
labels = ['Bin_1', 'Bin_2', 'Bin_3', 'Bin_4']

# The first edge equals the minimum, and pd.cut intervals are left-open by
# default, so without include_lowest=True every occurrence of the minimum
# would be assigned NaN instead of Bin_1.
data_binned_equal_width = pd.cut(
    data_with_duplicates,
    bins=bins,
    labels=labels,
    include_lowest=True,
)
print(f"等宽分箱后数据样本数量:{len(data_binned_equal_width)}")
# 可视化分箱结果 (visualize the binning results)
import matplotlib.pyplot as plt
# Draw one bar chart per binning result showing how many samples fall into
# each bin. The binned series hold categorical labels rather than numbers,
# so the per-label counts are computed explicitly and drawn with plt.bar
# instead of being passed to plt.hist, which expects numeric data.
for binned, title in (
    (data_binned_unique, 'Distribution after Binning (Unique Values)'),
    (data_binned_equal_width, 'Distribution after Equal Width Binning'),
):
    # value_counts() skips NaN entries; sort_index() restores the bin order
    # (Bin_1 .. Bin_4) because value_counts() orders by frequency by default.
    counts = binned.value_counts().sort_index()
    plt.bar(counts.index.astype(str), counts.to_numpy(), edgecolor='black')
    plt.title(title)
    plt.xlabel('Bins')
    plt.ylabel('Frequency')
    plt.show()