# Third-party imports plus process-wide plotting / warning configuration.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pylab import style
# Apply the 'ggplot' style sheet to all subsequent matplotlib figures.
style.use('ggplot')
# Use SimHei as the sans-serif font (presumably so CJK labels render) and
# draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
import warnings
# NOTE: silences every warning for the whole process, not just pandas ones.
warnings.filterwarnings("ignore")
# Turn off pandas' chained-assignment (SettingWithCopy) check.
pd.set_option('mode.chained_assignment', None)
def equal_interval_sampling(data, mnum):
    """Select rows whose ``SEL_MW`` values are closest to evenly spaced points.

    The ``SEL_MW`` range is cut into steps of ``(max - min) / mnum`` (rounded
    to one decimal).  For every split point the row with the nearest
    ``SEL_MW`` value is chosen; duplicates are dropped and the surviving rows
    are returned in their original order.

    Args:
        data: DataFrame containing a numeric ``SEL_MW`` column.  Any index is
            accepted; rows are addressed by position internally.
        mnum: Requested number of intervals; must be positive.

    Returns:
        A new DataFrame holding the selected rows, with a fresh 0..n-1 index.
        An empty ``data`` yields an empty DataFrame.

    Raises:
        ValueError: If ``mnum`` is not positive.
    """
    col = 'SEL_MW'
    if mnum <= 0:
        raise ValueError(f'mnum must be positive, got {mnum!r}')
    if len(data) == 0:
        return data.reset_index(drop=True)

    # Work on the target column only: reducing the whole frame just to read
    # one entry is wasteful and can fail on unrelated non-numeric columns.
    values = data[col].to_numpy(dtype=float, na_value=np.nan)
    low, high = data[col].min(), data[col].max()

    raw_step = (high - low) / mnum
    astep = np.round(raw_step, 1)
    if astep > 0:
        age_bins = np.round(np.arange(low, high + 1, astep), 3)
    else:
        # Rounding to one decimal collapsed the step to zero (narrow value
        # range or large mnum); np.arange cannot run with a zero step, so
        # fall back to mnum exact equal-width intervals over [low, high].
        astep = raw_step
        age_bins = np.unique(np.linspace(low, high, int(mnum) + 1))
    print(f'步长:{astep}\n区间个数:{len(age_bins) - 1}')

    # Positional index of the nearest sample for each split point.  Using
    # positions (not index labels) keeps the later ``iloc`` lookup correct
    # for any DataFrame index; nanargmin skips missing values.
    ind_list = [int(np.nanargmin(np.abs(values - val))) for val in age_bins]
    print('排序前样本数目:', len(ind_list))

    # Several split points may share the same nearest row: de-duplicate and
    # restore the original row order.
    end_row = sorted(set(ind_list))
    print('排序后样本数目:', len(end_row))

    return data.iloc[end_row, :].reset_index(drop=True)
# Driver: load the raw samples from CSV and reduce them via equal-interval
# sampling.
# NOTE(review): the sampler is taken from the project-local
# `Memorymatrix.IntervalSampl` module, not from any same-named function
# defined in this file -- confirm the two implementations agree.
from Memorymatrix import IntervalSampl
# Value passed as the `mnum` argument of the sampler.
mnum = 2000
# The CSV is decoded as GBK; the path is relative to the working directory.
cdata = pd.read_csv(r"a.csv", encoding='gbk')
sdata =IntervalSampl.equal_interval_sampling(cdata, mnum)
print(sdata.shape)
# Bare expression: displays the first rows only in an interactive/notebook
# session; it has no visible effect when run as a plain script.
sdata.head(3)