# 使用Python估计数据概率分布函数

Python学习 同时被 2 个专栏收录
11 篇文章 0 订阅
1 篇文章 0 订阅

[1.4000000e+01，3.2000000e+01，7.8000000e+01，1.1600000e+02，1.8800000e+02，2.2200000e+02，2.6300000e+02，3.1200000e+02，3.2200000e+02，4.2100000e+02，4.8800000e+02，5.0400000e+02，5.1900000e+02，5.7200000e+02，7.2600000e+02，9.4500000e+02，1.0100000e+03，1.0650000e+03，1.1900000e+03]

## 第一步，读取脉冲位置并计算脉冲间时间间隔

def read_diff(path='data/peaks.csv'):
    """Read pulse positions and return the gaps between consecutive pulses.

    The input is parsed as a whitespace-delimited table whose three columns
    are named ``pks``, ``locs`` and ``None``; only ``locs`` is used.

    Args:
        path: File path or file-like object handed to ``pandas.read_table``.
            Defaults to the originally hard-coded ``'data/peaks.csv'``.

    Returns:
        list: ``locs[i + 1] - locs[i]`` for every pair of consecutive rows.
        The list is empty when the table has a single row.
    """
    # Function-scope import: pandas is not imported in the visible part of
    # this file.
    import pandas as pd

    # sep=r'\s+' is the supported spelling of the deprecated
    # delim_whitespace=True option.
    d = pd.read_table(path, names=['pks', 'locs', 'None'], sep=r'\s+')
    locs = d['locs'].to_numpy()
    # Vectorised consecutive difference instead of a manual index loop.
    return list(locs[1:] - locs[:-1])

## 第二步，对间隔频率进行统计

matplotlib.pyplot.hist(x, bins=10, range=None, normed=False, weights=None, cumulative=False, bottom=None, histtype='bar', align='mid', orientation='vertical', rwidth=None, log=False, color=None, label=None, stacked=False, hold=None, data=None, **kwargs)

range : tuple or None, optional

The lower and upper range of the bins. Lower and upper outliers are ignored. If not provided, range is (x.min(), x.max()). Range has no effect if bins is a sequence.

If bins is a sequence or range is specified, autoscaling is based on the specified bin range instead of the range of x.

Default is None

import matplotlib.pylab as plt
# Intervals produced by step one; `dif` is not defined anywhere before this line.
dif = read_diff()
# `bin` is not defined at module level (the name would resolve to the builtin
# function), so pass an explicit bin count; 50 matches diff_time's default.
cnt = plt.hist(dif, bins=50)

## 第三步，对已得的频率分布做概率密度估计

### Plan A

import numpy as np
from scipy.optimize import leastsq
def func_n(x, a, b, c):
    """Evaluate the quadratic ``a * x**2 + b * x + c``.

    Args:
        x: Scalar or NumPy array at which the polynomial is evaluated.
        a: Coefficient of the squared term.
        b: Coefficient of the linear term.
        c: Constant term.

    Returns:
        The polynomial value, element-wise when ``x`` is an array.
    """
    return a * np.square(x) + b * x + c

def residuals(p, x, y, reg):
    """Residual vector of the quadratic model, for ``scipy.optimize.leastsq``.

    Args:
        p: The three coefficients ``(a, b, c)`` handed on to ``func_n``.
        x: Sample positions.
        y: Observed values at ``x``.
        reg: When equal to 1, regularisation terms are appended to the result.

    Returns:
        ``y - func_n(x, a, b, c)``. With ``reg == 1`` the vector is extended
        by ``sqrt(lambda) * p``, so a sum of squares over the result includes
        the penalty ``lambda * sum(p**2)``.
    """
    regularization = 0.1  # regularisation coefficient lambda
    # func_n takes the coefficients as separate arguments, so unpack p;
    # passing p as a single argument raises a TypeError.
    ret = y - func_n(x, *p)
    if reg == 1:
        ret = np.append(ret, np.sqrt(regularization) * p)
    return ret

def diff_time(bin=50, reg=1):
    """Histogram the inter-pulse intervals and overlay a least-squares quadratic.

    Args:
        bin: Passed to ``plt.hist`` as ``bins``. The name shadows the builtin
            but is kept because it is part of the call signature.
        reg: Forwarded to ``residuals``; 1 switches the regularisation on.
    """
    # Reuse the step-one reader instead of repeating its parsing loop here.
    dif = read_diff()
    plt.figure()
    counts, edges = plt.hist(dif, bins=bin)[:2]
    # plt.hist returns one more edge than counts; fit on the right-hand
    # edges (as kde_sim does) so that x and y have the same length.
    x = edges[1:]
    y = counts
    try:
        r = leastsq(residuals, [1, 1, 1], args=(x, y, reg))
    except (TypeError, ValueError):
        # Without a solution there is nothing to draw, so stop here instead
        # of failing later on the unbound result.
        print("Error - curve_fit failed")
        return
    # r[0] holds the fitted (a, b, c); func_n takes them as separate arguments.
    y2 = func_n(edges, *r[0])
    plt.plot(edges, y2)

### Plan B

[curve_fit]

def kde_sim(bins=150):
    """Fit ``func`` to the interval histogram with curve_fit and plot the fit.

    NOTE(review): ``func`` and ``dif`` are not defined in the visible part of
    this file; they are assumed to be module-level names - confirm.

    Args:
        bins: Passed to ``plt.hist`` as ``bins``.

    Returns:
        The optimal parameters returned by ``curve_fit``.
    """
    # Only leastsq is imported from scipy.optimize in the visible file, so
    # bring curve_fit into scope locally.
    from scipy.optimize import curve_fit

    plt.figure()
    cnt = plt.hist(dif, bins=bins)
    x = cnt[1]
    y = cnt[0]
    # There is one more bin edge than counts, so the first edge is dropped
    # to give the fit inputs of equal length.
    popt, _ = curve_fit(func, x[1:], y)
    y2 = [func(i, popt[0], popt[1], popt[2]) for i in x]
    # The fitted curve is drawn on a fresh figure, separate from the histogram.
    plt.figure()
    plt.plot(x, y2, 'r--')
    return popt

### Plan C

# Plot `data` with seaborn's distplot, leaving every option at its default.
sns.distplot(data)

from scipy import stats
# Turn the KDE curve off and pass scipy.stats.expon as the distribution to fit.
sns.distplot(d,kde=False, fit=stats.expon)

## 第四步 得到其概率密度函数

scipy.stats

Rvs:随机变量
Pdf:概率密度函数
Cdf:累积分布函数
Sf:生存函数（1 - cdf）
Ppf:百分点函数（cdf 的反函数）
Isf:逆生存函数（sf 的反函数）

### Plan B 继续使用Seaborn模块

statsmodels和scipy.stats.gaussian_kde

distplot里关于拟合的代码如下

    if fit is not None:
fit_color = fit_kws.pop("color", "#282828")
gridsize = fit_kws.pop("gridsize", 200)
cut = fit_kws.pop("cut", 3)
clip = fit_kws.pop("clip", (-np.inf, np.inf))
bw = stats.gaussian_kde(a).scotts_factor() * a.std(ddof=1)
x = _kde_support(a, bw, gridsize, cut, clip)
params = fit.fit(a)  # distributions in scipy.stats provide a fit method
pdf = lambda x: fit.pdf(x, *params)   # build the probability density function from the fitted parameters
y = pdf(x) # evaluate the pdf to obtain the y vector
if vertical:
x, y = y, x
ax.plot(x, y, color=fit_color, **fit_kws)
if fit_color != "#282828":
fit_kws["color"] = fit_color

Methods

cdf()   Returns the cumulative distribution function evaluated at the support.
cumhazard() Returns the cumulative hazard function evaluated at the support.
entropy()   Returns the differential entropy evaluated at the support
evaluate(point) Evaluate density at a single point.
fit([kernel, bw, fft, weights, gridsize, ...])  Attach the density estimate to the KDEUnivariate class.
icdf()  Inverse Cumulative Distribution (Quantile) Function
sf()    Returns the survival function evaluated at the support.

Attributes

dataset (ndarray) The dataset with which gaussian_kde was initialized.
d   (int) Number of dimensions.
n   (int) Number of datapoints.
factor  (float) The bandwidth factor, obtained from kde.covariance_factor, with which the covariance matrix is multiplied.
covariance  (ndarray) The covariance matrix of dataset, scaled by the calculated bandwidth (kde.factor).
inv_cov (ndarray) The inverse of covariance.

Methods
kde.evaluate(points)    (ndarray) Evaluate the estimated pdf on a provided set of points.
kde(points) (ndarray) Same as kde.evaluate(points)
kde.integrate_gaussian(mean, cov)   (float) Multiply pdf with a specified Gaussian and integrate over the whole domain.
kde.integrate_box_1d(low, high) (float) Integrate pdf (1D only) between two bounds.
kde.integrate_box(low_bounds, high_bounds)  (float) Integrate pdf over a rectangular space between low_bounds and high_bounds.
kde.integrate_kde(other_kde)    (float) Integrate two kernel density estimates multiplied together.
kde.resample(size=None) (ndarray) Randomly sample a dataset from the estimated pdf.
kde.set_bandwidth(bw_method='scott')    (None) Computes the bandwidth, i.e. the coefficient that multiplies the data covariance matrix to obtain the kernel covariance matrix. .. versionadded:: 0.11.0
kde.covariance_factor   (float) Computes the coefficient (kde.factor) that multiplies the data covariance matrix to obtain the kernel covariance matrix. The default is scotts_factor. A subclass can overwrite this method to provide a different method, or set it through a call to kde.set_bandwidth.

# Histogram with 150 bins, with the kernel density estimate explicitly enabled.
sns.distplot(data, bins=150, kde=True)

from scipy import stats
# `fit` is a distplot parameter (see the distplot excerpt quoted earlier in
# this article); displot does not accept it, so call distplot as the other
# examples do.
sns.distplot(data, bins=150, kde=False, fit=stats.expon)

fit后使用较少数量的样本值同样也是

12-03 1万+
08-09 2111
10-11 6314
02-04 1128
02-04 1840

CaspianR

¥2 ¥4 ¥6 ¥10 ¥20

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、C币套餐、付费专栏及课程。