Data Analysis Exercises

*Chapter 1*

import numpy as np # mean, variance, and related statistics
import pandas as pd # skewness and kurtosis
import matplotlib.pyplot as plt # plotting
import seaborn as sns # statistical visualization
from scipy import stats
from scipy.stats import shapiro # Shapiro-Wilk normality test
from scipy.stats import kstest # Kolmogorov-Smirnov test
from scipy.stats import chi2_contingency # chi-square test
from itertools import groupby # stem-and-leaf plot

a = np.array([74.3,78.8,68.8,78.0,70.4,80.5,80.5,69.7,71.2,73.5,
              79.5,75.6,75.0,78.8,72.0,72.0,72.0,74.3,71.2,72.0,
              75.0,73.5,78.8,74.3,75.8,65.0,74.3,71.2,69.7,68.0,
              73.5,75.0,72.0,64.3,75.8,80.3,69.7,74.3,73.5,73.5,
              75.8,75.8,68.8,76.5,70.4,71.2,81.2,75.0,70.4,68.0,
              70.4,72.0,76.5,74.3,76.5,77.6,67.3,72.0,75.0,74.3,
              73.5,79.5,73.5,74.7,65.0,76.5,81.6,75.4,72.7,72.7,
              67.2,76.5,72.7,70.4,77.2,68.8,67.5,67.5,67.3,72.7,
              75.8,73.5,75.0,73.5,73.5,73.5,72.7,81.6,70.3,74.3,
              73.5,79.5,70.4,76.5,72.7,77.2,84.3,75.0,76.5,70.4])

print(np.mean(a)) # mean
print(np.std(a)) # population standard deviation (ddof=0)
print(np.var(a)) # population variance
print(np.std(a)/np.mean(a)) # coefficient of variation
print(np.median(a)) # median
print(np.max(a)-np.min(a)) # range
s = pd.Series(a)
print(s.skew()) # skewness
print(s.kurt()) # kurtosis
print(np.percentile(a, (75,25,99,95,90,10,5,1), interpolation='midpoint')) # selected percentiles of a; e.g. 75 is the 0.75 quantile
# Interquartile range (IQR)
Q = np.percentile(a, (75,25), interpolation='midpoint')
print(Q[0]-Q[1])
# Trimean
M = np.median(a)
print(0.25*Q[1]+0.5*M+0.25*Q[0])
# Quartile-based estimate of the standard deviation
R = Q[0]-Q[1]
print(R/1.349) # for normal data, IQR/1.349 approximates the standard deviation
# Lower and upper fences (truncation points)
print(Q[1]-1.5*R)
print(Q[0]+1.5*R)
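A quick way to put the fences to use is to flag the observations falling outside them; a minimal sketch, assuming a, Q, and R are defined as above:

lower, upper = Q[1]-1.5*R, Q[0]+1.5*R
outliers = a[(a < lower) | (a > upper)] # boolean masking on the NumPy array
print(outliers)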

# Histogram with kernel density estimate
sns.distplot(a, color="r", bins=30, kde=True) # distplot is deprecated in newer seaborn; sns.histplot(a, bins=30, kde=True) is the modern equivalent
plt.show()

# Normal Q-Q plot
stats.probplot(a, dist="norm", plot=plt)
plt.show()

# Box plot
s = pd.Series(a)
df = pd.DataFrame(s) # data frame
df.columns = ['value'] # name the column
f = plt.boxplot(df['value'], vert=False, patch_artist=False, meanline=True, showmeans=True)
plt.show()

# Stem-and-leaf plot

nums2 = [25,45,50,54,55,61,64,68,72,75,75,78,79,81,83,84,84,84,85,86,86,86,87,89,89,89,90,91,91,92,100]
for k, g in groupby(sorted(nums2), key=lambda x: int(x) // 10): # stem = tens digit
    lst = map(str, [int(y) % 10 for y in g]) # leaves = units digits
    print(k, '|', ' '.join(lst))

# Shapiro-Wilk normality test (W test)
stat, p = shapiro(a)
print("stat:%f" % stat, "p value:%f" % p)
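The kstest and chi2_contingency imports above are otherwise unused. A minimal sketch of both, assuming the same sample a; the one-sample KS test is run against a normal distribution parameterized by the sample's own mean and standard deviation, and the 2x2 table is made-up illustration data:

# One-sample Kolmogorov-Smirnov test against N(mean(a), std(a))
stat, p = kstest(a, 'norm', args=(np.mean(a), np.std(a)))
print("KS stat:%f" % stat, "p value:%f" % p)
# Chi-square test of independence on a hypothetical 2x2 contingency table
table = np.array([[30, 10], [20, 40]]) # made-up counts, for illustration only
chi2, p, dof, expected = chi2_contingency(table)
print("chi2:%f" % chi2, "p value:%f" % p)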

*Chapter 2*

from scipy.stats import ranksums # Wilcoxon rank-sum test
from scipy.stats import mannwhitneyu # Mann-Whitney U test
from scipy.stats import ks_2samp # two-sample Kolmogorov-Smirnov test
from scipy.stats import wilcoxon # Wilcoxon signed-rank test
from scipy import stats # random variables and statistical functions
from scipy.stats import friedmanchisquare # Friedman test (rank-based ANOVA)
a = [23.1,57.6,10.5,23.6,11.9,54.6,21.0,20.3]
b = [22.7,53.2,9.7,19.6,13.8,47.1,13.6,23.6]
c = [22.5,53.7,10.8,21.1,13.7,39.2,13.7,16.3]
d = [22.6,53.1,8.3,21.6,13.3,37.0,14.8,14.8]
# Rank-sum test: checks whether the two samples differ significantly in location
print(ranksums(a, b))

# Mann-Whitney U test
print(mannwhitneyu(a, b, use_continuity=False, alternative='greater'))

# Two-sample KS test: checks whether the two samples come from the same distribution
print(ks_2samp(a, b))

# Wilcoxon signed-rank test
print(wilcoxon(a, b, zero_method='wilcox', correction=False)) # 'wilcox': discard all zero differences
print(wilcoxon(a, b, zero_method='zsplit')) # 'zsplit': split zero ranks between positive and negative

# Kruskal-Wallis test
print(stats.kruskal(a, b, c, d))

# Friedman test (rank-based ANOVA)
print(friedmanchisquare(a, b, c, d))
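Each of these tests returns a (statistic, p-value) pair, so the decision rule reads the same across them; a minimal sketch at the conventional 0.05 level:

alpha = 0.05
stat, p = friedmanchisquare(a, b, c, d)
print("reject H0: the samples differ" if p < alpha else "fail to reject H0")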

*Chapter 3*

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import statsmodels.api as sm # fitting statistical models, statistical tests, data exploration and visualization
from sklearn.linear_model import LinearRegression # linear regression
from sklearn import metrics

# Linear regression
x=[[274,2450],[180,3254],[375,3802],[205,2838],[86,2347],[265,3782],[98,3008],[330,2450],[195,2137],[53,2560],[430,4020],[372,4427],[236,2660],[157,2088],[370,2605]]
y=[162,120,223,131,67,169,81,192,116,55,252,232,144,103,212]
linreg = LinearRegression()
linreg.fit(x, y) # fit takes the data x and the labels y
print(linreg.intercept_, linreg.coef_) # intercept and coefficients of the fitted model
y_pred = linreg.predict(x) # predictions on the training data
print("y_pred", y_pred)
err = metrics.mean_squared_error(y, y_pred) # mean squared error (MSE)
print(err)
decision_score = linreg.score(x, y) # coefficient of determination R^2
print(decision_score)

# Obtain the full parameter estimates from an OLS regression model (the leading 1 in each x row is the intercept column)
x=[[1,274,2450],[1,180,3254],[1,375,3802],[1,205,2838],[1,86,2347],[1,265,3782],[1,98,3008],[1,330,2450],[1,195,2137],[1,53,2560],[1,430,4020],[1,372,4427],[1,236,2660],[1,157,2088],[1,370,2605]]
y=[162,120,223,131,67,169,81,192,116,55,252,232,144,103,212]
model=sm.OLS(y,x).fit()
print(sm.stats.linear_rainbow(model)) # Rainbow test for linearity: (statistic, p value)
print(model.summary())
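Beyond the printed summary, individual quantities can be read off the fitted results object; a minimal sketch using standard statsmodels results attributes:

print(model.params) # estimated coefficients (intercept first, given the column of ones)
print(model.pvalues) # t-test p-value for each coefficient
print(model.rsquared) # coefficient of determination R^2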

# Logit regression (again, the leading 1 in each x row is the intercept column)
x=[[1,2.5,0,0],[1,173.0,2,0],[1,119.0,2,0],[1,10.0,2,0],[1,502.2,2,0],[1,4.0,0,0],[1,14.4,0,1],[1,2.0,2,0],[1,40.0,2,0],[1,6.6,0,0],[1,21.4,2,1],[1,2.8,0,0],[1,2.5,0,0],[1,6.0,0,0],[1,3.5,0,1],[1,62.2,0,0],[1,10.8,0,1],[1,21.6,0,1],[1,2.0,0,1],[1,3.4,2,1],[1,5.1,0,1],[1,2.4,0,0],[1,1.7,0,1],[1,1.1,0,1],[1,12.8,0,1],[1,1.2,2,0],[1,3.5,0,0],[1,39.7,0,0],[1,62.4,0,0],[1,2.4,0,0],[1,34.7,0,0],[1,28.4,2,0],[1,0.9,0,1],[1,30.6,2,0],[1,5.8,0,1],[1,6.1,0,1],[1,2.7,2,1],[1,4.7,0,0],[1,128.0,2,1],[1,35.0,0,0],[1,2.0,0,0],[1,8.5,0,1],[1,2.0,2,1],[1,2.0,0,1],[1,4.3,0,1],[1,244.8,2,1],[1,4.0,0,1],[1,5.1,0,1],[1,32.0,0,1],[1,1.4,0,1]]
y=[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1]
print(x) # inspect the sample

model=sm.Logit(y,x).fit()
print(model.summary())
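For a fitted Logit model, model.predict(x) returns fitted probabilities rather than class labels; a minimal sketch thresholding them at 0.5:

probs = model.predict(x) # P(y=1) for each observation
labels = (probs >= 0.5).astype(int) # classify at the 0.5 threshold
print(labels)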

*Chapter 4*

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA # principal component analysis
from factor_analyzer import FactorAnalyzer # factor analysis

X=np.array([[92,77,80,95,99,126],[97,75,77,80,95,125],[95,80,70,78,89,120],
[75,75,73,88,98,110],[92,68,72,79,88,113],[90,85,80,70,78,103],
[72,93,75,77,80,100],[88,70,76,72,81,102],[64,70,69,85,93,105],
[70,73,70,87,84,100],[78,69,75,73,89,97],[78,72,71,68,75,96],
[75,64,63,76,73,92],[84,66,77,55,65,76],[70,64,51,60,67,88],
[58,72,75,62,52,75],[82,73,40,50,48,61],[45,65,42,47,43,60]])

# Principal component analysis
corrMat = np.corrcoef(X, rowvar=0) # correlation matrix (pairwise correlations between the variables)
print(corrMat)
corr_eigVals, corr_eigVects = np.linalg.eig(corrMat) # eigenvalues and eigenvectors
print(corr_eigVals)
print(corr_eigVects)

covMat = np.cov(X, rowvar=0) # covariance matrix
print(covMat)
cov_eigVals, cov_eigVects = np.linalg.eig(covMat) # eigenvalues and eigenvectors
print(cov_eigVals)
print(cov_eigVects)

eigValInd = np.argsort(cov_eigVals) # indices that sort the eigenvalues in ascending order
topNfeat = 9999999 # keep all components
eigValInd = eigValInd[:-(topNfeat+1):-1] # reverse to descending order
redEigVals = cov_eigVals[eigValInd]
redEigVects = cov_eigVects[:,eigValInd]
print(redEigVals)
print(redEigVects)
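The proportion of variance explained by each component follows directly from the sorted eigenvalues; a minimal sketch using redEigVals from above:

ratios = redEigVals / redEigVals.sum() # variance explained by each component
print(ratios)
print(np.cumsum(ratios)) # cumulative proportion, useful for choosing how many components to keep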

pca = PCA(n_components=0.9) # retain enough components to explain 90% of the variance
pca.fit(X)

print(pca.explained_variance_ratio_) # proportion of variance explained by each retained component
print(pca.explained_variance_) # the corresponding eigenvalues
print(pca.components_) # the corresponding eigenvectors (principal axes)
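To actually reduce the data, project it onto the retained components; a minimal sketch using the fitted pca:

scores = pca.transform(X) # principal component scores
print(scores.shape) # (n_samples, n_retained_components)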

# Factor analysis
fa = FactorAnalyzer(rotation=None, n_factors=2, method='principal')
fa.fit(X)
fa_sd = fa.get_factor_variance()
fa_df = pd.DataFrame({'eigenvalue': fa_sd[0], 'variance ratio': fa_sd[1], 'cumulative variance ratio': fa_sd[2]})
print(fa_df)
print(fa.get_communalities()) # communalities
print(fa.loadings_) # factor loading (component) matrix
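Rotation often makes the loadings easier to interpret; a minimal sketch with a varimax rotation, keeping the other settings unchanged:

fa_rot = FactorAnalyzer(rotation='varimax', n_factors=2, method='principal')
fa_rot.fit(X)
print(fa_rot.loadings_) # rotated factor loading matrix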

*Chapter 5*

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # linear discriminant analysis
from sklearn.naive_bayes import GaussianNB # Bayes (Gaussian naive Bayes) discriminant analysis

X_train= np.array([[52.89,25.47,15.46,6.66,5.30],[64.59,27.57,19.63,11.27,6.12],
[102.59,44.35,32,18.83,7.41],[139.12,63.48,44.44,17.47,13.73],
[160.69,73.32,54.15,18.40,14.82],[189.26,89.41,57.16,24.55,18.14],
[225.61,106.56,65.59,33.95,19.51],[268.25,114.22,87.90,36.06,30.07],
[302.36,136.02,96.86,38,31.48],[348.63,155.01,109.66,48.55,35.41],
[408.86,189.97,127.12,42.74,49.03],[438.6,189.9,151.92,47.28,49.5],
[543.85,272.8,168.06,52.89,50.1],[575.62,277.22,189.03,61.52,47.85],
[975.54,416.64,300.79,111.06,147.05],[1095.34,483.98,335.93,95.90,179.53],
[1334.91,609.69,389.14,112.5,223.58],[1688.5,779.94,483.36,134.4,290.80]])

y_train = np.array([1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2]) # 1 = group G1, 2 = group G2

X_test=np.array([[703.26,359.64,223.08,63.37,57.17],[816.22,398.6,269.85,69.99,77.78]])

# Linear discriminant analysis
LDA = LinearDiscriminantAnalysis(store_covariance=True)
LDA.fit(X_train, y_train)
print('mean:')
print(LDA.means_) # class means
print('cov:')
print(LDA.covariance_) # pooled within-class covariance matrix
print('decision:')
print(LDA.coef_) # coefficients of the decision function
print(LDA.intercept_) # intercept of the decision function
print('predict:')
print(LDA.score(X_train, y_train)) # training accuracy
print(LDA.predict(X_test))
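Posterior class probabilities for the test samples are also available from the fitted model; a minimal sketch:

print(LDA.predict_proba(X_test)) # posterior probability of each group per test sample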

# Bayes discriminant analysis
gnb = GaussianNB()
gnb.fit(X_train, y_train)

print('predict:') # predictions on the training set
print(gnb.predict(X_train))

print('Accuracy:') # training accuracy
print(gnb.score(X_train, y_train))

print('class_prior_') # prior probability of each class
print(gnb.class_prior_)

print('class_count') # number of training samples in each class
print(gnb.class_count_)
print('sigma')
print(gnb.sigma_) # variance of each feature within each class (renamed var_ in newer scikit-learn)
print('theta')
print(gnb.theta_) # mean of each feature within each class

print('predict_proba') # an n-by-k array: one row per sample, one column per class
print(gnb.predict_proba(X_train))
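The Bayes classifier can score the same two held-out samples used in the LDA section; a minimal sketch:

print(gnb.predict(X_test)) # predicted group for each test sample
print(gnb.predict_proba(X_test)) # posterior probabilities per group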

*Chapter 6*

import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram # hierarchical (agglomerative) clustering
from sklearn.cluster import KMeans # K-Means clustering
# linkage methods: single (nearest), complete (farthest), average, centroid, ward, median

data=np.array([[7.90,39.77,8.49,12.94,19.27,11.05,2.04,13.29],[7.68,50.37,11.35,13.30,19.25,14.59,2.75,14.87],
[9.42,27.93,8.20,8.14,16.17,9.42,1.55,9.76],[9.16,27.98,9.01,9.32,15.99,9.10,1.82,11.35],
[10.06,28.64,10.52,10.05,16.18,8.39,1.96,10.81]])
z = linkage(data, method='single', metric='euclidean') # single-linkage clustering on Euclidean distances
print(z)
p = dendrogram(z) # draw the dendrogram
plt.show()

# Dendrogram from a precomputed condensed distance matrix (metric argument omitted)
data1 = [2,2,7,6,6,6,6,7,9,9,1,5,4,6,6,6,7,8,9,6,5,6,5,5,6,8,9,5,9,9,9,10,8,9,7,7,7,8,9,9,2,1,5,10,9,1,3,10,9,4,10,9,10,9,8]
# the lower triangle of the distance matrix, entered column by column (condensed form)
z = linkage(data1, method='single') # condensed distances are used directly, so no metric is needed
print(z)
p = dendrogram(z) # draw the dendrogram
plt.show()

# K-Means clustering
K = 3
center = np.array([[4,3],[-3,2],[0,-1]]) # initial cluster centers
data =np.array([[0,5],[2,3],[2,5],[4,4],[4,3],[5,1],[6,2],[-4,3],[-3,2],[-3,0],[-5,2],[1,1],[0,-1],[0,-2],[-1,-1],[-1,-3]])

kc = KMeans(n_clusters=K, init=center, n_init=1).fit(data) # n_init=1 because explicit initial centers are supplied
print(kc.labels_) # cluster label of each point
print(kc.cluster_centers_) # final cluster centers
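A fitted KMeans model can also assign new observations to the nearest learned center; a minimal sketch, where the query points are made up for illustration:

new_points = np.array([[3, 3], [-2, 1]]) # hypothetical query points
print(kc.predict(new_points)) # index of the closest cluster center for each point
print(kc.inertia_) # within-cluster sum of squared distances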
