多因素分析时由于自变量较多导致分析过程复杂,分析过程复杂且容易逻辑混乱,现将主要思路总结如下
主要思路
一、独立性分析
1、分析各自变量直接是否独立,如果独立则分析较为简单,如果不独立则较为困难。
2、分析各自变量和因变量之间是否有关系,无关系则无需分析,减少要分析的自变量。
3、主要方法为各类检验,如卡方检验、F检验、T检验等。以及相关性检验。
二、转换为单变量分析
将其他变量固定,每次单分析一个变量。
三、分析主要变量
对变量中的主要类别进行分析,次要类别先不分析,一般是分析样本量里占比高的类别。
四、聚类分析
将变量聚类,然后按聚好的类,分类分析。聚类可以为
无、其他算法
1、关联分析。
2、机器学习算法。
相关Python 工具
一、数据清洗
1、选择关键列
2、按一定的规则对某些列归类,实际就是多重if 和elif
import pandas as pd
from pandasrw import load,dump
# 根据规则填充数据
def fill_data_category(row):
if row['无线协议'] == 'Wi-Fi 5' and row['带宽'] == 80:
return 'C1'
elif row['无线协议'] == 'Wi-Fi 6' and row['频段'] == '2.4G' and row['带宽'] == 20:
return 'C2'
elif row['无线协议'] == 'Wi-Fi 6' and row['频段'] == '5G' and row['带宽'] == 80:
return 'C3'
else:
return '其他'
# 根据规则填充天线数数据
def fill_data_antenna(row):
if row['无线协议'] == 'Wi-Fi 5' and row['天线数'] == 2:
return 2
elif row['无线协议'] == 'Wi-Fi 5' and row['天线数'] == 4:
return 2
elif row['无线协议'] == 'Wi-Fi 5' and row['天线数'] == 5:
return 2
elif row['无线协议'] == 'Wi-Fi 5' and row['天线数'] == 6:
return 3
elif row['无线协议'] == 'Wi-Fi 6' and row['天线数'] == 2:
return 2
elif row['无线协议'] == 'Wi-Fi 6' and row['天线数'] == 4:
return 2
elif row['无线协议'] == 'Wi-Fi 6' and row['天线数'] == 5:
return 3
elif row['无线协议'] == 'Wi-Fi 6' and row['天线数'] == 6:
return 3
else:
# 如果条件都不满足,则返回空格 ""。
return ""
if __name__ == '__main__':
path_s = r"D:\data\2023\wifi 测试\WIFI测试跟踪表--20230611-核准-V2.xlsx"
path_t = r"D:\data\2023\wifi 测试\归类.csv"
df_s = load(path_s,sheetname="测试问题跟踪表")
df = df_s
key_col=['墙体类别', '测试终端类型及型号', "天线数",'测试软件名称', '路由器索引', '无线协议', '频段', '电平', '带宽',
'下行', '上行', '测试场景']
df = df[key_col]
df['类型'] = df.apply(fill_data_category, axis=1)
df = df[df['类型'] != "其他"]
df['等效天线数'] = df.apply(fill_data_antenna, axis=1)
dump(df,path_t)
二、单变量分析
import pandas as pd
from pandasrw import load,dump
from functools import reduce
import time
"""
df:要分析的数据
essential_list:要因所在的列,即因变量,是多个列名的列表
analysis_col:本次要分析的列,即每次分析的列,是列名
target_col:目标列,要汇总的列,是列名
"""
def df_filter(df, essential_list, analysis_col, target_col):
clst = [i for i in essential_list if i != analysis_col]
# 将列表中包含的列的内容作为字符串合并成新列group_col
df["group_col"] = df[clst].astype(str).apply(lambda x: '/'.join(x), axis=1)
name_list = df[analysis_col].unique().tolist()
result_list = []
for name in name_list:
data = df[df[analysis_col] == name]
# 对新列groupby计算目标列的均值、最大值、最小值和方差
result = data.groupby("group_col")[target_col].agg(['mean', 'max', 'min', 'var'])
result = result.rename(
columns={'mean': f"mean:{name}", 'max': f"max:{name}", 'min': f"min:{name}", 'var': f"var:{name}"})
result_list.append(result)
#分列和改变列的顺序
pf = reduce(df_merge, result_list)
pf.reset_index(inplace=True)
df_m = df['group_col'].str.split('/', expand=True)
df_m.columns = essential_list
df_t = pd.concat([df_m, df], axis=1)
#将"group_col"放到第一列
df_t.insert(0, "group_col", df_t.pop("group_col"))
return pf,df_t
#只返回平均值
def df_filter_mean(df, essential_list, analysis_col, target_col):
clst = [i for i in essential_list if i != analysis_col]
# 将列表中包含的列的内容作为字符串合并成新列group_col
df["group_col"] = df[clst].astype(str).apply(lambda x: '/'.join(x), axis=1)
name_list = df[analysis_col].unique().tolist()
result_list = []
for name in name_list:
data = df[df[analysis_col] == name]
# 对新列groupby计算目标列的均值、最大值、最小值和方差
result = data.groupby("group_col")[target_col].agg(['mean'])
result = result.rename(
columns={'mean': f"mean:{name}"})
result_list.append(result)
# 分列和改变列的顺序
pf = reduce(df_merge, result_list)
pf.reset_index(inplace=True)
df_m = df['group_col'].str.split('/', expand=True)
df_m.columns = essential_list
df_t = pd.concat([df_m, df], axis=1)
# 将"group_col"放到第一列
df_t.insert(0, "group_col", df_t.pop("group_col"))
return pf, df_t
#合并各类分析的结果,how用outer,取并集
def df_merge(df1, df2):
df = pd.merge(df1, df2, on="group_col",how="outer")
return df
if __name__ == '__main__':
path_s = r"D:\data\2023\wifi 测试\WIFI测试分析表.csv"
path_t = r"D:\data\2023\wifi 测试\WIFI测试分析表123.xlsx"
#path_t =f"path_t{time.time()}"
print(path_t)
df = load(path_s, engine="pandas")
essential_list=['路由器型号','测试场景']
analysis_col = "测试软件名称"
target_col = "下行"
pf=df_filter(df, essential_list, analysis_col, target_col)[0]
dump(pf,path_t,engine="pandas")
三、相关性检验
1、卡方检验
import pandas as pd
import numpy as np
from pandasrw import load,dump
from functools import reduce
import time
from scipy.stats import chi2_contingency
#x,y为单列的pandas Series 或 1d numpy array
def chi2_r(x,y):
# 将分类变量和连续变量进行交叉制表
obs_table = pd.crosstab(x, y)
# 进行卡方检验并获取卡方值和p值
chi2, p, dof, ex = chi2_contingency(obs_table)
return chi2, p
#x,y为单列的pandas Series 或 1d numpy array
def cramers_v(x, y):
obs_table = pd.crosstab(x, y)
chi2, p, dof, ex = chi2_contingency(obs_table)
n = obs_table.sum().sum()
phi2 = chi2 / n
r, k = obs_table.shape
phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
rcorr = r - ((r-1)**2)/(n-1)
kcorr = k - ((k-1)**2)/(n-1)
V = np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
return V
#X是一个列名列表,每个元素都是列名,与Y列逐个进行卡方计算,Y是列名
def chi2_df(df,X_list,Y):
i=0
df_c=pd.DataFrame(columns=["列1","列2","chi2","p","v"])
y=df[Y]
for X in X_list:
x=df[X]
s=chi2_r(x,y)
df_c.at[i,"列1"]=X
df_c.at[i,"列2"]=Y
df_c.at[i,"chi2"]=s[0]
df_c.at[i,"p"]=s[1]
df_c.at[i,"v"]=cramers_v(x, y)
i+=1
return df_c
if __name__ == '__main__':
path_s = r"D:\data\2023\wifi 测试\WIFI测试分析表.csv"
path_t = r"D:\data\2023\wifi 测试\WIFI测试分析表相关性223.xlsx"
df_s = load(path_s, engine="pandas")
df=df_s
df_c = pd.DataFrame(columns=["列1", "列2", "chi2", "p", "v"])
X_list = ['测试软件名称', '路由器型号', '无线协议', '频段', '电平', '带宽', '信道', '下行', '测试场景']
Y="下行"
df_t=chi2_df(df,X_list,Y)
dump(df_t,path_t)
2、F检验 方差分析anova
import pandas as pd
import numpy as np
from pandasrw import load,dump
from functools import reduce
import time
from scipy.stats import chi2_contingency
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import ols
def F_anova(df,category,continuous):
formula = f"{continuous} ~{category} "
# 运行一元线性回归模型并执行方差分析
model = ols(formula=formula, data=df).fit()
anova_table = anova_lm(model, typ=2)
return anova_table
def F_df(df,X_list,Y):
i=0
df_c=pd.DataFrame(columns=["列1","列2","F","p"])
for X in X_list:
print(X)
pf=df[[X,Y]]
pf.dropna(inplace=True, how='any')
s=F_anova(pf,X,Y)
df_c.at[i,"列1"]=X
df_c.at[i,"列2"]=Y
df_c.at[i,"F"]=s.iloc[0,2]
df_c.at[i,"p"]=s.iloc[0,3]
i+=1
return df_c
if __name__ == '__main__':
path_s =r"D:\data\2023\wifi 测试\WIFI测试跟踪表V2.csv"
path_t = r"D:\data\2023\wifi 测试\WIFI测试分析表相关性323.xlsx"
df_s = load(path_s, engine="pandas")
df=df_s
C=['测试软件名称', '无线协议', '频段', '电平', '带宽', '信道', '测试场景']
F = F_df(df, C, "下行")
print(F)