import pandas as pd
# Load the questionnaire export from the Excel workbook.
file_path = "量表编码.xlsx"
data = pd.read_excel(file_path)
# Labels of the multi-select question columns that will be one-hot encoded.
# NOTE(review): these look like question-number prefixes; they are only
# processed if they match a column name of `data` exactly -- confirm against
# the real workbook headers.
multiple_choice_cols = [
"7、",
"8、",
"9、",
"12、",
"14、",
"21、",
"25、",
]
# One-hot encode the multi-select question columns.
def _option_text(value):
    """Render a non-text answer cell as the option label it stands for."""
    # A numeric column that also holds missing values is stored as float,
    # so an option coded 3 arrives as 3.0; label it "3" rather than "3.0".
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def process_multiple_choice_columns(data, columns, sep="┋"):
    """Replace multi-select columns with one 0/1 indicator column per option.

    Args:
        data: DataFrame holding the questionnaire answers.
        columns: Column names to expand. Names that are not present in
            ``data`` are skipped silently.
        sep: String that separates the selected options inside one cell.

    Returns:
        A DataFrame in which every expanded column has been removed and
        its indicator columns, named ``f"{col}_{option}"``, sit at the
        position the original column occupied. A column in which no
        option is found is removed without replacement. If none of
        ``columns`` is present, ``data`` itself is returned; otherwise the
        input frame is left unmodified and a new frame is returned.

    Raises:
        ValueError: Propagated from ``DataFrame.insert`` when an indicator
            column name already exists in the frame.
    """
    for col in columns:
        if col not in data.columns:
            continue

        position = data.columns.get_loc(col)
        answers = data[col]
        try:
            indicators = answers.str.get_dummies(sep)
        except AttributeError:
            # The .str accessor refuses non-text columns (for example a
            # question nobody answered, or answers stored as plain
            # numbers). Convert the non-missing cells to text and retry;
            # the object cast keeps an all-missing column acceptable to
            # the accessor.
            as_text = answers.map(_option_text, na_action="ignore")
            indicators = as_text.astype(object).str.get_dummies(sep)

        data = data.drop(columns=[col])
        # Insert left to right so the indicator columns keep the order
        # produced by get_dummies and occupy the removed column's slot.
        for offset, option in enumerate(indicators.columns):
            data.insert(position + offset, f"{col}_{option}", indicators[option])
    return data
# Expand every listed multi-select column into per-option indicator columns.
data = process_multiple_choice_columns(data, multiple_choice_cols)
# Integer-encode the object-dtype (non-numeric) columns.
def process_non_numeric_columns(data):
    """Swap each object-dtype column for its integer category codes.

    Args:
        data: DataFrame to encode. It is modified in place.

    Returns:
        The same ``data`` object, with every column selected by
        ``select_dtypes(include=["object"])`` overwritten by the codes of
        ``pd.Categorical`` built from that column (per pandas, missing
        values receive the code -1).
    """
    object_view = data.select_dtypes(include=["object"])
    # Iterating a DataFrame yields its column labels.
    for label in object_view:
        encoded = pd.Categorical(data[label])
        data[label] = encoded.codes
    return data
# Integer-encode the remaining object-dtype columns.
data = process_non_numeric_columns(data)
# NOTE(review): despite the name, no scaling happens here; `data` is only
# wrapped in a new DataFrame before being written out.
data_scaled = pd.DataFrame(data)
# Export the encoded table; to_excel is called with its defaults.
data_scaled.to_excel("data_scaled.xlsx")
关于问卷数据的多选题独热编码处理
最新推荐文章于 2025-05-10 22:06:54 发布