import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
# Name of the tab-separated input file.
file_name = 'test.txt'
# Read all lines up front; the with-block closes the file handle afterwards.
with open(file_name, 'r') as f:
    data = f.readlines()
# Number of raw lines read from the file.
n = len(data)
# Drop the trailing newline of each line and split it on tabs into fields.
# Wholly empty lines (e.g. a stray blank line at the end of the file) are
# skipped: they would otherwise become one-field rows and make the table
# ragged.
arr = [
    line.strip('\n').split('\t')
    for line in data
    if line.strip('\n')
]
# Convert the list of rows (one per input line) to a NumPy array.
arr = np.array(arr)
# Column names for the input table, in file order.
# NOTE(review): an earlier commented-out draft of this list additionally ended
# with a 'price' column; presumably that variant was meant for an input file
# that also carries prices -- confirm before reusing this header elsewhere.
head = [
    'carid',
    'tradetime',
    'brand',
    'serial',
    'model',
    'mileage',
    'color',
    'cityid',
    'carcode',
    'transfercount',
    'seatings',
    'registerDate',
    'licenseDate',
    'country',
    'maketype',
    'modelyear',
    'displacement',
    'gearbox',
    'oiltype',
    'newprice',
    # Fifteen anonymised feature columns: anon_1 ... anon_15.
    *[f'anon_{k}' for k in range(1, 16)],
]
# Wrap the parsed rows in a DataFrame using the column names above.
df = pd.DataFrame(arr, columns=head)
# Empty strings denote missing cells; mark them as NaN so pandas treats them
# as missing.
df[df == ''] = np.nan
# Fraction of missing cells in each column.
report = df.isnull().sum() / len(df)
# Keep only the columns that actually have missing values.
report = report[report != 0]
# Columns in which at least 25% of the rows are missing ...
missing_index = report[report >= 0.25].index
# ... are removed from the table altogether.
df = df.drop(columns=missing_index)
def _to_numeric_if_possible(column):
    """Return *column* converted to a numeric dtype, or unchanged on failure.

    This replaces ``pd.to_numeric(..., errors='ignore')``, which is
    deprecated in recent pandas releases, with the explicit equivalent:
    attempt the conversion and fall back to the original column when a value
    cannot be parsed.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Convert every column that parses cleanly as numbers; other columns are
# left untouched.
df = df.apply(_to_numeric_if_possible)
# Parse the date strings into datetime values.
for date_column in ('tradetime', 'registerDate', 'licenseDate'):
    df[date_column] = pd.to_datetime(df[date_column])
# Second pass: turn the datetime columns into plain numbers so that all
# numeric-capable columns can be handled as numbers downstream.
# NOTE(review): pd.to_numeric reinterprets datetime values as their integer
# representation; a missing date (NaT), if any exist, would presumably become
# a sentinel integer rather than NaN -- verify the date columns have no gaps.
df = df.apply(_to_numeric_if_possible)
def _build_label_codes(column):
    """Map each distinct non-missing value of *column* to an integer code.

    Missing values are excluded via ``dropna`` instead of removing ``np.nan``
    from a set, which raised ``KeyError`` when the column had no missing
    value and relied on every NaN being the very same object.  The labels are
    sorted (by their string form) so the codes are identical on every run;
    iterating a set of strings gives an order that can change between runs.

    Returns a dict ``{label: code}`` with codes ``0 .. k-1``.
    """
    labels = sorted(column.dropna().unique(), key=str)
    return {label: code for code, label in enumerate(labels)}


# anon_11: collect its categories and replace each with its integer code.
# Missing cells are not in the mapping, so ``map`` leaves them as NaN.
anon_11_dict = _build_label_codes(df['anon_11'])
anon_11_keys = set(anon_11_dict)
df['anon_11'] = df['anon_11'].map(anon_11_dict)
# anon_12: same treatment.
anon_12_dict = _build_label_codes(df['anon_12'])
anon_12_keys = set(anon_12_dict)
df['anon_12'] = df['anon_12'].map(anon_12_dict)
# K-nearest-neighbours imputer configured to use a single neighbour.
knn_imp = KNNImputer(n_neighbors=1)
# Fit the imputer on the table's underlying array, then fill its gaps.
knn_imp.fit(df.values)
arr = knn_imp.transform(df.values)
# Re-wrap the filled array in a DataFrame with the same column names.
df = pd.DataFrame(data=arr, columns=df.columns)
# Invert the label encodings: integer code -> original label.
verse_anon_11_dict = {code: label for label, code in anon_11_dict.items()}
verse_anon_12_dict = {code: label for label, code in anon_12_dict.items()}
# Decode the two categorical columns back to their original labels.
df['anon_11'] = df['anon_11'].map(verse_anon_11_dict)
df['anon_12'] = df['anon_12'].map(verse_anon_12_dict)
# Turn the numeric date columns back into datetime values.
for date_column in ('tradetime', 'registerDate', 'licenseDate'):
    df[date_column] = pd.to_datetime(df[date_column])
# Write the cleaned table to CSV without the row index.  ``index`` is
# documented as a bool, so pass False explicitly instead of relying on None
# being falsy.
df.to_csv('test.csv', index=False)
# Topic tag: MathorCup data competition (mathorcup数据大赛)
# Web-page footer: latest recommended article published 2023-03-10 10:03:59