原始数据:
# Paths of the two input CSV files, relative to the working directory.
filecpu = "./statistic.csv"
filefre = "./statistic_freq.csv"

# The first file is read with pandas' default comma separator.
data_statistic = pd.read_csv(filecpu)
# The second file uses the multi-character delimiter " ,split, ". Separators
# longer than one character are only supported by the Python parsing engine,
# so request it explicitly instead of relying on pandas' fallback (which
# emits a ParserWarning).
data_statistic_fre = pd.read_csv(filefre, sep=" ,split, ", engine="python")

# Show the inferred dtype of every column in both frames.
print(data_statistic.dtypes)
print('-----')
print(data_statistic_fre.dtypes)
print('-----')
# If a column contains values of more than one type, its dtype is reported as
# object; columns of strings are likewise reported as object.
清洗数据过程:
# Keep only the "Time" column and the column named by `key`, then discard
# every row that has a missing value in either of them.
data_statistic_key = data_statistic[["Time", key]].dropna(axis=0)
# Convert the raw "Time" values into datetime objects and write them back.
# NOTE(review): `parser` is presumably `dateutil.parser` -- confirm against
# the file's imports.
data_statistic_key_time_ori = data_statistic_key["Time"]
time_standard = list(map(parser.parse, data_statistic_key_time_ori))
data_statistic_key["Time"] = time_standard
def cleaning_data_statistic(self, data):
    """Return a cleaned copy of ``data`` whose last column is float-typed.

    Only the last column is inspected. When it holds text (object or string
    dtype), the cleaning steps are:

    1. remove every ``%`` character from string cells;
    2. drop rows whose cell still contains a space, an ASCII letter or a
       newline;
    3. convert the remaining cells to ``float``.

    Non-string cells (numbers, NaN) in a text column are kept as they are.
    If the last column is not text, or the frame has no columns, an
    unmodified copy is returned. The caller's frame is never mutated.

    Args:
        data: pandas DataFrame to clean.

    Returns:
        A new DataFrame containing the surviving rows.

    Raises:
        ValueError: if a surviving cell cannot be parsed as a float
            (for example an empty string).
    """
    # Function-scope imports keep this block self-contained.
    import re
    from pandas.api.types import is_object_dtype, is_string_dtype

    columns = data.keys()
    if len(columns) == 0:
        return data.copy()

    last_col = columns[-1]
    values = data[last_col]
    # Numeric (or otherwise non-text) columns need no cleaning.
    if not (is_object_dtype(values) or is_string_dtype(values)):
        return data.copy()

    def _strip_percent(cell):
        # Series.replace('%', '') would only match cells that are exactly
        # '%'; the percent sign has to be removed as a substring.
        return cell.replace('%', '') if isinstance(cell, str) else cell

    # Space, ASCII letter or newline marks a cell as unusable.
    invalid = re.compile(r'[ A-Za-z\n]')

    def _is_dirty(cell):
        return isinstance(cell, str) and invalid.search(cell) is not None

    values = values.map(_strip_percent)
    # Build one mask over the original rows so that both filters stay
    # aligned; astype(bool) guarantees a boolean mask even for empty input.
    keep = ~values.map(_is_dirty).astype(bool)

    # Copy the filtered rows so the column assignment below writes to an
    # independent frame rather than to a slice of the input.
    cleaned = data.loc[keep].copy()
    # Float dtype makes later numpy arithmetic on the column straightforward.
    cleaned[last_col] = values[keep].astype('float')
    return cleaned