# --- Remove rows with missing data ---
# 导入所需的库文件
import numpy as np
import pandas as pd
import time, datetime
import matplotlib.pyplot as plt
# Load the raw yellow-taxi trip records for January 2016. Reading and
# concatenating the February and March files is currently disabled, so only
# this single month is cleaned.
yellow_taxi = pd.read_csv('./信息通信实验数据/yellow_tripdata_2016-01.csv')

# Quick sanity check of what was loaded: dimensions, first rows, column names.
print(yellow_taxi.shape)
print(yellow_taxi.head(), "1")
print(yellow_taxi.columns)
print("读取成功")

# Flag every row that has at least one missing value, then split the table
# into the incomplete rows and the complete ones.
has_missing = yellow_taxi.isna().any(axis=1)
rows_with_missing = yellow_taxi.loc[has_missing]
clean_data = yellow_taxi.loc[~has_missing]

# Persist the complete rows (without the index column) for the filtering step.
clean_data.to_csv('2016-1-clean_data.csv', index=False)
# --- Filter every field against its validity condition ---
# Validity rule for each column. A row is kept only if every rule holds.
# Each rule is written with element-wise operators (`&`, comparisons) so it
# accepts either a whole pandas Series (fast, vectorised) or a single number.
#
# passenger_count uses a whole-number check instead of `isinstance(x, int)`:
# the result then no longer depends on whether the CSV column was parsed as
# an integer or a float dtype (a float-typed column would otherwise fail the
# type check on every row and empty the data set).
#
# NOTE(review): the strict `0 <` lower bound on mta_tax, tip_amount and
# tolls_amount also removes trips where those amounts are exactly zero.
# The thresholds are kept exactly as they were -- confirm this is intended.
column_conditions = {
    'passenger_count': lambda v: (v % 1 == 0) & (v > 0) & (v < 10),
    'trip_distance': lambda v: (v > 0) & (v <= 50),
    'fare_amount': lambda v: v > 0,
    'mta_tax': lambda v: (v > 0) & (v <= 50),
    'tip_amount': lambda v: (v > 0) & (v <= 50),
    'tolls_amount': lambda v: (v > 0) & (v <= 50),
}


def filter_by_conditions(frame, conditions):
    """Return the rows of *frame* that satisfy every column condition.

    Args:
        frame: DataFrame containing (at least) every column named in
            *conditions*.
        conditions: Mapping of column name to a predicate that takes the
            column as a Series and returns an element-wise boolean result.

    Returns:
        A DataFrame with the surviving rows, in their original order and
        with their original index labels.

    Raises:
        KeyError: If a column named in *conditions* is missing from *frame*.
    """
    # Build one positional mask for all rules and index once, instead of
    # scanning and copying the frame separately for every column.
    keep = np.ones(len(frame), dtype=bool)
    for column, condition in conditions.items():
        keep &= np.asarray(condition(frame[column]), dtype=bool)
    return frame.loc[keep]


if __name__ == "__main__":
    # Read the rows that survived the missing-value step, drop every row that
    # violates a rule, and save the result without the index column.
    data = pd.read_csv('2016-1-clean_data.csv')
    data = filter_by_conditions(data, column_conditions)
    data.to_csv('processed_data.csv', index=False)
    print("finish")