数据清洗基础

最新推荐文章于 2023-12-14 09:51:44 发布

峰芒毕露438

最新推荐文章于 2023-12-14 09:51:44 发布

阅读量1.7k

点赞数 7

文章标签： python pandas 开发语言数据分析

本文链接：https://blog.csdn.net/qq_71139242/article/details/129835595

版权

第1关：读CSV文件

#********** Begin **********#
import pandas as pd
laptops=pd.read_csv("laptops.csv",encoding="Latin-1")
laptops.info()
#********** End **********#

第2关：清洗列名

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')
#********** Begin **********#
# laptops.columns=['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
#        'CPU', 'RAM', 'Storage', 'GPU', 'Operating System',
#        'Operating System Version', 'Weight', 'Price (Euros)']
# print(laptops.columns)
new_columns=[]
for x in laptops.columns:
    new_columns.append(x.strip())
laptops.columns=new_columns
print(laptops.columns)
#********** End **********#

第3关：清洗列名(续)

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')
#********** Begin **********#
new_columns=[]
for x in laptops.columns:
    x=x.strip()
    x=x.replace("Operating System","os")
    x=x.replace(" ","_")
    x=x.replace("(","")
    x=x.replace(")","")
    x=x.lower()
    new_columns.append(x)
laptops.columns=new_columns
print(laptops.columns)


#********** End **********#

第4关：将字符串列转换为数值列

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ", "_")
    col = col.replace("(", "")
    col = col.replace(")", "")
    col = col.lower()
    return col

new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)
    
laptops.columns = new_columns
#********** Begin **********#
unique_ram=laptops["ram"].unique()
print(unique_ram)


#********** End **********#

第5关：删除非数字字符

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ", "_")
    col = col.replace("(", "")
    col = col.replace(")", "")
    col = col.lower()
    return col

new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)

laptops.columns = new_columns
#********** Begin **********#
laptops["ram"] = laptops["ram"].str.replace('GB','')
unique_ram=laptops["ram"].unique()
print(unique_ram)


#********** End **********#

第6关：将列转换为数字类型

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ","_")
    col = col.replace("(","")
    col = col.replace(")","")
    col = col.lower()
    return col
new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)
laptops.columns = new_columns

laptops["ram"] = laptops["ram"].str.replace('GB', '')
#********** Begin **********#
laptops["ram"]=laptops["ram"].astype(int)
print(laptops.dtypes)

#********** End **********#

第7关：列的重命名

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ","_")
    col = col.replace("(","")
    col = col.replace(")","")
    col = col.lower()
    return col
new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)
laptops.columns = new_columns

laptops["ram"] = laptops["ram"].str.replace('GB', '').astype(int)
#********** Begin **********#
laptops.rename({"ram": "ram_gb"}, axis=1, inplace=True)
print(laptops["ram_gb"].describe())
# count    1303.000000
# mean        8.382195
# std         5.084665
# min         2.000000
# 25%         4.000000
# 50%         8.000000
# 75%         8.000000
# max        64.000000
# Name: ram_gb, dtype: float64

#********** End **********#

第8关：从字符串中提取数值

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ","_")
    col = col.replace("(","")
    col = col.replace(")","")
    col = col.lower()
    return col
new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)
laptops.columns = new_columns

laptops["ram"] = laptops["ram"].str.replace('GB', '').astype(int)
laptops.rename({"ram": "ram_gb"}, axis=1, inplace=True)
#********** Begin **********#
laptops["gpu_manufacturer"]=laptops["gpu"].str.split().str[0]
laptops["cpu_manufacturer"]=laptops["cpu"].str.split().str[0]
print(laptops["cpu_manufacturer"].value_counts())

#********** End **********#

第9关：纠正错误值

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ","_")
    col = col.replace("(","")
    col = col.replace(")","")
    col = col.lower()
    return col
new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)
laptops.columns = new_columns

laptops["ram"] = laptops["ram"].str.replace('GB', '').astype(int)
laptops.rename({"ram": "ram_gb"}, axis=1, inplace=True)
laptops["gpu_manufacturer"] = (laptops["gpu"]
                                       .str.split()
                                       .str[0]
                              )
laptops["cpu_manufacturer"] = (laptops["cpu"]
                                       .str.split()
                                       .str[0]
                               )

mapping_dict = {
    'Android': 'Android',
    'Chrome OS': 'Chrome OS',
    'Linux': 'Linux',
    'Mac OS': 'macOS',
    'No OS': 'No OS',
    'Windows': 'Windows',
    'macOS': 'macOS'
}
#********** Begin **********#
laptops["os"]=laptops["os"].map(mapping_dict)
print(laptops["os"].value_counts())


#********** End **********#

第10关：删除缺失值

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ","_")
    col = col.replace("(","")
    col = col.replace(")","")
    col = col.lower()
    return col
new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)
laptops.columns = new_columns
laptops["ram"] = laptops["ram"].str.replace('GB', '').astype(int)
laptops.rename({"ram": "ram_gb"}, axis=1, inplace=True)

laptops["gpu_manufacturer"] = (laptops["gpu"]
                                       .str.split()
                                       .str[0]
                              )
laptops["cpu_manufacturer"] = (laptops["cpu"]
                                       .str.split()
                                       .str[0]
                               )

mapping_dict = {
    'Android': 'Android',
    'Chrome OS': 'Chrome OS',
    'Linux': 'Linux',
    'Mac OS': 'macOS',
    'No OS': 'No OS',
    'Windows': 'Windows',
    'macOS': 'macOS'
}
laptops["os"] = laptops["os"].map(mapping_dict)
#********** Begin **********#
laptops_no_null_rows=(laptops.dropna(axis=0))
laptops_no_null_cols=(laptops.dropna(axis=1))
print(laptops_no_null_rows.head(20))
print(laptops_no_null_cols.head(20))


#********** End **********#

第11关：填充缺失值

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ","_")
    col = col.replace("(","")
    col = col.replace(")","")
    col = col.lower()
    return col

new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)

laptops.columns = new_columns
laptops["ram"] = laptops["ram"].str.replace('GB', '').astype(int)
laptops.rename({"ram": "ram_gb"}, axis=1, inplace=True)
laptops["gpu_manufacturer"] = (laptops["gpu"]
                                       .str.split()
                                       .str[0]
                              )
laptops["cpu_manufacturer"] = (laptops["cpu"]
                                       .str.split()
                                       .str[0]
                               )

mapping_dict = {
    'Android': 'Android',
    'Chrome OS': 'Chrome OS',
    'Linux': 'Linux',
    'Mac OS': 'macOS',
    'No OS': 'No OS',
    'Windows': 'Windows',
    'macOS': 'macOS'
}
laptops["os"] = laptops["os"].map(mapping_dict)
#********** Begin **********#
value_counts_before = laptops.loc[laptops["os_version"].isnull(), "os"].value_counts()
print(value_counts_before)
laptops.loc[laptops["os"] == "No OS", "os_version"] ="Version Unknown"
laptops.loc[laptops["os"] == "macOS", "os_version"] ="Version Unknown"
value_counts_before = laptops.loc[laptops["os_version"].isnull(), "os"].value_counts()
print(value_counts_before)

#********** End **********#

第12关：挑战：对字符串列进行清洗

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ","_")
    col = col.replace("(","")
    col = col.replace(")","")
    col = col.lower()
    return col

new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)

laptops.columns = new_columns
laptops["ram"] = laptops["ram"].str.replace('GB', '').astype(int)
laptops.rename({"ram": "ram_gb"}, axis=1, inplace=True)
laptops["gpu_manufacturer"] = (laptops["gpu"]
                                       .str.split()
                                       .str[0]
                              )
laptops["cpu_manufacturer"] = (laptops["cpu"]
                                       .str.split()
                                       .str[0]
                               )

mapping_dict = {
    'Android': 'Android',
    'Chrome OS': 'Chrome OS',
    'Linux': 'Linux',
    'Mac OS': 'macOS',
    'No OS': 'No OS',
    'Windows': 'Windows',
    'macOS': 'macOS'
}
laptops["os"] = laptops["os"].map(mapping_dict)
laptops.loc[laptops["os"] == "macOS", "os_version"] = "X"
laptops.loc[laptops["os"] == "No OS", "os_version"] = "Version Unknown"
#********** Begin **********#
laptops["weight"] =laptops["weight"].str.replace('kg','')
laptops["weight"] =laptops["weight"].str.replace('s','')
laptops["weight"]=laptops["weight"].astype(float)
laptops.rename({"weight": "weight_kg"}, axis=1, inplace=True)
laptops.to_csv('laptops_clean.csv', index=False)
print(laptops.columns)
#********** End **********#