数据清洗基础

第1关:读CSV文件

#********** Begin **********#
import pandas as pd
laptops=pd.read_csv("laptops.csv",encoding="Latin-1")
laptops.info()
#********** End **********#

 第2关:清洗列名

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')
#********** Begin **********#
# laptops.columns=['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
#        'CPU', 'RAM', 'Storage', 'GPU', 'Operating System',
#        'Operating System Version', 'Weight', 'Price (Euros)']
# print(laptops.columns)
new_columns=[]
for x in laptops.columns:
    new_columns.append(x.strip())
laptops.columns=new_columns
print(laptops.columns)
#********** End **********#

第3关:清洗列名(续) 

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')
#********** Begin **********#
new_columns=[]
for x in laptops.columns:
    x=x.strip()
    x=x.replace("Operating System","os")
    x=x.replace(" ","_")
    x=x.replace("(","")
    x=x.replace(")","")
    x=x.lower()
    new_columns.append(x)
laptops.columns=new_columns
print(laptops.columns)


#********** End **********#

 第4关:将字符串列转换为数值列

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ", "_")
    col = col.replace("(", "")
    col = col.replace(")", "")
    col = col.lower()
    return col

new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)
    
laptops.columns = new_columns
#********** Begin **********#
unique_ram=laptops["ram"].unique()
print(unique_ram)


#********** End **********#

 第5关:删除非数字字符

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ", "_")
    col = col.replace("(", "")
    col = col.replace(")", "")
    col = col.lower()
    return col

new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)

laptops.columns = new_columns
#********** Begin **********#
laptops["ram"] = laptops["ram"].str.replace('GB','')
unique_ram=laptops["ram"].unique()
print(unique_ram)


#********** End **********#

 第6关:将列转换为数字类型

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ","_")
    col = col.replace("(","")
    col = col.replace(")","")
    col = col.lower()
    return col
new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)
laptops.columns = new_columns

laptops["ram"] = laptops["ram"].str.replace('GB', '')
#********** Begin **********#
laptops["ram"]=laptops["ram"].astype(int)
print(laptops.dtypes)

#********** End **********#

第7关:列的重命名 

 

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ","_")
    col = col.replace("(","")
    col = col.replace(")","")
    col = col.lower()
    return col
new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)
laptops.columns = new_columns

laptops["ram"] = laptops["ram"].str.replace('GB', '').astype(int)
#********** Begin **********#
laptops.rename({"ram": "ram_gb"}, axis=1, inplace=True)
print(laptops["ram_gb"].describe())
# count    1303.000000
# mean        8.382195
# std         5.084665
# min         2.000000
# 25%         4.000000
# 50%         8.000000
# 75%         8.000000
# max        64.000000
# Name: ram_gb, dtype: float64

#********** End **********#

第8关:从字符串中提取数值

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ","_")
    col = col.replace("(","")
    col = col.replace(")","")
    col = col.lower()
    return col
new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)
laptops.columns = new_columns

laptops["ram"] = laptops["ram"].str.replace('GB', '').astype(int)
laptops.rename({"ram": "ram_gb"}, axis=1, inplace=True)
#********** Begin **********#
laptops["gpu_manufacturer"]=laptops["gpu"].str.split().str[0]
laptops["cpu_manufacturer"]=laptops["cpu"].str.split().str[0]
print(laptops["cpu_manufacturer"].value_counts())

#********** End **********#

 第9关:纠正错误值

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ","_")
    col = col.replace("(","")
    col = col.replace(")","")
    col = col.lower()
    return col
new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)
laptops.columns = new_columns

laptops["ram"] = laptops["ram"].str.replace('GB', '').astype(int)
laptops.rename({"ram": "ram_gb"}, axis=1, inplace=True)
laptops["gpu_manufacturer"] = (laptops["gpu"]
                                       .str.split()
                                       .str[0]
                              )
laptops["cpu_manufacturer"] = (laptops["cpu"]
                                       .str.split()
                                       .str[0]
                               )

mapping_dict = {
    'Android': 'Android',
    'Chrome OS': 'Chrome OS',
    'Linux': 'Linux',
    'Mac OS': 'macOS',
    'No OS': 'No OS',
    'Windows': 'Windows',
    'macOS': 'macOS'
}
#********** Begin **********#
laptops["os"]=laptops["os"].map(mapping_dict)
print(laptops["os"].value_counts())


#********** End **********#

 第10关:删除缺失值

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ","_")
    col = col.replace("(","")
    col = col.replace(")","")
    col = col.lower()
    return col
new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)
laptops.columns = new_columns
laptops["ram"] = laptops["ram"].str.replace('GB', '').astype(int)
laptops.rename({"ram": "ram_gb"}, axis=1, inplace=True)

laptops["gpu_manufacturer"] = (laptops["gpu"]
                                       .str.split()
                                       .str[0]
                              )
laptops["cpu_manufacturer"] = (laptops["cpu"]
                                       .str.split()
                                       .str[0]
                               )

mapping_dict = {
    'Android': 'Android',
    'Chrome OS': 'Chrome OS',
    'Linux': 'Linux',
    'Mac OS': 'macOS',
    'No OS': 'No OS',
    'Windows': 'Windows',
    'macOS': 'macOS'
}
laptops["os"] = laptops["os"].map(mapping_dict)
#********** Begin **********#
laptops_no_null_rows=(laptops.dropna(axis=0))
laptops_no_null_cols=(laptops.dropna(axis=1))
print(laptops_no_null_rows.head(20))
print(laptops_no_null_cols.head(20))


#********** End **********#

 第11关:填充缺失值

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ","_")
    col = col.replace("(","")
    col = col.replace(")","")
    col = col.lower()
    return col

new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)

laptops.columns = new_columns
laptops["ram"] = laptops["ram"].str.replace('GB', '').astype(int)
laptops.rename({"ram": "ram_gb"}, axis=1, inplace=True)
laptops["gpu_manufacturer"] = (laptops["gpu"]
                                       .str.split()
                                       .str[0]
                              )
laptops["cpu_manufacturer"] = (laptops["cpu"]
                                       .str.split()
                                       .str[0]
                               )

mapping_dict = {
    'Android': 'Android',
    'Chrome OS': 'Chrome OS',
    'Linux': 'Linux',
    'Mac OS': 'macOS',
    'No OS': 'No OS',
    'Windows': 'Windows',
    'macOS': 'macOS'
}
laptops["os"] = laptops["os"].map(mapping_dict)
#********** Begin **********#
value_counts_before = laptops.loc[laptops["os_version"].isnull(), "os"].value_counts()
print(value_counts_before)
laptops.loc[laptops["os"] == "No OS", "os_version"] ="Version Unknown"
laptops.loc[laptops["os"] == "macOS", "os_version"] ="Version Unknown"
value_counts_before = laptops.loc[laptops["os_version"].isnull(), "os"].value_counts()
print(value_counts_before)

#********** End **********#

 第12关:挑战:对字符串列进行清洗

import pandas as pd
laptops = pd.read_csv('laptops.csv', encoding='Latin-1')

def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ","_")
    col = col.replace("(","")
    col = col.replace(")","")
    col = col.lower()
    return col

new_columns = []
for c in laptops.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)

laptops.columns = new_columns
laptops["ram"] = laptops["ram"].str.replace('GB', '').astype(int)
laptops.rename({"ram": "ram_gb"}, axis=1, inplace=True)
laptops["gpu_manufacturer"] = (laptops["gpu"]
                                       .str.split()
                                       .str[0]
                              )
laptops["cpu_manufacturer"] = (laptops["cpu"]
                                       .str.split()
                                       .str[0]
                               )

mapping_dict = {
    'Android': 'Android',
    'Chrome OS': 'Chrome OS',
    'Linux': 'Linux',
    'Mac OS': 'macOS',
    'No OS': 'No OS',
    'Windows': 'Windows',
    'macOS': 'macOS'
}
laptops["os"] = laptops["os"].map(mapping_dict)
laptops.loc[laptops["os"] == "macOS", "os_version"] = "X"
laptops.loc[laptops["os"] == "No OS", "os_version"] = "Version Unknown"
#********** Begin **********#
laptops["weight"] =laptops["weight"].str.replace('kg','')
laptops["weight"] =laptops["weight"].str.replace('s','')
laptops["weight"]=laptops["weight"].astype(float)
laptops.rename({"weight": "weight_kg"}, axis=1, inplace=True)
laptops.to_csv('laptops_clean.csv', index=False)
print(laptops.columns)
#********** End **********#

 

  • 7
    点赞
  • 22
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值