#### 按照惯例导入两个常用的数据处理的包,numpy与pandas
import numpy as np
import pandas as pd
# 从csv文件读取数据,数据表格中只有5行,里面包含了float,string,int三种python数据类型,也就是分别对应pandas的float64,object,int64
# csv文件中共有六行,第一行是表头,其余是数据。
# Load the raw sales table from the CSV file and print it so the parsed
# values of every column can be inspected.
df = pd.read_csv("sales_data_types.csv")
print(df)
#    Customer Number     Customer Name         2016           2017  \
# 0            10002  Quest Industries  $125,000.00    $162,500.00
# 1           552278    Smith Plumbing  $920,000.00  $1,012,000.00
# 2            23477   ACME Industrial   $50,000.00     $62,500.00
# 3            24900        Brekke LTD  $350,000.00    $490,000.00
# 4           651029         Harbor Co   $15,000.00     $12,750.00
#
#   Percent Growth Jan Units  Month  Day  Year Active
# 0         30.00%       500      1   10  2015      Y
# 1         10.00%       700      6   15  2014      Y
# 2         25.00%       125      3   29  2016      Y
# 3          4.00%        75     10   27  2015      Y
# 4        -15.00%    Closed      2    2  2014      N
# Inspect the dtype pandas inferred for each column of df. This is a bare
# expression, so its value is only displayed in an interactive session.
df.dtypes
# Customer Number     int64
# Customer Name      object
# 2016               object
# 2017               object
# Percent Growth     object
# Jan Units          object
# Month               int64
# Day                 int64
# Year                int64
# Active             object
# dtype: object
#    Customer Number     Customer Name         2016           2017  \
# 0            10002  Quest Industries  $125,000.00    $162,500.00
# 1           552278    Smith Plumbing  $920,000.00  $1,012,000.00
# 2            23477   ACME Industrial   $50,000.00     $62,500.00
# 3            24900        Brekke LTD  $350,000.00    $490,000.00
# 4           651029         Harbor Co   $15,000.00     $12,750.00
#
#   Percent Growth Jan Units  Month  Day  Year Active
# 0         30.00%       500      1   10  2015      Y
# 1         10.00%       700      6   15  2014      Y
# 2         25.00%       125      3   29  2016      Y
# 3          4.00%        75     10   27  2015      Y
# 4        -15.00%    Closed      2    2  2014      N
# 然后像2016,2017,Percent Growth,Jan Units 这几列带有特殊符号的object是不能直接通过astype("float")方法进行转化的,
# 这与python中字符串转化为浮点数一样,都要求原始的字符只能含有数字本身,不能含有其他的特殊字符
# 我们可以试着将Active列转化为布尔值,看一下到底会发生什么:五个结果全是True,说明并没有起到什么作用
def convert_currency(var: str) -> float:
    """Convert a currency string such as "$1,012,000.00" to a float.

    The thousands separators (",") and the dollar sign ("$") are removed
    and the remaining text is parsed with ``float``.

    Args:
        var: Currency text, e.g. "$125,000.00".

    Returns:
        The numeric value as a float.

    Raises:
        ValueError: If the text left after stripping "," and "$" is not a
            valid number.
    """
    new_value = var.replace(",", "").replace("$", "")
    return float(new_value)
# Customer Number      int32
# Customer Name       object
# 2016               float64
# 2017               float64
# Percent Growth     float64
# Jan Units           object
# Month                int64
# Day                  int64
# Year                 int64
# Active                bool
# dtype: object
# Customer Number             int32
# Customer Name              object
# 2016                      float64
# 2017                      float64
# Percent Growth            float64
# Jan Units                 float64
# Month                       int64
# Day                         int64
# Year                        int64
# Active                       bool
# Start_date         datetime64[ns]
# dtype: object
# Combine these conversions together.
def convert_percent(val: str) -> float:
    """Convert a percentage string such as "30.00%" to a decimal fraction.

    The "%" sign is removed and the remaining number is divided by 100,
    so "30.00%" becomes 0.3.

    Args:
        val: Percentage text, e.g. "-15.00%".

    Returns:
        The percentage expressed as a float fraction.

    Raises:
        ValueError: If the text left after removing "%" is not a valid
            number.
    """
    new_val = val.replace('%', '')
    return float(new_val) / 100
# Re-read the CSV and apply every conversion while parsing, instead of
# fixing the columns up one by one afterwards.
df_2 = pd.read_csv(
    "sales_data_types.csv",
    # The header is "Customer Number" (with a space); the former key
    # "Customer_Number" matched no column. The width is spelled out as int64
    # so the result does not depend on the platform's default int size.
    dtype={"Customer Number": "int64"},
    converters={
        "2016": convert_currency,
        "2017": convert_currency,
        "Percent Growth": convert_percent,
        # errors="coerce" turns non-numeric cells such as "Closed" into NaN.
        "Jan Units": lambda x: pd.to_numeric(x, errors="coerce"),
        # A converter receives one cell at a time, so a plain comparison
        # yields a real bool; np.where on a scalar returns a 0-d array.
        "Active": lambda x: x == "Y",
    },
)
# Inspect the resulting column dtypes (only displayed in an interactive
# session, since this is a bare expression).
df_2.dtypes
# Customer Number      int64
# Customer Name       object
# 2016               float64
# 2017               float64
# Percent Growth     float64
# Jan Units          float64
# Month                int64
# Day                  int64
# Year                 int64
# Active                bool
# dtype: object