'''
练习1
• 读取北向.csv 指定 trade_date 为 行索引
• 查看数据的基本信息 有无缺失值 对其缺失值进行处理
• 删除缺失值所在行
• 查看数据的基本信息 查看数据是否清洗完毕
• 标签为 index 这列没啥用 将该列删除
• 观察数据是否有重复行
• 将重复行进行删除
• 将行索引 进行升序
• 将处理好的数据 保存至 北向(副).csv
'''
import numpy as np
import pandas as pd
# Exercise 1: load 北向.csv with trade_date as the row index, clean it, and
# save the result to 北向(副).csv — implements every step listed in the
# header spec (the original code only read and printed the raw frame).
data = pd.read_csv(r"北向.csv", index_col="trade_date")
data.info()                          # basic info: dtypes / non-null counts expose missing values
data = data.dropna()                 # spec: drop every row containing a missing value
data.info()                          # verify the frame is fully cleaned
data = data.drop(columns=["index"])  # the 'index' column is a useless export artifact
print(data.duplicated().sum())       # observe how many fully duplicated rows exist
data = data.drop_duplicates()        # remove the duplicated rows
data = data.sort_index()             # row index (trade_date) in ascending order
data.to_csv(r"北向(副).csv")         # persist the cleaned data
print(data)
'''
index trade_date ggt_ss ggt_sz hgt sgt north_money south_money
0 0 20190624 -541.17 792.38 -757.96 -1153.14 -1911.10 251.21
1 1 20190621 -97.40 701.36 3722.36 3608.14 7330.50 603.96
2 2 20190620 660.05 555.23 1914.44 3650.47 5564.91 1215.28
3 3 20190619 -491.58 186.47 2092.51 2831.23 4923.74 -305.11
4 4 20190618 1667.40 832.29 974.92 617.24 1592.16 2499.69
... ... ... ... ... ... ... ... ...
879 295 20190612 2032.73 912.14 1467.34 -181.33 1286.01 2944.87
880 296 20190611 2699.37 1038.56 3774.59 3171.37 6945.96 3737.93
881 297 20190610 1160.59 703.69 4957.98 2939.29 7897.27 1864.28
882 298 20190606 -13.56 -20.15 1500.24 -421.68 1078.56 -33.71
883 299 20190605 218.43 394.27 2276.22 781.60 3057.82 612.70
884 rows × 8 columns
'''
'''
练习2
读取 FoodFacts.csv 数据,该数据是全球食品数据,需分析每个国家添加剂的平均使用。
步骤分析
• 1.读取数据
• 2.数据质量考量
• 3.清洗数据
• 4.对各个国家的使用数量进行统计
• 1.清洗,统计国家数据
• 2.通过国家统计添加剂用量
• 5.保存统计结果
'''
# Exercise 2, steps 1-3: load the global food-facts data and drop columns
# that are entirely empty.
# NOTE(review): the original path literal was "FoodFacts.csv " with a
# trailing space — that only opens on Windows (which strips trailing
# spaces); removed for portability.
data_food = pd.read_csv(r"FoodFacts.csv")
data_food.info()
data_food.head(10)
# BUG FIX: the original assigned the dropna() result to `data`, clobbering
# exercise 1's frame, and then inspected the UNCHANGED data_food — so the
# all-NaN-column cleanup was silently discarded.  Assign back to data_food.
data_food = data_food.dropna(axis=1, how="all")
data_food.info()
data_food
data_food.columns
'''
Index(['code', 'url', 'creator', 'created_t', 'created_datetime',
'last_modified_t', 'last_modified_datetime', 'product_name',
'generic_name', 'quantity',
...
'caffeine_100g', 'taurine_100g', 'ph_100g',
'fruits_vegetables_nuts_100g', 'collagen_meat_protein_ratio_100g',
'cocoa_100g', 'chlorophyl_100g', 'carbon_footprint_100g',
'nutrition_score_fr_100g', 'nutrition_score_uk_100g'],
dtype='object', length=159)
'''
# Only two columns matter for the per-country statistics, so re-read just
# those.  (Trailing space removed from the path literal: "FoodFacts.csv "
# only opens on Windows, which strips trailing spaces from filenames.)
data1 = pd.read_csv(r"FoodFacts.csv", usecols=["countries_en", "additives_n"])
data1.info()
data1.head()
'''
countries_en additives_n
0 France NaN
1 France NaN
2 France NaN
3 France NaN
4 France NaN
'''
# Rows with NaN in either column are useless for a per-country mean — drop
# them (reassignment, so the cleaned frame is what later code sees).
data1 = data1.dropna()
data1
'''
countries_en additives_n
5 United Kingdom 0.0
6 France 0.0
8 France 0.0
10 United Kingdom 5.0
11 United Kingdom 5.0
... ... ...
65480 United States 4.0
65490 France 0.0
65494 France 0.0
65499 France 0.0
65501 France 0.0
43616 rows × 2 columns
'''
# Step 4: mean additive count per country.
# Multi-country rows are comma-separated lists (e.g. "France,Germany"), so a
# value WITHOUT a comma names exactly one country; the distinct
# single-country values form the list of countries to report on.
data_country = data1['countries_en'][~data1['countries_en'].str.contains(',')]
count = data_country.drop_duplicates().count()  # number of distinct countries
total_countries = data_country.drop_duplicates()
mean_additive_list = []
for country in total_countries:
    # Substring match so multi-country rows also count toward each listed
    # country.  regex=False is the bug fix: country names containing regex
    # metacharacters (e.g. parentheses) were previously compiled as
    # patterns, crashing or matching the wrong rows.
    # NOTE(review): plain substring matching still conflates nested names
    # (e.g. "Sudan" also matches rows for "South Sudan") — confirm this is
    # acceptable for the exercise.
    matched = data1[data1["countries_en"].str.contains(country, case=False, regex=False)]
    mean_additive_list.append(matched["additives_n"].mean())
needed_data = pd.DataFrame({
    "country": total_countries,
    "mean_additive": mean_additive_list
})
# Step 5 of the exercise spec (previously unimplemented): save the result.
needed_data.to_csv(r"country_mean_additives.csv", index=False)
needed_data
'''
country mean_additive
5 United Kingdom 1.259009
6 France 1.930422
15 Spain 0.930324
22 Germany 0.777923
69 United States 2.180608
... ... ...
62678 Iraq 1.500000
63052 Nederland 0.000000
64087 Singapore 1.000000
64096 Indonesia 2.125000
65403 Burkina Faso 1.666667
84 rows × 2 columns
'''