Python数据分析与机器学习-Pandas

源码下载地址:

http://download.csdn.net/download/adam_zs/10174600


import pandas

# Load the CSV; by default the first row supplies the column names.
food_info = pandas.read_csv("food_info.csv")
print(food_info)
print(type(food_info))  # <class 'pandas.core.frame.DataFrame'>
print(food_info.dtypes)

first_rows = food_info.head()  # head() shows the first 5 rows by default
print(first_rows)
print(food_info.head(3))  # first 3 rows
print(food_info.tail(3))  # last 3 rows
print(food_info.shape)    # (n_rows, n_columns)
print(food_info.columns)  # column names
'''
['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)',
       'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',
       'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
       'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',
       'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)',
       'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',
       'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg',
       'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)',
       'Cholestrl_(mg)']
'''
print(food_info.loc[0])  # first row
print(food_info.loc[6])  # seventh row
# print(food_info.loc[8620])  # KeyError: the label [8620] is not in the [index]

print(food_info.loc[3:6])  # label slice: rows 3,4,5,6 -- BOTH endpoints inclusive
print(food_info.loc[[2, 5, 10]])  # rows with labels 2, 5 and 10
print(food_info["NDB_No"])  # one column as a Series
print(food_info[["Zinc_(mg)", "Copper_(mg)"]])  # several columns as a DataFrame

# Select every column whose name ends with "(g)" (grams).
# A list comprehension replaces the original manual append loop.
gram_columns = [c for c in food_info.columns.tolist() if c.endswith("(g)")]
gram_df = food_info[gram_columns]
print(gram_df.head(3))

import pandas

food_info = pandas.read_csv("food_info.csv")

# Print every column name, then select all columns at once.
column_list = food_info.columns.tolist()
print(column_list)
print(food_info[column_list])

# Arithmetic with a scalar is applied element-wise to the Series.
print(food_info["Iron_(mg)"])
print(food_info["Iron_(mg)"] / 1000)
print(food_info["Iron_(mg)"] + 100)
print(food_info["Iron_(mg)"] - 100)
print(food_info["Iron_(mg)"] * 2)

# Two Series combine position by position (element-wise multiply / divide).
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
food_info["Iron_(g)"] = food_info["Iron_(mg)"] / 1000

# A hand-made "rating": protein counts double, fat counts against it.
weighted_protein = food_info["Protein_(g)"] * 2
weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
initial_rating = weighted_protein + weighted_fat

# Normalise a few columns to [0, 1] by dividing by the column maximum.
print(food_info["Energ_Kcal"])
max_calories = food_info["Energ_Kcal"].max()
normalized_calories = food_info["Energ_Kcal"] / max_calories
food_info["Normalized_Protein"] = food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
food_info["Normalized_Fat"] = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()

# inplace=True sorts this DataFrame in place instead of returning a new one.
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=True)   # ascending (the default)
print(food_info["Sodium_(mg)"])
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)  # descending
print(food_info["Sodium_(mg)"])

import pandas as pd
import numpy as np

# Titanic survival data (glossary of the columns below).
'''
'PassengerId' 乘客id
 'Survived' 标签值
 'Pclass' 仓位等级
 'Name' 姓名
 'Sex' 性别
 'Age' 年龄
 'SibSp' 兄弟姐妹数量
 'Parch' 老人孩子总数
 'Ticket' 票号
 'Fare' 票价
 'Cabin' 座位
 'Embarked' 登船地点
'''

# NOTE: 'display.height' was removed from pandas (raises OptionError in
# recent versions), hence it stays commented out here.
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

titanic_survival = pd.read_csv("titanic_train.csv")
print(titanic_survival.columns)
print(titanic_survival.head())

# pandas uses NaN ("not a number") for missing values.
# pd.isnull() takes a Series and returns a boolean Series
# (True = missing, False = present).
age = titanic_survival["Age"]
print(age)
age_is_null = pd.isnull(age)
print(age_is_null)
age_null_true = age[age_is_null]
print(len(age_null_true))  # number of missing Age values: 177

# Any arithmetic involving NaN yields NaN, so this naive mean prints nan.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)  # nan

# Filter out the missing ages first, then compute the mean by hand.
good_ages = titanic_survival["Age"][age_is_null == False]
print(sum(good_ages) / len(good_ages))  # 29.6991176471

# Series.mean() skips NaN automatically.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)  # 29.69911764705882

# Mean fare per cabin class, computed manually.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    fares_by_class[this_class] = pclass_rows["Fare"].mean()
print(fares_by_class)

# pivot_table does the same aggregation declaratively:
#   index   - the column to group by
#   values  - the column(s) to aggregate
#   aggfunc - the aggregation to apply (mean is the default)
passenger_Fare = titanic_survival.pivot_table(index="Pclass", values="Fare", aggfunc=np.mean)
print(passenger_Fare)

passenger_Fare = titanic_survival.pivot_table(index="Pclass", values="Fare")  # default: mean
print(passenger_Fare)

# Group by port of embarkation and sum Fare and Survived.
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc=np.sum)
print(port_stats)

print(titanic_survival)
# Missing-value handling: fill, or drop by axis.
print(titanic_survival.fillna(0))  # replace every NaN with 0
print(titanic_survival.dropna(axis=1))  # drop columns that contain NaN
print(titanic_survival.dropna(axis=0))  # drop rows that contain NaN
print(titanic_survival.dropna(axis=0, subset=["Age", "Sex"]))  # drop rows where Age or Sex is NaN

# .loc[row_label, column_name]
row_index_83_age = titanic_survival.loc[83, "Age"]
row_index_766_pclass = titanic_survival.loc[766, "Pclass"]  # renamed: was misleadingly *_1000_* while reading label 766
print(titanic_survival.loc[83])
print(row_index_83_age)
print(titanic_survival.loc[766])
print(row_index_766_pclass)

# Sorting keeps the original index labels; reset_index(drop=True) renumbers.
new_titanic_survival = titanic_survival.sort_values("Age", ascending=True)
print(new_titanic_survival)
print(new_titanic_survival.loc[:10])
titanic_reindexed = new_titanic_survival.reset_index(drop=True)  # fixed typo: was 'itanic_reindexed'
print(titanic_reindexed.loc[:10])


# Return the 100th item (index label 99) of a Series.
def hundredth_row(column):
    """Return the element of *column* whose index label is 99."""
    return column.loc[99]


# apply() runs the function once per column.  The result is stored under a
# different name so the function itself is no longer shadowed (the original
# rebound `hundredth_row` to the result of the call).
hundredth_rows = titanic_survival.apply(hundredth_row)
print(hundredth_rows)


# Number of missing values in a column.
def not_null_count(column):
    """Return how many entries of *column* are NaN.

    NOTE: the name is misleading but kept for compatibility -- it counts
    the null values, not the non-null ones.
    """
    # pd.isnull gives a boolean Series; summing it counts the Trues.
    # The original indexed the global titanic_survival DataFrame to get the
    # same count; using the column alone removes that hidden dependency.
    return int(pd.isnull(column).sum())


# isnull() on the whole frame yields a same-shape boolean frame; len() of a
# DataFrame is its row count (891 here).
print(pd.isnull(titanic_survival))
print(len(pd.isnull(titanic_survival)))  # 891
# axis=0 applies the function column by column (axis=1 would be row by row).
null_counts_per_column = titanic_survival.apply(not_null_count, axis=0)
print(null_counts_per_column)


def which_class(row):
    """Map a row's numeric Pclass (1/2/3) to a human-readable label."""
    pclass = row["Pclass"]
    if pd.isnull(pclass):
        return "Unknown"
    # Dispatch table instead of an if/elif chain; an unrecognised code
    # yields None, exactly like the original chain falling through.
    labels = {1: "First Class", 2: "Second Class", 3: "Third Class"}
    return labels.get(pclass)


# Apply row-wise (axis=1): one cabin-class label per passenger.
class_labels = titanic_survival.apply(which_class, axis=1)
print(class_labels)


def is_minor(row):
    """Return True when the passenger is a minor (Age < 18).

    Bug fix: the original tested ``Age > 18`` and therefore returned True
    for adults -- the opposite of what the name promises.  A missing (NaN)
    age compares False, so unknown ages are reported as not minors, just
    as before.
    """
    if row["Age"] < 18:
        return True
    else:
        return False


# One boolean per row: is this passenger a minor?
minor_flags = titanic_survival.apply(is_minor, axis=1)
print(minor_flags)


def generate_age_label(row):
    """Classify a row's Age as 'unknown' (NaN), 'minor' (<18) or 'adult'."""
    age = row["Age"]
    # Guard for the missing-value case first, then a single conditional
    # expression covers the two remaining outcomes.
    if pd.isnull(age):
        return "unknown"
    return "minor" if age < 18 else "adult"


# Label every passenger, then store the labels as a new column.
labels = titanic_survival.apply(generate_age_label, axis=1)
print(labels)

titanic_survival['age_labels'] = labels
# Survival rate for adults vs. minors (pivot_table defaults to the mean).
survival_by_age_group = titanic_survival.pivot_table(index="age_labels", values="Survived")
print(survival_by_age_group)

import pandas as pd

# 'display.height' was removed from pandas (0.15+) and now raises an
# OptionError, so it is no longer set here; the supported display options
# are widened so output is not truncated.
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)

# Series (collection of values)            -> one row or one column
# DataFrame (collection of Series objects)
# Panel (collection of DataFrame objects)

# A single row or column of a DataFrame is a Series.
# Series.values is a numpy ndarray.

# Movie-rating data
fandango = pd.read_csv('fandango_score_comparison.csv')
print(fandango.head())
series_film = fandango['FILM']
print(type(series_film))  # Series
print(series_film[0:5])
series_rt = fandango['RottenTomatoes']
print(series_rt[0:5])

from pandas import Series

film_names = series_film.values
print(type(film_names))  # ndarray
print(film_names)
rt_scores = series_rt.values
print(rt_scores)
# Build a Series that uses the film names as its index.
series_custom = Series(rt_scores, index=film_names)
print(series_custom)
print(series_custom[['Minions (2015)', 'Leviathan (2014)']])

# Integer positions remain available as well (even with a string index).
series_custom = Series(rt_scores, index=film_names)
print(series_custom)
print(series_custom[['Minions (2015)', 'Leviathan (2014)']])
fiveten = series_custom[5:10]  # positional slice: elements 5..9
print(fiveten)

# Manual reindex: fetch the index labels, sort them, then reindex.
index_labels = series_custom.index.tolist()
print(index_labels)
labels_sorted = sorted(index_labels)
print(labels_sorted)
print(series_custom.reindex(labels_sorted))

# The built-in equivalents: sort by index label / sort by value.
by_label = series_custom.sort_index()
by_value = series_custom.sort_values()
print(by_label[:10])
print(by_value[:10])

import numpy as np

# NumPy ufuncs operate element-wise on a Series.
print(series_custom.head())
print(np.add(series_custom.head(), series_custom.head()))  # element-wise sum
print(np.sin(series_custom))                               # sine of every value
print(np.max(series_custom))                               # single scalar, not a Series

# Rebuild the film-indexed Series and show a boolean comparison.
print(fandango.head())
series_custom = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'].values)
print(series_custom.head())
print(series_custom > 50)

# Boolean filtering: two masks combined with &.
above_50 = series_custom[series_custom > 50]
mask_over_50 = series_custom > 50
mask_under_75 = series_custom < 75
print(series_custom[mask_over_50 & mask_under_75])

# Data alignment: two Series with the same index combine label by label.
rt_critics = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'])
rt_users = Series(fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
print((rt_critics + rt_users) / 2)

import pandas as pd

# 'display.height' was removed from pandas (raises OptionError in recent
# versions), so only the still-supported display options are configured.
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)

fandango = pd.read_csv('fandango_score_comparison.csv')
print(fandango.head())
print(type(fandango))

# Use the FILM column as the row index (drop=True removes the original column).
fandango_films = fandango.set_index('FILM', drop=True)
print(fandango_films)
print(fandango_films.index)

# Slice using either bracket notation or loc[]
# NOTE(review): the next two lines are identical -- the first was
# presumably meant to demonstrate bracket-notation slicing instead.
print(fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"])
print(fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"])

# Specific movie
print(fandango_films.loc['Kumiko, The Treasure Hunter (2015)'])

# Selecting list of movies
movies = ['Kumiko, The Treasure Hunter (2015)', 'Do You Believe? (2015)', 'Ant-Man (2015)']
print(fandango_films.loc[movies])

# Selecting several rows returns a DataFrame; selecting a single row
# returns a Series.

import numpy as np

# Keep only the float64 columns.
print(fandango_films.head())
dtypes = fandango_films.dtypes
print(dtypes)
float_cols = dtypes[dtypes.values == 'float64'].index
float_df = fandango_films[float_cols]
print(float_df)

# np.std = standard deviation; apply() runs it once per column here.
print(float_df.apply(lambda col: np.std(col)))

print(float_df.head())
# axis=1: standard deviation across the two columns, computed per row.
two_cols = float_df[['RT_user_norm', 'Metacritic_user_nom']]
print(two_cols.apply(lambda row: np.std(row), axis=1))


  • 1
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
【为什么要学习这门课?】  几乎世界上的每一家公司都在评估自己的数字战略,并寻找利用数字化进行业务转型的机会。大数据分析机器学习是这一战略的核心。几乎每个行业的高管、数字架构师、IT管理员和通信运营人员都需要了解数据处理和人工智能的基础知识。 【课程亮点】  在本课程中,经验丰富的两位讲师提供了有效的经验指导,带领大家探索大数据分析、监督学习、无监督学习和神经网络的基本原理。除了深入研究基本概念外,还举例介绍了不同行业的大数据和机器学习用例,并演示了数据科学家和研究人员在不同领域使用的最常见工具(如Hadoop、TensorFlow、Matlab/Octave、R和Python)。通过本课程的学习能够帮助您熟练掌握大数据分析机器学习的原理及相关应用技能。  【讲师介绍】  Jerome Henry(杰罗姆·亨利)—— Cisco思科首席工程师、思科最佳培训讲师Jerome Henry(杰罗姆·亨利)Jerome Henry(杰罗姆·亨利)目前是思科系统公司企业基础设施和解决方案组的首席工程师。他有超过15年在15个国家用4种语言教授思科技术课程的丰富经验,超过10,000小时的培训经历,也让他被授予思科IT培训奖最佳讲师银质奖章。其实,杰罗姆很早起就为欧洲无线网关制造商 Airespace公司(后被思科收购)提供技术咨询和培训,教授异构网络和无线集成。杰罗姆是一名获得认证的无线网络专家(CWNE No. 45),开发了多门思科课程,撰写了几本无线书籍。同时,他也是IEEE的成员,并于2013年被提升为高级成员,并加入了Wi-Fi联盟工作组,专注于物联网和低功耗研究。 Robert Barton(罗伯特·巴顿)—— 双CCIE & CCDE、思科数据中心专家Robert Barton(罗伯特·巴顿)Robert Barton(罗伯特·巴顿)毕业于哥伦比亚大学工程物理学专业,拥有路由、交换和安全双CCIEs,也是加拿大第一个CCDE。罗伯特目前在思科担任数据中心专家、公共部门系统工程师,主要从事无线和安全架构方面的工作。 【课程收获】 1、了解静态和实时流数据是如何收集、分析和使用的;2、了解机器学习和模仿人类思维的关键工具和方法; 3、如何收集非结构化数据,为分析和可视化做准备; 4、学会比较和对比各种大数据架构; 5、学会将有监督学习、线性回归、数据拟合及强化学习应用到机器学习上,以产生想要的信息结果; 6、将分类技术应用于机器学习,以更好地分析数据; 7、利用无监督学习的好处,收集到你意想不到的数据价值; 8、了解人工神经网络(ANNs)如何进行深度学习,并获得令人叹服的结果;9、应用主成分分析(PCA)改进数据分析的管理;10、了解在真实系统上实现机器学习的关键方法,以及在进行机器学习项目时必须考虑的各种事项;

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值