pandas 学习笔记

pandas 是一个强大的时间序列数据处理工具包,由两个比较重要的数据结构,series和dataframe,可以简单的将其抽象为一个表格。

series可以表达一行数据,理解为一维数组。s = pandas.Series([4,2,5,0,6,3])

DataFrame表示二维数组, df = pandas.DataFrame(numpy.random.randn(6,4), columns=list('ABCD'))


%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Series 对象可以理解为一维数组
s = pd.Series([4, 2, 5, 0, 6, 3])

# DataFrame 是二维数组对象
df = pd.DataFrame(np.random.randn(6,4), columns=list('ABCD'))

#访问数据
df.iloc[0]
df.A
print("Row data type: {}".format(type(df.iloc[0])))
print("Column data type: {}".format(type(df.A)))
#查看维度信息
df.shape
#访问前n,后n行数据
df.head(3)
df.tail(2)
#行索引,列索引
df.columns
df.index
#查看统计信息,如平均值,标准差,最小值,最大值

df.describe()


#数据排序
df.sort_index(axis=1, ascending=False)
df.sort_values(by='B')

#数据访问
df[3:5]
df[['A', 'B', 'D']]
df.loc[3, 'A']
df.iloc[3, 0]
df.iloc[2:5, 0:2]
df[df.C > 0]
a = df.loc[0].tolist()      #选取第一行数据并将第一行数据转变为list
df.loc[0]                   #选取第一行数据
df.loc[0:2]                 #选取的是前闭后闭的区间,与iloc是由区别的
df.iloc[0:2]                #选取的是前闭后开的区间,与iloc是由区别的
df.loc[0:2,0:2]             #取中间某一个区域的值,前闭后闭的区间
df.iloc[0:2,0:2]            #取中间某一个区域的值,前闭后开的区间
df.loc[0,1]                 #选取的是第一行,第二列的元素
df.iloc[0,1]                #选取的是第一行,第二列的元素
#对数据进行修改
df["TAG"] = ["cat", "dog", "cat", "cat", "cat", "dog"]
df.groupby('TAG').sum() #根据TAG列做分组统计




读写csv文件

pandas.read_csv('data.csv')

pandas.to_csv('data.csv')

时间序列及可视化

n_items = 366
ts = pd.Series(np.random.randn(n_items), index=pd.date_range('20000101', periods=n_items))
print(ts.shape)
ts.head(5)

ts.resample("1m").sum()

plt.figure(figsize=(10, 6), dpi=144)
cs = ts.cumsum()
cs.plot()

plt.figure(figsize=(10, 6), dpi=144)
ts.resample("1m").sum().plot.bar()

pandas统计数据,处理缺失值

import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()

#The Pandas library uses NaN, which stands for "not a number", to indicate a missing value.
#we can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values
age = titanic_survival["Age"]
#print(age.loc[0:10])
age_is_null = pd.isnull(age)
#print age_is_null
age_null_true = age[age_is_null]
#print age_null_true
age_null_count = len(age_null_true)
print(age_null_count)

#The result of this is that mean_age would be nan. This is because any calculations we do with a null value also result in a null value
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print mean_age

#we have to filter out the missing values before we calculate the mean.
good_ages = titanic_survival["Age"][age_is_null == False]
#print good_ages
correct_mean_age = sum(good_ages) / len(good_ages)
print correct_mean_age

# missing data is so common that many pandas methods automatically filter for it
correct_mean_age = titanic_survival["Age"].mean()
print correct_mean_age

#mean fare for each class
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
print fares_by_class

#index tells the method which column to group by
#values is the column that we want to apply the calculation to
#aggfunc specifies the calculation we want to perform
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean)
print passenger_survival

passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)

port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum)
print(port_stats)

#specifying axis=1 or axis='columns' will drop any columns that have null values
drop_na_columns = titanic_survival.dropna(axis=1)
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])
#print new_titanic_survival

row_index_83_age = titanic_survival.loc[83,"Age"]
row_index_1000_pclass = titanic_survival.loc[766,"Pclass"]
print row_index_83_age
print row_index_1000_pclass

new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
print new_titanic_survival[0:10]
itanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed.iloc[0:10])

# This function returns the hundredth item from a series
def hundredth_row(column):
    # Extract the hundredth item
    hundredth_item = column.iloc[99]
    return hundredth_item

# Return the hundredth item from each column
hundredth_row = titanic_survival.apply(hundredth_row)
print hundredth_row

def not_null_count(column):
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)

column_null_count = titanic_survival.apply(not_null_count)
print column_null_count

#By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.
def which_class(row):
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    elif pclass == 1:
        return "First Class"
    elif pclass == 2:
        return "Second Class"
    elif pclass == 3:
        return "Third Class"

classes = titanic_survival.apply(which_class, axis=1)
print classes

def is_minor(row):
    if row["Age"] < 18:
        return True
    else:
        return False

minors = titanic_survival.apply(is_minor, axis=1)
#print minors

def generate_age_label(row):
    age = row["Age"]
    if pd.isnull(age):
        return "unknown"
    elif age < 18:
        return "minor"
    else:
        return "adult"

age_labels = titanic_survival.apply(generate_age_label, axis=1)
print age_labels

titanic_survival['age_labels'] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print age_group_survival

自定义函数apply和统计pivot_table功能比较强大。

另外一个例子


# coding: utf-8

# In[1]:


#Series (collection of values)
#DataFrame (collection of Series objects)
#Panel (collection of DataFrame objects)


# In[2]:


#A Series object can hold many data types, including
#float - for representing float values
#int - for representing integer values
#bool - for representing Boolean values
#datetime64[ns] - for representing date & time, without time-zone
#datetime64[ns, tz] - for representing date & time, with time-zone
#timedelta[ns] - for representing differences in dates & times (seconds, minutes, etc.)
#category - for representing categorical values
#object - for representing String values

#FILM - film name
#RottenTomatoes - Rotten Tomatoes critics average score
#RottenTomatoes_User - Rotten Tomatoes user average score
#RT_norm - Rotten Tomatoes critics average score (normalized to a 0 to 5 point system)
#RT_user_norm - Rotten Tomatoes user average score (normalized to a 0 to 5 point system)
#Metacritic - Metacritic critics average score
#Metacritic_User - Metacritic user average score


# In[27]:


import pandas as pd
fandango = pd.read_csv('fandango_score_comparison.csv')
series_film = fandango['FILM']
print(series_film[0:5])
series_rt = fandango['RottenTomatoes']
print (series_rt[0:5])


# In[30]:


# Import the Series object from pandas
from pandas import Series

film_names = series_film.values
#print type(film_names)
#print film_names
rt_scores = series_rt.values
#print rt_scores
series_custom = Series(rt_scores , index=film_names)
series_custom[['Minions (2015)', 'Leviathan (2014)']]


# In[31]:


# int index is also aviable
series_custom = Series(rt_scores , index=film_names)
series_custom[['Minions (2015)', 'Leviathan (2014)']]
fiveten = series_custom[5:10]
print(fiveten)


# In[33]:


original_index = series_custom.index.tolist()
#print original_index
sorted_index = sorted(original_index)
sorted_by_index = series_custom.reindex(sorted_index)
#print sorted_by_index


# In[35]:


sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()
#print(sc2[0:10])
print(sc3[0:10])


# In[36]:


#The values in a Series object are treated as an ndarray, the core data type in NumPy
import numpy as np
# Add each value with each other
print np.add(series_custom, series_custom)
# Apply sine function to each value
np.sin(series_custom)
# Return the highest value (will return a single value not a Series)
np.max(series_custom)


# In[37]:


#will actually return a Series object with a boolean value for each film
series_custom > 50
series_greater_than_50 = series_custom[series_custom > 50]

criteria_one = series_custom > 50
criteria_two = series_custom < 75
both_criteria = series_custom[criteria_one & criteria_two]
print both_criteria


# In[38]:


#data alignment same index
rt_critics = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'])
rt_users = Series(fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
rt_mean = (rt_critics + rt_users)/2

print(rt_mean)



# coding: utf-8

# In[2]:


import pandas as pd


# In[10]:


#will return a new DataFrame that is indexed by the values in the specified column 
#and will drop that column from the DataFrame
#without the FILM column dropped 
fandango = pd.read_csv('fandango_score_comparison.csv')
print type(fandango)
fandango_films = fandango.set_index('FILM', drop=False)
#print(fandango_films.index)


# In[4]:


# Slice using either bracket notation or loc[]
fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]
fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]

# Specific movie
fandango_films.loc['Kumiko, The Treasure Hunter (2015)']

# Selecting list of movies
movies = ['Kumiko, The Treasure Hunter (2015)', 'Do You Believe? (2015)', 'Ant-Man (2015)']
fandango_films.loc[movies]

#When selecting multiple rows, a DataFrame is returned, 
#but when selecting an individual row, a Series object is returned instead


# In[15]:


#The apply() method in Pandas allows us to specify Python logic
#The apply() method requires you to pass in a vectorized operation 
#that can be applied over each Series object.
import numpy as np

# returns the data types as a Series
types = fandango_films.dtypes
#print types
# filter data types to just floats, index attributes returns just column names
float_columns = types[types.values == 'float64'].index
# use bracket notation to filter columns to just float columns
float_df = fandango_films[float_columns]
#print float_df
# `x` is a Series object representing a column
deviations = float_df.apply(lambda x: np.std(x))

print(deviations)


# In[16]:


rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
rt_mt_user.apply(lambda x: np.std(x), axis=1)

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值