import pandas as pd
import numpy as np
# Daily step counts for six consecutive days.
step_data = [3620, 7891, 9761, 3907, 4338, 5373]
print(type(step_data))

# Wrap the plain list in a named Series (default integer index).
step_counts = pd.Series(step_data, name='steps')
print(step_counts)
print(type(step_counts))

# Re-label the six rows with consecutive dates starting on 2020-02-19.
step_counts.index = pd.date_range('20200219', periods=6)
print(step_counts)

print(step_counts['2020-02-23'])  # Look up by label, just like a dictionary
# Look up by position, like an array. Use .iloc explicitly: a bare integer
# key on a date-indexed Series is deprecated positional access in pandas.
print(step_counts.iloc[4])
print(step_counts.loc['2020-02'])  # A partial date string selects all of February

print(step_counts.dtypes)  # View the data type
# Convert to float. The np.float alias was removed from NumPy; the builtin
# float maps to the same float64 dtype.
step_counts = step_counts.astype(float)
print(step_counts.dtypes)

# Create invalid data in the second and third entries, then fill the gaps
# with zeros. np.nan is used because the np.NaN alias no longer exists in
# NumPy 2.x.
step_counts.iloc[1:3] = np.nan
step_counts = step_counts.fillna(0.)
print(step_counts.iloc[1:3])
print(step_counts)
# Cycling distance per day. The list has eight entries, two more than
# step_data, and contains missing values (None).
cycling_data = [10.7, 0, None, 2.4, 15.3, 10.9, 0, None]

# zip() packs the corresponding elements of its arguments into tuples and
# stops at the shortest input, so only the first six cycling values are
# paired with a step count. list() materialises the pairs; tuples are like
# lists except that their elements cannot be modified.
joined_data = list(zip(step_data, cycling_data))

# A DataFrame is the two-dimensional, spreadsheet-like pandas structure:
# cells can hold numbers or strings, and both the columns and the rows
# (index) can be named. Without names, both axes are numbered from 0.
activity_df = pd.DataFrame(joined_data)
print(activity_df)

# Rebuild the frame with column names and one date label per row
# (periods = number of dates to generate).
activity_df = pd.DataFrame(
    joined_data,
    columns=['Walking', 'Cycling'],
    index=pd.date_range('20200219', periods=6),
)
print(activity_df)

# loc selects a row of data by its index label.
print(activity_df.loc['2020-2-19'])
# iloc selects a row of data by its integer position.
print(activity_df.iloc[-3])
# (ix used to accept either a label or a position, mixing loc and iloc;
# it has been removed from current pandas releases.)

# Column access: by name, as an attribute (object-oriented approach),
# and by integer position.
print(activity_df['Walking'])
print(activity_df.Walking)
print(activity_df.iloc[:, 0])  # First column
print(activity_df.iloc[:, 1])  # Second column
filepath = 'data/Iris_Data.csv'  # File location of the data file
data = pd.read_csv(filepath)  # Import the data
print(data)  # Print the loaded frame

# Create a new column that is the product of both sepal measurements.
data['sepal_area'] = data.sepal_length * data.sepal_width
print(data.iloc[:5, -3:])  # Print the first five rows of the last three columns

# apply() calls the lambda once for each value of the species column;
# here it strips the 'Iris-' prefix to build a short label.
data['abbrev'] = data.species.apply(lambda x: x.replace('Iris-', ''))
print(data.iloc[:5, -3:])  # Note that there are other ways to accomplish the above

# Concatenate the first two and last two rows.
small_data = pd.concat([data.iloc[:2], data.iloc[-2:]])
print(small_data.iloc[:, -3:])  # See the 'join' method for SQL-style joining of dataframes

# size() on a grouped dataframe gives the row count per group; to get the
# counts for a single Series, use the value_counts method instead.
group_sizes = data.groupby('species').size()
print(group_sizes)

# Mean calculated on a dataframe. The frame now holds the text columns
# 'species' and 'abbrev', which cannot be averaged, so the reduction is
# restricted to numeric columns (pandas 2.x raises instead of skipping them).
print(data.mean(numeric_only=True))
print(data.petal_length.median())  # Median calculated on a Series
print(data.petal_length.mode())  # Mode calculated on a Series
# Standard deviation, variance, and standard error of the mean.
print(data.petal_length.std(),
      data.petal_length.var(),
      data.petal_length.sem())
# Quantiles as well (0 is the minimum); numeric columns only, for the same
# reason as the mean above.
print(data.quantile(0, numeric_only=True))
# describe() summarises the data: count of non-null values, mean, standard
# deviation, min, max, and the 25% / 50% / 75% quantiles.
print(data.describe())

# Sample 5 rows without replacement; the fixed seed makes the draw repeatable.
sample = data.sample(n=5, replace=False, random_state=42)
print(sample.iloc[:, -3:])
import matplotlib.pyplot as plt
# Scatter plot of width against length for sepals and petals. Each series
# is drawn once and labelled so the legend can tell them apart.
plt.figure()
plt.plot(data.sepal_length, data.sepal_width, ls='', marker='o', label='sepal')
plt.plot(data.petal_length, data.petal_width, ls='', marker='o', label='petal')
plt.legend()  # Without this call the label= values are never displayed
plt.show()

# Histogram of sepal length split into 25 bins.
plt.hist(data.sepal_length, bins=25)
plt.show()
# Horizontal bar chart of the first ten sepal widths.
fig, ax = plt.subplots()
bar_positions = np.arange(10)
# Pin the alignment so the bars are centred on their positions regardless
# of the matplotlib default.
ax.barh(bar_positions, data.sepal_width.iloc[:10], align='center')
# Set position of ticks and tick labels: one tick on the centre of each
# bar, numbered 1..10.
ax.set_yticks(bar_positions)
ax.set_yticklabels(np.arange(1, 11))
ax.set(xlabel='xlabel', ylabel='ylabel', title='Title')
plt.show()
# Plot the per-species mean of the numeric columns. numeric_only=True keeps
# text columns such as 'abbrev' out of the aggregation; pandas 2.x raises a
# TypeError when asked to average them.
(data.groupby('species')
     .mean(numeric_only=True)
     .plot(color=['red', 'blue', 'black', 'green'], fontsize=10.0, figsize=(4, 4)))
plt.show()
import seaborn as sns
# Joint distribution of sepal length and width. The figure-size keyword is
# 'height' (it was called 'size' before seaborn 0.9).
sns.jointplot(x='sepal_length', y='sepal_width', data=data, height=4)
plt.show()

# Pairwise relationships between the columns, coloured by species;
# height sets the size of each facet.
sns.pairplot(data, hue='species', height=3)
plt.show()