#coding=utf-8
import numpy as np
import pandas as pd
df1 = pd.DataFrame(pd.read_csv('./house_data/all/test.csv',header=1)) # 读取.csv文件
print df1.shape
df2 = pd.DataFrame({"id":[1002,1001,1003,1004,1005,1006],
"date":pd.date_range('20130102', periods=6),
"city":['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
"age":[23,44,54,32,34,32],
"category":['100-A','100-B','110-A','110-C','210-A','130-F'],
"price":[1200,np.nan,2133,5433,np.nan,4432]},
columns =['id','date','city','category','age','price'])
df3=pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006,1007,1008],
"gender":['male','female','male','female','male','female','male','female'],
"pay":['Y','N','Y','Y','N','Y','N','Y'],
"m-point":[10,12,20,40,40,40,30,20]})
print df3.info
'''
print df2.shape # 维度查看
print df2.info # 数据表基本信息(维度、列名称、数据格式、所占空间等)
print df2.dtypes # 查看每一列的数据类型
print df2["city"].dtype # object
print df2.isnull()
print df2['price'].unique() # 即看某一列的值有哪些,避免重复
print df2.values # 查看数据表的值
print '=============='
print df2.head(3) #默认前10行数据
print df2.tail(3) #默认后10 行数据
'''
# 数据表清洗
df3 = df2.fillna(value=0) # 用数字0填充空值
print df3.info
df2['price'] = df2['price'].fillna(df2['price'].mean()) # 使用列prince的均值对NA进行填充 ,注意等号左边的
print df2.info
df2['city']=df2['city'].map(str.strip) # 去除空格
print df2
df2['city']=df2['city'].str.lower() # 大小写转换
print df2.info
print df2.dtypes # 查看每一列的数据类型
# df2['price'] = df2['price'].astype(int) # 更改列的数据格式
# print df2.dtypes # 查看每一列的数据类型
df2=df2.rename(columns={'category': 'category-size'}) # 对列刚改名字,注意接收的是df2对象
print df2.info
# df2['city'] = df2['city'].drop_duplicates() # 删除后出现的重复值
# print df2.info
df2['city'] = df2['city'].drop_duplicates(keep='last') # 删除先出现的重复值:
print df2.info
df2['city'] = df2['city'].replace('sh', 'shanghai') # 数据替换
print df2.info
print '======='
# Merge demos, left disabled inside a triple-quoted string.
# FIX: the opening delimiter was '''' (four quotes) -- a typo that made the string
# begin with a stray quote character; three quotes is the intended delimiter.
'''
df_inner=pd.merge(df2,df3,how='inner') # 匹配合并,交集
print 'inner:'
print df_inner.info
df_outer=pd.merge(df2,df3,how='outer') #并集
print 'outer:'
print df_outer.info
print '========='
df_left=pd.merge(df2,df3,how='left')
print 'left:'
print df_left
print '========='
df_rigth=pd.merge(df2,df3,how='right')
print 'right:'
print df_rigth
'''
df2 = df2.set_index('id') # 设置id为索引
df2 = df2.sort_index() # 按照索引值排序
print df2
df2 = df2.sort_values(by=['age'])
print df2
df2['group'] = np.where(df2['price'] > 3000,'high','low') # 增加了group列
print df2
# df2['price'] = np.where(df2['price'] > 3000,'high','low') # 增加了group列
# print df2
print df2.dtypes
df2.loc[(df2['city'] == 'beijing') & (df2['price'] >= 4000), 'sign']=1
print df2
print '====='
print df2.loc[df2['city']== 'beijing'] # loc函数按标签值进行提取
print '===='
print df2.iloc[0:2] # 按索引提取区域行数值
df2 = df2.reset_index() # 重设索引
df2=df2.set_index('date') # 设置日期为索引
print '****'
print df2[:'2013-01-03']
print df2.iloc[:3,:2] # 冒号前后的数字不再是索引的标签名称,而是数据所在的位置,从0开始,前三行,前两列。
print df2.iloc[[0,2,5],[2]] # 提取第0、2、5行,4、5列 !!!! 若把日期作为索引,则一定会显示日期
print df2
print df2.ix[:'2013-01-03',:4] # 2013-01-03号之前,前四列数据 使用ix按索引标签和位置混合提取数据
print df2['city'].isin(['beijing']) # 整列数据输出为False或True ,判断city列的值是否为北京
print df2['city'].isin(['beijing','shanghai']) # 输出True或者False ,判断city列里是否包含beijing和shanghai
print df2.loc[df2['city'].isin(['beijing','shanghai'])] # 判断city列里是否包含beijing和shanghai,并帅选出来
print df2
print pd.DataFrame(df2['city'].str[:3]) # 提取city的前三个字符,并生成数据表,注意只有city列
# 使用与、或、非三个条件配合大于、小于、等于对数据进行筛选,并进行计数和求和。
# 使用 与 进行筛选
print df2.loc[(df2['age'] > 25) & (df2['city'] == 'beijing'), ['id','city','age','category-size','price']]
#使用 或 进行筛选
print '或:',df2.loc[(df2['age'] > 25) | (df2['city'] == 'beijing'), ['id','city','age','category-size','price']]
#使用 非 进行筛选,并按id进行排序,city.count()并进行计数,返回4
print df2.loc[(df2['city'] != 'beijing'), ['id','city','age','category','gender']].sort_values(['id']).city.count()
# 使用query函数进行筛选
print df2.query('city == ["beijing","shanghai"]')
# 对筛选后的结果按prince进行求和
print df2.query('city == ["beijing","shanghai"]').price.sum() # 11031.0
print '*****'
print df2
print df2.groupby('city').count() # 按城市对所有的列进行计数汇总
print '*****'
print df2.groupby('city').id.count() # 按城市对id列进行计数汇总
print df2.groupby(['city','age']).id.count() # 对两个字段进行汇总计数,必须 city age 相同,才能累加
print df2.groupby('city')['price'].agg([len,np.sum, np.mean]) # 对city字段进行汇总,并分别计算prince的合计和均值
'''
len sum mean
city
beijing 1.0 4432.0 4432.0
guangzhou 1.0 2133.0 2133.0
shanghai 2.0 6599.0 3299.5
shenzhen 1.0 5433.0 5433.0
'''
df = df2.sample(n=3) # draw 3 random rows (uses the global RNG state, so output varies per run)
print df
print '===='
weights = [0.8, 0, 0, 0, 0.1, 0.1] # per-row sampling weights; row 0 dominates
print df2.sample(n=2, weights=weights) # weighted sampling without fixed seed
print df2.describe().round(2).T # descriptive statistics, rounded to 2 decimals and transposed
'''
count mean std min 25% 50% 75% max
id 6.0 1003.5 1.87 1001.0 1002.25 1003.5 1004.75 1006.0
age 6.0 36.5 10.88 23.0 32.00 33.0 41.50 54.0
price 6.0 3299.5 1523.35 1200.0 2424.62 3299.5 4148.88 5433.0
sign 1.0 1.0 NaN 1.0 1.00 1.0 1.00 1.0
'''
print df2['price'].std() # standard deviation of one column  1523.35163373
print df2['price'].cov(df2['age']) # covariance of two columns (numeric only; strings unsupported)
# NOTE(review): whole-frame cov()/corr() on a table with string columns relies on the
# old implicit numeric_only behaviour -- pandas >= 2.0 raises here; confirm the
# targeted pandas version.
print df2.cov() # covariance between every pair of numeric columns
'''
id age price sign
id 3.5 -4.9 1526.1 NaN
age -4.9 118.3 -1353.5 NaN
price 1526.1 -1353.5 2320600.2 NaN
sign NaN NaN NaN NaN
'''
# correlation between two columns
print df2['price'].corr(df2['age']) # coefficient in [-1, 1]: ~1 positive, ~-1 negative, 0 uncorrelated  -0.0816894035549328
# correlation matrix over all columns
print df2.corr()
df2.to_csv('./excel_to_python.csv') # write out as CSV
df2.to_excel('./excel_to_python.xlsx', sheet_name='bluewhale_cc') # write out as Excel