PYTHON之pandas包练习
#* coding:utf-8 *
‘’’
@author: xiaolin
感谢这位作者!!
原文:https://blog.csdn.net/chekongfu/article/details/82694835
‘’’
import numpy as np
import pandas as pd
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100,默认为50
pd.set_option('max_colwidth',100)
df = pd.DataFrame(pd.read_csv('tags.csv',header=1))
df = pd.DataFrame(
{
"id":[1001,1002,1003,1004,1005,1006],
"date":pd.date_range('20130102', periods=6),
"city":['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
"age":[23,44,54,32,34,45],
"category":['100-A','100-B','110-A','110-C','210-A','130-F'],
"price":[1200.9,np.nan,2133,5433.2,np.nan,4432]
},
columns =['id','date','city','category','age','price'])
df1 = pd.DataFrame({
"id":[1001,1002,1003,1004,1005,1006,1007,108],
"gender":['male','female','male','female','male','female','male','female'],
"pay":['Y','N','Y','Y','N','Y','N','Y',],
"m-point":[10,12,20,40,40,40,30,20]})
if __name__ == '__main__':
print('********************************')
print('一、信息表信息查看')
print('********************************')
print('查看前2行')
print(df.head(2))
print('===============================')
print('查看后2行')
print(df.tail(2))
print('===============================')
print('维度查看')
print(df.shape)
print('===============================')
print('数据表基本信息(维度、列名称、数据格式、所占空间等)')
print(df.info())
print('===============================')
print('每一列数据的格式')
print(df.dtypes)
print('===============================')
print('category列格式')
print(df['category'].dtype)
print('===============================')
print('空值')
print(df.isnull())
print('===============================')
print('查看price列空值')
print(df['price'].isnull())
print('===============================')
print('查看age列的唯一值')
print(df['age'].unique())
print('===============================')
print('查看数据表的值')
print(df.values)
print('===============================')
print('查看列名称')
print(df.columns)
print('===============================')
print('********************************')
print('二、数据表清洗 ')
print('********************************')
print('使用列prince的均值对NA进行填充')
print(df)
print(df.fillna(value=100))
#fillna()的参数如下,其中value可以取df.median、df.mean
#fillna(self, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs)
print( df['price'].fillna(df['price'].mean()) )
print('===============================')
print('清除city字段的字符前后空格,字符的大小写转换')
df['city']=df['city'].map(str.strip)
#大小写转换(小写lower)
df['city'] = df['city'].map(str.upper)
print( df )
print('===============================')
print('更改数据格式')
print(df['price'].fillna(value=100).astype('int'))
print('===============================')
print('更改列名称')
print(df.rename(columns={'category': 'category-size'}))
print('===============================')
print('删除后出现的重复值')
print(df['city'].drop_duplicates())
print('===============================')
print('数据替换')
df['city'] = df['city'].replace('SH', 'shanghai')
print(df)
print('===============================')
print('********************************')
print('三、数据预处理 ')
print('********************************')
print('内连接')
df_inner=pd.merge(df,df1,how='inner') # 匹配合并,交集
print( df_inner )
print('===============================')
print('左连接')
df_left=pd.merge(df,df1,how='left') # 左关联
print( df_left )
print('===============================')
print('右连接')
df_right=pd.merge(df,df1,how='right') # 右关联
print( df_right )
print('===============================')
print('外连接')
df_outer=pd.merge(df,df1,how='outer') # 外连接
print( df_outer )
print('===============================')
print('设置索引列')
df_outer.set_index('id')
df_outer = df_outer.sort_values('age')
#df_outer = df_outer.sort_index()
print(df_outer)
print('===============================')
print('如果price列的值>3000,group列显示high,否则显示low')
df_outer['group'] = np.where(df_outer['price'] > 3000,'high','low')
print(df_outer)
print('===============================')
print('对复合多个条件的数据进行分组标记')
df_inner.loc[(df_inner['city'].map(str.lower) == 'beijing') & (df_inner['price'] >= 4000), 'sign'] = 1
print(df_inner)
print('===============================')
print('对category字段的值依次进行分列,并创建数据表,索引值为df_inner的索引列,列名称为category和size')
split = pd.DataFrame(
(x.split('-') for x in df_inner['category']),
index=df_inner.index,
columns=['category','size'])
print("split")
print(split)
print('===============================')
print('将完成分裂后的数据表和原df_inner数据表进行匹配')
df_inner=pd.merge(df_inner,split,right_index=True, left_index=True)
print( df_inner )
print('===============================')
print('********************************')
print('四、数据提取')
print('********************************')
print('按索引提取单行的数值')
print( df_inner.loc[0] )
print('===============================')
print('按索引提取区域行数值')
print( df_inner.iloc[0:2] )
print('===============================')
print('重设索引')
df_inner.reset_index()
print('===============================')
print('设置日期为索引')
df_inner=df_inner.set_index('date')
print('===============================')
print('提取4日之前的所有数据')
print( df_inner[:'2013-01-04'] )
print('===============================')
print('使用iloc按位置区域提取数据')
print(df_inner.iloc[:2,:2]) #冒号前后的数字不再是索引的标签名称,而是数据所在的位置,从0开始,前三行,前两列。
print('===============================')
print('适应iloc按位置单独提起数据')
print( df_inner.iloc[[0,2,5],[4,5]] )#提取第0、2、5行,4、5列
print('===============================')
print('使用ix按索引标签和位置混合提取数据')
print(df_inner.ix[:'2013-01-03',:4] )#2013-01-03号之前,前四列数据
print('===============================')
print('判断city列的值是否为北京')
print(df_inner['city'].isin(['BEIJING']))
print('===============================')
print('判断city列里是否包含beijing和shanghai,然后将符合条件的数据提取出来')
print(df_inner.loc[df_inner['city'].isin(['BEIJING','shanghai'])])
print('===============================')
print('提取前三个字符,并生成数据表')
test = pd.DataFrame(df['category'].str[:3])
print(test)
print('===============================')
print('********************************')
print('五、数据筛选')
print('********************************')
print('找出年龄大于25并且在beijing的记录')
print(df.loc[
(df['age'] > 25) & (df['city'].str.lower() == 'beijing'),
['id','city','age','category','price']])
print('===============================')
print('找出年龄大于35或者在beijing的记录')
print('===============================')
print(df.loc[
(df['age'] > 35) | (df['city'].str.lower() == 'beijing'),
['id','city','age','category','price']])
print('===============================')
print('找出不在beijing的记录,对筛选后的数据按city列进行计数')
print('===============================')
test = df.loc[
(df['city'].str.lower() != 'beijing'),
['id','city','age','category','price']]
print(test)
print('总数:'+str(test.sort_values(['age'],ascending = False).city.count()))
print('===============================')
print('使用query函数进行筛选')
print('===============================')
test = df.query('city == ["BEIJING", "shanghai"]')
#注意这里的单引号与双引号,如果同为双引号或单引号会报错
print(test)
print('===============================')
print('对筛选后的结果按prince进行求和')
print('===============================')
print( df.query('city == ["BEIJING", "shanghai"]').price.sum() )
print('===============================')
print('********************************')
print('六、数据汇总')
print('********************************')
print('对所有的列进行计数汇总')
print('===============================')
print( df.groupby('city').count() )
print('===============================')
print('按城市对id字段进行计数')
print('===============================')
print( df.groupby('city')['id'].count() )
print('===============================')
print('对两个字段进行汇总计数')
print('===============================')
print( df.groupby(['city','age'])['id'].count() )
print('===============================')
print('对city字段进行汇总,并分别计算prince的合计和均值')
print('===============================')
print( df.groupby('city')['price'].agg([len,np.sum, np.mean]) )
print('===============================')
print('********************************')
print('七、数据统计')
print('********************************')
print('简单的数据采样 ')
print('===============================')
print( df_inner.sample(n=3) )
print('===============================')
print('手动设置采样权重 ')
print('===============================')
weights = [0, 0.1, 0.7, 0, 0.2, 0.1]
print( df.sample(n=2, weights=weights) )
print('===============================')
print('采样后不放回')
print('===============================')
print( df.sample(n=6, replace=False) )
print('===============================')
print('采样后放回')
print('===============================')
print( df.sample(n=6, replace=True) )
print('===============================')
print('数据表描述性统计')
print('===============================')
#round函数设置显示小数位,T表示转置
print( df.describe().round(2).T )
print('===============================')
print('计算列的标准差')
print('===============================')
print( df['price'].std() )
print('===============================')
print('计算两个字段间的协方差')
print('===============================')
print( df['price'].cov(df_inner['age']) )
print('===============================')
print('数据表中所有字段间的协方差')
print('===============================')
print( df.cov() )
print('===============================')
print('两个字段的相关性分析')
print('===============================')
##相关系数在-1到1之间,接近1为正相关,接近-1为负相关,0为不相关
print( df['price'].corr(df_inner['age']) )
print('===============================')
print('数据表的相关性分析')
print('===============================')
print( df.corr() )
print('===============================')
print('********************************')
print('八、数据输出')
print('********************************')
print('数据导出:写入Excel')
#此处不知道什么原因,将内容写入到.xlsx文件中一直存在问题
df.to_excel('excel_to_python.xls',sheet_name='test1')
print('写入成功')
print('===============================')
print('数据导出:写入到CSV')
df.to_csv('excel_to_python.csv')
print('写入成功')
print('===============================')
显示结果
********************************
一、信息表信息查看
********************************
查看前2行
df.head(2)
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.9
1 1002 2013-01-03 SH 100-B 44 NaN
===============================
查看后2行
df.tail(2)
id date city category age price
4 1005 2013-01-06 shanghai 210-A 34 NaN
5 1006 2013-01-07 BEIJING 130-F 45 4432.0
===============================
维度查看
(6, 6)
===============================
数据表基本信息(维度、列名称、数据格式、所占空间等)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
id 6 non-null int64
date 6 non-null datetime64[ns]
city 6 non-null object
category 6 non-null object
age 6 non-null int64
price 4 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 276.0+ bytes
None
===============================
每一列数据的格式
id int64
date datetime64[ns]
city object
category object
age int64
price float64
dtype: object
===============================
category列格式
object
===============================
空值
id date city category age price
0 False False False False False False
1 False False False False False True
2 False False False False False False
3 False False False False False False
4 False False False False False True
5 False False False False False False
===============================
查看price列空值
0 False
1 True
2 False
3 False
4 True
5 False
Name: price, dtype: bool
===============================
查看age列的唯一值
[23 44 54 32 34 45]
===============================
查看数据表的值
[[1001L Timestamp('2013-01-02 00:00:00') 'Beijing ' '100-A' 23L 1200.9]
[1002L Timestamp('2013-01-03 00:00:00') 'SH' '100-B' 44L nan]
[1003L Timestamp('2013-01-04 00:00:00') ' guangzhou ' '110-A' 54L 2133.0]
[1004L Timestamp('2013-01-05 00:00:00') 'Shenzhen' '110-C' 32L 5433.2]
[1005L Timestamp('2013-01-06 00:00:00') 'shanghai' '210-A' 34L nan]
[1006L Timestamp('2013-01-07 00:00:00') 'BEIJING ' '130-F' 45L 4432.0]]
===============================
查看列名称
Index([u'id', u'date', u'city', u'category', u'age', u'price'], dtype='object')
===============================
********************************
二、数据表清洗
********************************
使用列prince的均值对NA进行填充
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.9
1 1002 2013-01-03 SH 100-B 44 NaN
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.2
4 1005 2013-01-06 shanghai 210-A 34 NaN
5 1006 2013-01-07 BEIJING 130-F 45 4432.0
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.9
1 1002 2013-01-03 SH 100-B 44 100.0
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.2
4 1005 2013-01-06 shanghai 210-A 34 100.0
5 1006 2013-01-07 BEIJING 130-F 45 4432.0
0 1200.900
1 3299.775
2 2133.000
3 5433.200
4 3299.775
5 4432.000
Name: price, dtype: float64
===============================
清除city字段的字符前后空格,字符的大小写转换
id date city category age price
0 1001 2013-01-02 BEIJING 100-A 23 1200.9
1 1002 2013-01-03 SH 100-B 44 NaN
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.2
4 1005 2013-01-06 SHANGHAI 210-A 34 NaN
5 1006 2013-01-07 BEIJING 130-F 45 4432.0
===============================
更改数据格式
0 1200
1 100
2 2133
3 5433
4 100
5 4432
Name: price, dtype: int32
===============================
更改列名称
id date city category-size age price
0 1001 2013-01-02 BEIJING 100-A 23 1200.9
1 1002 2013-01-03 SH 100-B 44 NaN
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.2
4 1005 2013-01-06 SHANGHAI 210-A 34 NaN
5 1006 2013-01-07 BEIJING 130-F 45 4432.0
===============================
删除后出现的重复值
0 BEIJING
1 SH
2 GUANGZHOU
3 SHENZHEN
4 SHANGHAI
Name: city, dtype: object
===============================
数据替换
id date city category age price
0 1001 2013-01-02 BEIJING 100-A 23 1200.9
1 1002 2013-01-03 shanghai 100-B 44 NaN
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.2
4 1005 2013-01-06 SHANGHAI 210-A 34 NaN
5 1006 2013-01-07 BEIJING 130-F 45 4432.0
===============================
********************************
三、数据预处理
********************************
内连接
id date city category age price gender m-point pay
0 1001 2013-01-02 BEIJING 100-A 23 1200.9 male 10 Y
1 1002 2013-01-03 shanghai 100-B 44 NaN female 12 N
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 male 20 Y
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.2 female 40 Y
4 1005 2013-01-06 SHANGHAI 210-A 34 NaN male 40 N
5 1006 2013-01-07 BEIJING 130-F 45 4432.0 female 40 Y
===============================
左连接
id date city category age price gender m-point pay
0 1001 2013-01-02 BEIJING 100-A 23 1200.9 male 10 Y
1 1002 2013-01-03 shanghai 100-B 44 NaN female 12 N
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0 male 20 Y
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.2 female 40 Y
4 1005 2013-01-06 SHANGHAI 210-A 34 NaN male 40 N
5 1006 2013-01-07 BEIJING 130-F 45 4432.0 female 40 Y
===============================
右连接
id date city category age price gender m-point pay
0 1001 2013-01-02 BEIJING 100-A 23.0 1200.9 male 10 Y
1 1002 2013-01-03 shanghai 100-B 44.0 NaN female 12 N
2 1003 2013-01-04 GUANGZHOU 110-A 54.0 2133.0 male 20 Y
3 1004 2013-01-05 SHENZHEN 110-C 32.0 5433.2 female 40 Y
4 1005 2013-01-06 SHANGHAI 210-A 34.0 NaN male 40 N
5 1006 2013-01-07 BEIJING 130-F 45.0 4432.0 female 40 Y
6 1007 NaT NaN NaN NaN NaN male 30 N
7 108 NaT NaN NaN NaN NaN female 20 Y
===============================
外连接
id date city category age price gender m-point pay
0 1001 2013-01-02 BEIJING 100-A 23.0 1200.9 male 10 Y
1 1002 2013-01-03 shanghai 100-B 44.0 NaN female 12 N
2 1003 2013-01-04 GUANGZHOU 110-A 54.0 2133.0 male 20 Y
3 1004 2013-01-05 SHENZHEN 110-C 32.0 5433.2 female 40 Y
4 1005 2013-01-06 SHANGHAI 210-A 34.0 NaN male 40 N
5 1006 2013-01-07 BEIJING 130-F 45.0 4432.0 female 40 Y
6 1007 NaT NaN NaN NaN NaN male 30 N
7 108 NaT NaN NaN NaN NaN female 20 Y
===============================
设置索引列
id date city category age price gender m-point pay
0 1001 2013-01-02 BEIJING 100-A 23.0 1200.9 male 10 Y
3 1004 2013-01-05 SHENZHEN 110-C 32.0 5433.2 female 40 Y
4 1005 2013-01-06 SHANGHAI 210-A 34.0 NaN male 40 N
1 1002 2013-01-03 shanghai 100-B 44.0 NaN female 12 N
5 1006 2013-01-07 BEIJING 130-F 45.0 4432.0 female 40 Y
2 1003 2013-01-04 GUANGZHOU 110-A 54.0 2133.0 male 20 Y
6 1007 NaT NaN NaN NaN NaN male 30 N
7 108 NaT NaN NaN NaN NaN female 20 Y
===============================
如果price列的值>3000,group列显示high,否则显示low
id date city category ... gender m-point pay group
0 1001 2013-01-02 BEIJING 100-A ... male 10 Y low
3 1004 2013-01-05 SHENZHEN 110-C ... female 40 Y high
4 1005 2013-01-06 SHANGHAI 210-A ... male 40 N low
1 1002 2013-01-03 shanghai 100-B ... female 12 N low
5 1006 2013-01-07 BEIJING 130-F ... female 40 Y high
2 1003 2013-01-04 GUANGZHOU 110-A ... male 20 Y low
6 1007 NaT NaN NaN ... male 30 N low
7 108 NaT NaN NaN ... female 20 Y low
[8 rows x 10 columns]
===============================
对复合多个条件的数据进行分组标记
id date city category ... gender m-point pay sign
0 1001 2013-01-02 BEIJING 100-A ... male 10 Y NaN
1 1002 2013-01-03 shanghai 100-B ... female 12 N NaN
2 1003 2013-01-04 GUANGZHOU 110-A ... male 20 Y NaN
3 1004 2013-01-05 SHENZHEN 110-C ... female 40 Y NaN
4 1005 2013-01-06 SHANGHAI 210-A ... male 40 N NaN
5 1006 2013-01-07 BEIJING 130-F ... female 40 Y 1.0
[6 rows x 10 columns]
===============================
对category字段的值依次进行分列,并创建数据表,索引值为df_inner的索引列,列名称为category和size
split
category size
0 100 A
1 100 B
2 110 A
3 110 C
4 210 A
5 130 F
===============================
将完成分裂后的数据表和原df_inner数据表进行匹配
id date city category_x ... pay sign category_y size
0 1001 2013-01-02 BEIJING 100-A ... Y NaN 100 A
1 1002 2013-01-03 shanghai 100-B ... N NaN 100 B
2 1003 2013-01-04 GUANGZHOU 110-A ... Y NaN 110 A
3 1004 2013-01-05 SHENZHEN 110-C ... Y NaN 110 C
4 1005 2013-01-06 SHANGHAI 210-A ... N NaN 210 A
5 1006 2013-01-07 BEIJING 130-F ... Y 1.0 130 F
[6 rows x 12 columns]
===============================
********************************
四、数据提取
********************************
按索引提取单行的数值
id 1001
date 2013-01-02 00:00:00
city BEIJING
category_x 100-A
age 23
price 1200.9
gender male
m-point 10
pay Y
sign NaN
category_y 100
size A
Name: 0, dtype: object
===============================
按索引提取区域行数值
id date city category_x ... pay sign category_y size
0 1001 2013-01-02 BEIJING 100-A ... Y NaN 100 A
1 1002 2013-01-03 shanghai 100-B ... N NaN 100 B
[2 rows x 12 columns]
===============================
重设索引
===============================
设置日期为索引
===============================
提取4日之前的所有数据
id city category_x age ... pay sign category_y size
date ...
2013-01-02 1001 BEIJING 100-A 23 ... Y NaN 100 A
2013-01-03 1002 shanghai 100-B 44 ... N NaN 100 B
2013-01-04 1003 GUANGZHOU 110-A 54 ... Y NaN 110 A
[3 rows x 11 columns]
===============================
使用iloc按位置区域提取数据
id city
date
2013-01-02 1001 BEIJING
2013-01-03 1002 shanghai
===============================
适应iloc按位置单独提起数据
price gender
date
2013-01-02 1200.9 male
2013-01-04 2133.0 male
2013-01-07 4432.0 female
===============================
使用ix按索引标签和位置混合提取数据
id city category_x age
date
2013-01-02 1001 BEIJING 100-A 23
2013-01-03 1002 shanghai 100-B 44
===============================
判断city列的值是否为北京
date
2013-01-02 True
2013-01-03 False
2013-01-04 False
2013-01-05 False
2013-01-06 False
2013-01-07 True
Name: city, dtype: bool
===============================
判断city列里是否包含beijing和shanghai,然后将符合条件的数据提取出来
id city category_x age ... pay sign category_y size
date ...
2013-01-02 1001 BEIJING 100-A 23 ... Y NaN 100 A
2013-01-03 1002 shanghai 100-B 44 ... N NaN 100 B
2013-01-07 1006 BEIJING 130-F 45 ... Y 1.0 130 F
[3 rows x 11 columns]
===============================
提取前三个字符,并生成数据表
category
0 100
1 100
2 110
3 110
4 210
5 130
===============================
********************************
五、数据筛选
********************************
找出年龄大于25并且在beijing的记录
id city age category price
5 1006 BEIJING 45 130-F 4432.0
===============================
找出年龄大于35或者在beijing的记录
===============================
id city age category price
0 1001 BEIJING 23 100-A 1200.9
1 1002 shanghai 44 100-B NaN
2 1003 GUANGZHOU 54 110-A 2133.0
5 1006 BEIJING 45 130-F 4432.0
===============================
找出不在beijing的记录,对筛选后的数据按city列进行计数
===============================
id city age category price
1 1002 shanghai 44 100-B NaN
2 1003 GUANGZHOU 54 110-A 2133.0
3 1004 SHENZHEN 32 110-C 5433.2
4 1005 SHANGHAI 34 210-A NaN
总数:4
===============================
使用query函数进行筛选
===============================
id date city category age price
0 1001 2013-01-02 BEIJING 100-A 23 1200.9
1 1002 2013-01-03 shanghai 100-B 44 NaN
5 1006 2013-01-07 BEIJING 130-F 45 4432.0
===============================
对筛选后的结果按prince进行求和
===============================
5632.9
===============================
********************************
六、数据汇总
********************************
对所有的列进行计数汇总
===============================
id date category age price
city
BEIJING 2 2 2 2 2
GUANGZHOU 1 1 1 1 1
SHANGHAI 1 1 1 1 0
SHENZHEN 1 1 1 1 1
shanghai 1 1 1 1 0
===============================
按城市对id字段进行计数
===============================
city
BEIJING 2
GUANGZHOU 1
SHANGHAI 1
SHENZHEN 1
shanghai 1
Name: id, dtype: int64
===============================
对两个字段进行汇总计数
===============================
city age
BEIJING 23 1
45 1
GUANGZHOU 54 1
SHANGHAI 34 1
SHENZHEN 32 1
shanghai 44 1
Name: id, dtype: int64
===============================
对city字段进行汇总,并分别计算prince的合计和均值
===============================
len sum mean
city
BEIJING 2.0 5632.9 2816.45
GUANGZHOU 1.0 2133.0 2133.00
SHANGHAI 1.0 0.0 NaN
SHENZHEN 1.0 5433.2 5433.20
shanghai 1.0 0.0 NaN
===============================
********************************
七、数据统计
********************************
简单的数据采样
===============================
id city category_x age ... pay sign category_y size
date ...
2013-01-07 1006 BEIJING 130-F 45 ... Y 1.0 130 F
2013-01-02 1001 BEIJING 100-A 23 ... Y NaN 100 A
2013-01-04 1003 GUANGZHOU 110-A 54 ... Y NaN 110 A
[3 rows x 11 columns]
===============================
手动设置采样权重
===============================
id date city category age price
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0
5 1006 2013-01-07 BEIJING 130-F 45 4432.0
===============================
采样后不放回
===============================
id date city category age price
0 1001 2013-01-02 BEIJING 100-A 23 1200.9
4 1005 2013-01-06 SHANGHAI 210-A 34 NaN
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.2
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0
1 1002 2013-01-03 shanghai 100-B 44 NaN
5 1006 2013-01-07 BEIJING 130-F 45 4432.0
===============================
采样后放回
===============================
id date city category age price
1 1002 2013-01-03 shanghai 100-B 44 NaN
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0
3 1004 2013-01-05 SHENZHEN 110-C 32 5433.2
2 1003 2013-01-04 GUANGZHOU 110-A 54 2133.0
4 1005 2013-01-06 SHANGHAI 210-A 34 NaN
0 1001 2013-01-02 BEIJING 100-A 23 1200.9
===============================
数据表描述性统计
===============================
count mean std min 25% 50% 75% max
id 6.0 1003.50 1.87 1001.0 1002.25 1003.5 1004.75 1006.0
age 6.0 38.67 11.09 23.0 32.50 39.0 44.75 54.0
price 4.0 3299.78 1966.39 1200.9 1899.98 3282.5 4682.30 5433.2
===============================
计算列的标准差
===============================
1966.39058917
===============================
计算两个字段间的协方差
===============================
nan
===============================
数据表中所有字段间的协方差
===============================
id age price
id 3.500000 5.800000 3.242617e+03
age 5.800000 123.066667 2.646583e+03
price 3242.616667 2646.583333 3.866692e+06
===============================
两个字段的相关性分析
===============================
nan
===============================
数据表的相关性分析
===============================
id age price
id 1.000000 0.279463 0.792163
age 0.279463 1.000000 0.098074
price 0.792163 0.098074 1.000000
===============================
********************************
八、数据输出
********************************
数据导出:写入Excel
写入成功
===============================
数据导出:写入到CSV
写入成功
===============================