首先导入相关包
import random
import string

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
#数据分箱技术Binning
数据分箱技术
创建一个数列,长度为20,数值为25到99之间的随机整数(randint不包含上限100)
# Draw 20 random integer scores from the half-open range [25, 100).
score_list = np.random.randint(low=25, high=100, size=20)
print(score_list)
设置区间bins,并统计落入各区间的个数
# Bin edges for the scores: pd.cut turns them into the intervals (0, 59],
# (59, 70], (70, 80] and (80, 100]. A histogram would yield the same
# per-interval counts.
bins = [0, 59, 70, 80, 100]
score_cut = pd.cut(score_list, bins=bins)
打印一下score_cut
# Show the result of pd.cut: the interval each score was assigned to.
print(score_cut)
把每个分箱的个数统计出来~
# Count how many scores landed in each bin. Series.value_counts replaces the
# deprecated top-level pd.value_counts and sorts by count in the same way.
print(pd.Series(score_cut).value_counts())
创建一个空的DataFrame
# Start from an empty frame; its columns are attached one at a time.
df = pd.DataFrame()
给加一个score列~
# Store the random scores as the 'score' column.
df['score'] = score_list
为每个学生随机生成一个长度为3的字符串作为名字
# pd.util.testing.rands was removed in pandas 2.0, so build the random
# 3-character student names from the standard library instead.
name_chars = string.ascii_letters + string.digits
# One name per existing row, so the column always matches the score column.
df['student'] = [''.join(random.choices(name_chars, k=3)) for _ in range(len(df))]
# Assign each score to its interval from `bins`.
df['Categories'] = pd.cut(df['score'], bins)
print(df)
打印结果
给每个箱子一个标签,low,ok,good,great
# Bin the scores again, this time naming the four intervals instead of
# showing them as (left, right] ranges.
df['Categories'] = pd.cut(
    df['score'],
    bins=bins,
    labels=['Low', 'Ok', 'Good', 'Great'],
)
print(df)
数据分组技术
# Load the weather data; the relative path is resolved against the current
# working directory. This rebinds `df`, replacing the binning example's frame.
df = pd.read_csv('city_weather.csv')
print(df)
看看数据
通过city分组
# Group the rows by the values of the 'city' column.
g = df.groupby(df['city'])
# Printing a GroupBy only shows the object's repr, not the grouped rows.
print(g)
打印结果
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000018242F94F70>
分组
# Show the mapping from each group key to the row labels in that group.
print(g.groups)
{'BJ': Int64Index([0, 1, 2, 3, 4, 5], dtype='int64'), 'GZ': Int64Index([14, 15, 16, 17], dtype='int64'), 'SH': Int64Index([6, 7, 8, 9, 10, 11, 12, 13], dtype='int64'), 'SZ': Int64Index([18, 19], dtype='int64')}
打印下
# Pull out the rows that belong to the 'BJ' group.
df_bj = g.get_group('BJ')
print(df_bj)
print('-'*100)
# numeric_only=True skips string columns such as 'city'; without it
# pandas >= 2.0 raises TypeError instead of silently dropping them.
print(df_bj.mean(numeric_only=True))
分别对gz sh …求平均值,最大值
# Per-city mean of the numeric columns. numeric_only=True keeps the pre-2.0
# behaviour; pandas >= 2.0 otherwise raises on any string column.
print(g.mean(numeric_only=True))
# Per-city maximum of every column.
print(g.max())
把g转换成list看一下
# Materialise the GroupBy as a list of (group key, sub-DataFrame) tuples.
print(list(g))
然后将它变成一个字典
# Turn the (group key, sub-DataFrame) tuples into a dict keyed by group key.
print(dict(list(g)))
看一下这个字典中key值为北京的value
# Look up the sub-DataFrame stored under the 'BJ' key.
print(dict(list(g))['BJ'])
…这样看看
# Iterating a GroupBy yields (group key, sub-DataFrame) tuples; the two loop
# variable names are arbitrary.
for name, group_df in g:
    print(name)
    print('-'*100)
    print(group_df)
数据聚合技术
# Reload the weather data and group it by city for the aggregation examples.
df = pd.read_csv('city_weather.csv')
g = df.groupby(by='city')
# Map each city to its sub-DataFrame.
print({city: frame for city, frame in g})
# Per-city minimum, first via the string alias and then via the method.
print(g.agg(func='min'))
print(g.min())
def foo(attr):
    """Return the spread of ``attr``: its maximum minus its minimum.

    ``attr`` may be any object that provides ``max()`` and ``min()`` and
    whose results support subtraction, for example a numeric Series.
    """
    return attr.max() - attr.min()
# foo subtracts values, so it only works on numeric columns; restrict the
# aggregation to them because subtracting strings raises TypeError.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
print(g[numeric_cols].agg(foo))  # agg returns a new DataFrame
print(df)
# Group on two columns at once; each group key is then a (city, wind) tuple.
group_keys = ['city', 'wind']
g_new = df.groupby(by=group_keys)
print(g_new.groups)
# Fetch a single group by passing its full key tuple.
print(g_new.get_group(('BJ', 3)))
# With two grouping columns, each iteration yields a two-part key tuple plus
# the sub-DataFrame; unpack the key directly in the loop header.
for (name, tem), group in g_new:
    print(name, tem)
    print(group)