python pandas

import os,sys,pdb,pickle
import numpy as np
import pandas as pd

# Load the records to inspect; the file is read as GB2312-encoded text.
df = pd.read_csv('miss.txt', encoding='gb2312')

# Rows whose 'result' text contains "null".  na=False makes missing values
# count as "no match", so the mask stays purely boolean and is safe to use
# for row selection.
bs = df['result'].str.contains(r'null', na=False)
print('record with null')
print(df[bs])


# Rows that are exact repeats of an earlier row (the first occurrence is
# not flagged by duplicated()).
bs = df.duplicated()
print('duplicated ')
print(df[bs])

# Group by the two columns "source" and "result" and take the size of
# every group.
grp = df.groupby(['source', 'result']).size()
# grp only holds the group sizes, so this orders the groups by size
# (ascending).  Series.sort() was removed from pandas; sort_values()
# returns the sorted Series instead of sorting in place.
grp = grp.sort_values()
# Save to CSV.  header=False keeps the header-less layout regardless of the
# pandas version's default for Series.to_csv.
grp.to_csv('duplicated.txt', encoding='gb2312', header=False)



===================================================================

import os,sys,pdb,pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the header-less table: a 'score' column followed by columns '1'..'6'
# and one extra column that the script names 'nan'
# (NOTE(review): presumably an empty trailing field -- confirm against dx.txt).
df = pd.read_csv('dx.txt', header=None, names=[ 'score', '1', '2', '3', '4', '5','6','nan'])
print(df.head())
print(df.describe())


# The two score groups and the histogram bin edges do not depend on the
# column being plotted, so compute them once instead of on every iteration.
good_rows = df[df['score'] > 850]
bad_rows = df[df['score'] < 700]
bins = np.arange(0.1,0.2,0.001)

# One subplot row per column: high-score rows on the left, low-score rows on
# the right.
plt.figure()
for k in range(1,7,1):
    l = str(k)
    good = good_rows[l]
    bad = bad_rows[l]

    # Debug switch: flip to 1 to print summary statistics per column.
    if 0:
        print('good ================')
        print(good.describe())
        print('bad ================')
        print(bad.describe())

    plt.subplot(6,2,(k-1)*2+1)
    plt.hist(good.values,bins)
    plt.subplot(6,2,(k-1)*2+2)
    plt.hist(bad.values,bins)
    # title() applies to the current axes, i.e. the right-hand subplot.
    plt.title(l)
plt.show()


# Scatter column '4' against column '6' for the two score groups:
# rows with score < 700 as red circles, rows with score > 850 as blue crosses.
plt.figure()
good = df[df['score'] > 850]
bad = df[df['score'] < 700]
# Low-score rows are drawn first, high-score rows on top of them.
for rows, marker in ((bad, 'or'), (good, 'xb')):
    plt.plot(rows['4'].values, rows['6'].values, marker)

plt.show()


# Debug switch: flip to 1 to print per-column mean and standard deviation
# for the rows with score > 900 and for the rows with score < 700.
if 0:
    idx = df['score'] > 900
    subset = df[idx]
    print('>900')
    print([subset.mean(), subset.std()])


    idx = df['score'] < 700
    subset = df[idx]
    print('<700')
    print([subset.mean(), subset.std()])



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值