import numpy as np
import pandas as pd
# Report suspicious rows in miss.txt (GB2312-encoded CSV) and export how often
# each (source, result) pair occurs.
df = pd.read_csv('miss.txt', encoding='gb2312')

# Rows whose 'result' text contains "null".  na=False makes missing values
# count as "no match"; without it the mask would contain NaN and could not be
# used to index the frame.
bs = df['result'].str.contains(r'null', na=False)
print('record with null')
print(df[bs])

# Rows that are exact repeats of an earlier row.
bs = df.duplicated()
print('duplicated ')
print(df[bs])

# Group by the "source" and "result" columns and get the size of each group.
grp = df.groupby(['source', 'result']).size()

# Sort ascending.  grp holds only the group sizes, so this orders by size.
# sort_values() returns a new Series (the old in-place Series.sort() is not
# available in current pandas), hence the reassignment.
grp = grp.sort_values()

# Save to CSV.  header=False keeps the file header-less regardless of the
# pandas version's default for Series.to_csv.
grp.to_csv('duplicated.txt', encoding='gb2312', header=False)
# ===================================================================
import os,sys,pdb,pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Compare the columns '1'..'6' of dx.txt between high-scoring ("good") and
# low-scoring ("bad") rows, using per-column histograms and one scatter plot.

GOOD_MIN_SCORE = 850  # rows with score strictly above this are "good"
BAD_MAX_SCORE = 700   # rows with score strictly below this are "bad"

# Debug switches (the original used `if 0:`); set to True to enable output.
PRINT_GROUP_STATS = False
PRINT_SCORE_SUMMARY = False

# The file has no header row, so column names are supplied here.
df = pd.read_csv(
    'dx.txt',
    header=None,
    names=['score', '1', '2', '3', '4', '5', '6', 'nan'],
)
print(df.head())
print(df.describe())

# The two groups do not depend on the column being plotted, so select them
# once instead of rebuilding the masks on every loop iteration.
good = df[df['score'] > GOOD_MIN_SCORE]
bad = df[df['score'] < BAD_MAX_SCORE]

# Shared histogram bin edges: 0.1 to 0.2 in steps of 0.001.
bins = np.arange(0.1, 0.2, 0.001)

# 6x2 grid: one row per column, "good" on the left and "bad" on the right.
plt.figure()
for k in range(1, 7):
    col = str(k)
    if PRINT_GROUP_STATS:
        print('good ================')
        print(good[col].describe())
        print('bad ================')
        print(bad[col].describe())
    plt.subplot(6, 2, (k - 1) * 2 + 1)
    plt.hist(good[col].values, bins)
    plt.subplot(6, 2, (k - 1) * 2 + 2)
    plt.hist(bad[col].values, bins)
    # The title is applied to the current (right-hand, "bad") subplot.
    plt.title(col)
plt.show()

# Scatter of column '4' against column '6': bad rows as red circles,
# good rows as blue crosses.
plt.figure()
plt.plot(bad['4'].values, bad['6'].values, 'or')
plt.plot(good['4'].values, good['6'].values, 'xb')
plt.show()

if PRINT_SCORE_SUMMARY:
    # Thresholds are kept literal here so they match the printed labels
    # (note the upper cut is 900 in this summary, not GOOD_MIN_SCORE).
    idx = df['score'] > 900
    print('>900')
    print([df[idx].mean(), df[idx].std()])
    idx = df['score'] < 700
    print('<700')
    print([df[idx].mean(), df[idx].std()])