import pandas
# Record-filtering examples.
# NOTE: the bare selection expressions below are not stored anywhere; their
# results are only displayed when run in an interactive session or notebook.

# Load the raw file; every record sits in one column named 'id|comments|title'.
ba = pandas.read_csv('D://Python projects//reference data//4.8//data.csv')

# Split each record on '|' into at most three parts. `n` and `expand` are
# passed by keyword because recent pandas releases reject them positionally,
# and n=2 keeps any extra '|' inside the title instead of producing a fourth
# column that would break the rename below.
bc = ba['id|comments|title'].str.split('|', n=2, expand=True)
bc.columns = ['id', 'comments', 'title']

# Convert the numeric fields from strings to integers.
bc['id'] = bc['id'].astype(int)
bc['comments'] = bc['comments'].astype(int)
# 'title' is deliberately not cast with astype(str): the split already yields
# strings, and the cast would turn missing titles into the literal text
# 'None'/'nan', hiding them from the null check below.

# Single-condition selection: records with more than 10000 comments.
bc[bc.comments > 10000]

# Range selection; between() includes both end points.
bc[bc.comments.between(5000, 6000)]

# Select the records whose title is missing.
bc[pandas.isnull(bc.title)]

# Substring match; na=False treats a missing title as "no match".
bc[bc.title.str.contains('小米', na=False)]

# Negation: records whose title does not contain the substring.
bc[~bc.title.str.contains('小米', na=False)]

# Combined conditions (same rows as the between() selection above).
bc[(bc.comments >= 5000) & (bc.comments <= 6000)]
#随机抽样
import numpy
import pandas
# Load the data set used by the sampling examples.
ca = pandas.read_csv('D://Python projects//reference data//4.9//data.csv')

# Seed NumPy's global random state so the draws below are repeatable.
numpy.random.seed(2)

# Draw a fixed number of rows.
ca.sample(n=10)

# Draw a fraction (5%) of the rows.
ca.sample(frac=0.05)

# Draw with replacement, so the same row may appear more than once.
ca.sample(replace=True, n=10)

# Stratified sampling: group the rows by the 'class' column and inspect
# which row labels fall into each group.
cla = ca.groupby(by='class')
cla.groups
# Stratified sampling driven by a dictionary.
# Fixed-count variant: maps each 'class' value to the number of rows to draw.
typical = {1: 2, 2: 3, 3: 4}
def typicaltemping(group: pandas.DataFrame, typical: dict) -> pandas.DataFrame:
    """Draw a fixed number of rows from one stratum.

    Meant to be passed to ``GroupBy.apply``, which attaches the group key to
    each sub-frame as ``group.name``.

    Args:
        group: Rows of a single stratum; ``group.name`` must hold its key.
        typical: Maps each stratum key to the number of rows to draw.

    Returns:
        The sampled rows, drawn without replacement. If the stratum holds
        fewer rows than its quota, all of its rows are returned.

    Raises:
        KeyError: If the stratum key has no entry in ``typical``.
    """
    name = group.name
    # A quota larger than the stratum would make ``sample`` raise ValueError
    # (no replacement), so cap it at the number of rows available.
    n = min(typical[name], len(group))
    return group.sample(n=n)
# Apply the fixed-count sampler to every 'class' group; `typical` is forwarded
# to the sampler as its second argument. group_keys=False asks pandas not to
# add the group key to the index of the combined result.
result = (
    ca.groupby(by='class', group_keys=False)
    .apply(typicaltemping, typical)
)
# Fraction-based variant: maps each 'class' value to the share of its rows
# to draw.
typicab = {1: 0.1, 2: 0.2, 3: 0.3}
def typicalpercent(groupper: pandas.DataFrame, typicab: dict) -> pandas.DataFrame:
    """Draw a fixed fraction of the rows of one stratum.

    Meant to be passed to ``GroupBy.apply``, which attaches the group key to
    each sub-frame as ``groupper.name``.

    Args:
        groupper: Rows of a single stratum; ``groupper.name`` must hold its
            key.
        typicab: Maps each stratum key to the fraction of rows to draw.

    Returns:
        The sampled rows, drawn without replacement.

    Raises:
        KeyError: If the stratum key has no entry in ``typicab``.
    """
    name = groupper.name
    frac = typicab[name]
    return groupper.sample(frac=frac)
# Apply the fraction-based sampler to every 'class' group; `typicab` is
# forwarded to the sampler as its second argument. group_keys=False asks
# pandas not to add the group key to the index of the combined result.
result = (
    ca.groupby(by='class', group_keys=False)
    .apply(typicalpercent, typicab)
)