python使用pandas抽样训练数据中某个类别

# -*- coding: utf-8 -*-  
  
import numpy  
from sklearn import metrics  
from sklearn.svm import LinearSVC  
from sklearn.naive_bayes import MultinomialNB  
from sklearn import linear_model  
from sklearn.datasets import load_iris  
from sklearn.cross_validation import train_test_split  
from sklearn.preprocessing import OneHotEncoder, StandardScaler  
from sklearn import cross_validation  
from sklearn import preprocessing  
import scipy as sp
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest ,chi2
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
#import iris_data  

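'''
Column names of the data set, in order (presumably the header row of the
CSV file loaded in test() -- confirm against the actual file):

creativeID,userID,positionID,clickTime,conversionTime,connectionType,
telecomsOperator,appPlatform,sitesetID,positionType,age,gender,
education,marriageStatus,haveBaby,hometown,residence,appID,appCategory,label
'''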



def _balance_by_label(frame, random_state=2):
    """Split ``frame`` by its ``label`` column and equalise the class sizes.

    Rows with ``label == 0`` and rows with ``label == 1`` are separated, and
    any class larger than the smaller one is down-sampled without
    replacement.

    Args:
        frame: DataFrame that contains a ``label`` column.
        random_state: Seed forwarded to ``DataFrame.sample``.

    Returns:
        A ``(negatives, positives)`` tuple of DataFrames.
    """
    negatives = frame[frame["label"] == 0]
    positives = frame[frame["label"] == 1]

    # Sampling more rows than exist with replace=False raises ValueError, so
    # cap the sample size at the smaller of the two classes.
    target = min(len(negatives), len(positives))

    negatives = negatives.sample(n=target, replace=False,
                                 random_state=random_state)
    if len(positives) > target:
        positives = positives.sample(n=target, replace=False,
                                     random_state=random_state)
    return negatives, positives


def test(path="/var/lib/mysql-files/data1.csv", random_state=2):
    """Load the CSV, balance the two ``label`` classes, and print a summary.

    Prints, in order: the label counts of the selected columns, the shape of
    the label-1 rows, the shape of the (down-sampled) label-0 rows, the shape
    of the combined frame, and the shuffled ``label`` column.

    Args:
        path: Location of the comma-separated input file.
        random_state: Seed used when down-sampling a class.

    Returns:
        None.
    """
    df = pd.read_csv(path)
    feature_and_label_columns = [
        "connectionType", "telecomsOperator", "appPlatform", "sitesetID",
        "positionType", "age", "gender", "education", "marriageStatus",
        "haveBaby", "hometown", "residence", "appCategory", "label",
    ]
    df1 = df[feature_and_label_columns]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(df1["label"].value_counts())

    N_data, P_data = _balance_by_label(df1, random_state=random_state)
    print(P_data.shape)
    print(N_data.shape)

    data = pd.concat([N_data, P_data])
    print(data.shape)
    # Shuffle the rows so the two classes are interleaved.
    # NOTE: this shuffle is unseeded, so the row order differs between runs.
    data = data.sample(frac=1).reset_index(drop=True)
    print(data[["label"]])
    return

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值