Dataquest学习总结[10]

最新推荐文章于 2020-06-24 18:50:29 发布

sodleave

最新推荐文章于 2020-06-24 18:50:29 发布

阅读量685

点赞数

分类专栏： python数据分析

本文链接：https://blog.csdn.net/sodleave/article/details/72900133

版权

python数据分析专栏收录该内容

12 篇文章 0 订阅

订阅专栏

Step 6: Machine Learning

Decision Trees

>>Introduction to Decision Trees

构建决策树时将类别型特征转换为数值型数据：

用到pandas的Categorical：原文使用Categorical.from_array方法（该方法在新版pandas中已被移除，可直接用pandas.Categorical(...)构造，得到的codes相同）

numpy.bincount 计算array中各值出现的频次，类似于pandas的value_counts()

# Convert text-category columns to integer codes.
# pandas.Categorical(...) is used instead of Categorical.from_array, which
# has been removed from pandas; the resulting codes are the same.
cats=['education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country','high_income']
# "workclass" is encoded first, then every column listed in `cats`.
for cat in ["workclass"] + cats:
    col = pandas.Categorical(income[cat])
    income[cat] = col.codes

# ID3
# Dictionary that holds the tree; id3() fills it in place.
tree = {}
# Node numbers handed out so far; its length acts as a running counter.
nodes = []


def id3(data, target, columns, tree):
    """Recursively build a binary decision tree into the dict `tree`.

    Every node gets a "number" (pre-order, taken from the module-level
    `nodes` list). A leaf additionally gets "label" (1 or 0). An internal
    node gets "column", "median", "left" and "right", where "left" holds the
    rows with value <= median and "right" the rows with value > median.

    Args:
        data: DataFrame containing `target` and the candidate `columns`.
        target: name of the label column.
        columns: candidate column names passed on to find_best_column.
        tree: dict to populate; mutated in place.

    Returns:
        None. The result is written into `tree`.
    """
    unique_targets = pandas.unique(data[target])
    nodes.append(len(nodes) + 1)
    tree["number"] = nodes[-1]

    if len(unique_targets) == 1:
        # Pure node: label is 1 when the only target value is 1, else 0.
        tree["label"] = 1 if unique_targets[0] == 1 else 0
        return

    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()
    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]

    if left_split.empty or right_split.empty:
        # The median does not separate the rows, so recursing would hand the
        # same data to a child forever. Stop here and label the node with
        # the majority target value (ties resolve to the smallest value).
        majority = data[target].mode().iloc[0]
        tree["label"] = 1 if majority == 1 else 0
        return

    tree["column"] = best_column
    tree["median"] = column_median
    for name, split in (("left", left_split), ("right", right_split)):
        tree[name] = {}
        id3(split, target, columns, tree[name])
# Build the tree for "high_income" from the "age" and "marital_status"
# columns; the node counters are set as a side effect of the call.
id3(
    data=data,
    target="high_income",
    columns=["age", "marital_status"],
    tree=tree,
)

# Pretty-print the ID3 tree
def print_with_depth(string, depth):
    """Print `string` indented by four spaces per level of `depth`."""
    print("    " * depth, string, sep="")


def print_node(tree, depth):
    """Print the node `tree` at indentation `depth`, then its subtrees.

    A leaf (a dict with a "label" key) prints "Leaf: Label <label>". Any
    other node prints "<column> > <median>" followed by its "left" and then
    its "right" subtree, each one level deeper.
    """
    if "label" not in tree:
        header = "{0} > {1}".format(tree["column"], tree["median"])
        print_with_depth(header, depth)
        children = [tree[side] for side in ("left", "right")]
        for child in children:
            print_node(child, depth + 1)
    else:
        print_with_depth("Leaf: Label {0}".format(tree["label"]), depth)
# Print the whole tree, starting at the root with no indentation.
print_node(tree=tree, depth=0)

# Predict with the decision tree
def predict(tree, row):
    """Return the label that the decision tree `tree` assigns to `row`.

    Walks down from the root: at each internal node the row goes "left"
    when row[column] <= median and "right" otherwise, until a node with a
    "label" key is reached.
    """
    node = tree
    while "label" not in node:
        goes_left = row[node["column"]] <= node["median"]
        node = node["left"] if goes_left else node["right"]
    return node["label"]
# Show the prediction for the first row of the data.
print(predict(tree=tree, row=data.iloc[0]))

# Predict for an entire DataFrame, one row at a time
def batch_predict(tree, df):
    """Apply predict(tree, row) to every row of `df` and return the result."""
    def predict_row(row):
        return predict(tree, row)

    return df.apply(predict_row, axis=1)
# Row-wise predictions for new_data.
predictions = batch_predict(tree=tree, df=new_data)

>> Applying Decision Trees

决策树中一些控制过拟合的参数：

决策树的优缺点：

advantages:
Easy to interpret
Relatively fast to fit and make predictions
Able to handle multiple types of data
Able to pick up nonlinearities in data, and usually fairly accurate
disadvantage:
their tendency to overfit.

# Train a decision tree with the sklearn library
from sklearn.tree import DecisionTreeClassifier
columns = [
    "age", "workclass", "education_num", "marital_status", "occupation",
    "relationship", "race", "sex", "hours_per_week", "native_country",
]
# random_state is fixed at 1 so that results are consistent between runs;
# fit() returns the fitted estimator itself.
clf = DecisionTreeClassifier(random_state=1).fit(income[columns], income["high_income"])

# Split the data: 80% training set, 20% test set
import numpy
import math
# Seed the generator so the shuffle is reproducible.
numpy.random.seed(1)
# Shuffle the rows by reindexing with a random permutation of the index.
income = income.reindex(numpy.random.permutation(income.index))
# Index of the first test row.
train_max_row = math.floor(len(income) * .8)
train, test = income.iloc[:train_max_row], income.iloc[train_max_row:]

# Compute the AUC on the test set
from sklearn.metrics import roc_auc_score
clf = DecisionTreeClassifier(random_state=1).fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
# NOTE: despite its name, `error` holds the AUC score, and it is computed
# from the output of predict(), not from predict_proba().
error = roc_auc_score(y_true=test["high_income"], y_score=predictions)

# Adjust the model parameters to guard against overfitting
clf = DecisionTreeClassifier(
    random_state=1,
    min_samples_split=13,
    max_depth=7,
).fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(y_true=test["high_income"], y_score=predictions)
# Compare the training-set AUC with the test-set AUC to judge how well the
# model fits, and tune the parameters accordingly.

>>Random Forest

对本身差别比较大的模型进行融合往往效果更明显，比如融合决策树和逻辑斯蒂回归

如果两个模型的预测能力auc差的比较大，融合后可能不会有太明显提升，需要对二者之间赋权值

随机森林产生：为了让融合的各个子树具有差异性，引入Variation With Bagging

numpy.random.choice()从一个list中随机选出若干个元素，例如numpy.random.choice(columns, 2)即从columns中随机选出两个

随机森林的说明：documentation

随机森林参数：

random forest相对于单棵决策树能更好地减少过拟合

随机森林的优缺点：

# Build a random forest by hand
tree_count = 10
# Fraction of the training rows drawn (with replacement) for each tree.
bag_proportion = .6
predictions = []
for i in range(tree_count):
    # A different random_state per tree gives every tree its own bag.
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)
    # max_features='sqrt' is what 'auto' meant for classifiers; current
    # scikit-learn releases no longer accept 'auto'.
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2, splitter='random', max_features='sqrt')
    clf.fit(bag[columns], bag["high_income"])
    # Keep the second probability column (the one indexed by [:, 1]).
    predictions.append(clf.predict_proba(test[columns])[:,1])
# Average over the trees; divide by tree_count rather than a literal 10 so
# the mean stays correct when tree_count changes.
combined = numpy.sum(predictions, axis=0) / tree_count
rounded = numpy.round(combined)
print(roc_auc_score(test["high_income"], rounded))

# Build a random forest with the sklearn library
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(
    n_estimators=5,
    random_state=1,
    min_samples_leaf=2,
).fit(train[columns], train["high_income"])
pre = clf.predict(test[columns])
auc = roc_auc_score(y_true=test["high_income"], y_score=pre)

注意：决策树/随机森林既可以做分类又可以做回归（RandomForestClassifier/RandomForestRegressor）

Machine Learning Project

数据集： Lending Club

>>Data Cleaning

将数据集中某些值替换为其他值：replace ，对DataFrame操作一般传入的是嵌套的字典

numpy.nan即空值，在对Series进行unique操作时，nan也会算进来，例如 pd.Series([1, 2, numpy.nan]).unique() 会返回三个值，不会把NaN排除

# Data preprocessing
import pandas as pd
# skiprows=1 skips the first line of the file.
loans_2007 = pd.read_csv('LoanStats3a.csv', skiprows=1)
half_count = len(loans_2007) / 2
# Keep only the columns with at least half_count non-null values, then
# remove the 'desc' and 'url' columns.
loans_2007 = (
    loans_2007
    .dropna(thresh=half_count, axis=1)
    .drop(['desc', 'url'], axis=1)
)
loans_2007.to_csv('loans_2007.csv', index=False)

loans_2007 = pd.read_csv("loans_2007.csv")
# drop_duplicates() returns a new DataFrame and leaves the original
# untouched, so the result has to be assigned back to take effect.
loans_2007 = loans_2007.drop_duplicates()
# Inspect the first row and the number of columns.
print(loans_2007.iloc[0])
print(loans_2007.shape[1])

# Remove columns that leak information about the outcome, are redundant,
# or would need additional data before they become useful features.
drop_cols = [
    "id", "member_id", "funded_amnt", "funded_amnt_inv",
    "grade", "sub_grade", "emp_title", "issue_d",
]
loans_2007 = loans_2007.drop(drop_cols, axis="columns")
loans_2007 = loans_2007.drop(
    [
        "zip_code", "out_prncp", "out_prncp_inv", "total_pymnt",
        "total_pymnt_inv", "total_rec_prncp",
        "total_rec_int", "total_rec_late_fee", "recoveries",
        "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt",
    ],
    axis="columns",
)

# Build the label: keep only rows whose outcome is unambiguous, then map
# the label to a positive class (1) and a negative class (0).
# NOTE: the name `re` would shadow the standard-library `re` module if that
# module were imported in the same script.
is_final_status = loans_2007['loan_status'].isin(['Fully Paid', 'Charged Off'])
loans_2007 = loans_2007[is_final_status]
re = {'loan_status': {'Fully Paid': 1, 'Charged Off': 0}}
loans_2007 = loans_2007.replace(re)

# Remove single-value columns: a column is dropped when, ignoring nulls,
# it contains exactly one distinct value.
drop_columns = [
    col
    for col in loans_2007.columns.tolist()
    if len(loans_2007[col].dropna().unique()) == 1
]
loans_2007 = loans_2007.drop(drop_columns, axis=1)

>>Preparing the Feature

选择DataFrame中特定元素类型的列构建新的DataFrame

get_dummies 将类型变量转为数值变量，类似于OneHotEncoder

# Load the data and count the null (NaN) values in every column
import pandas as pd
loans = pd.read_csv('filtered_loans_2007.csv')
null_counts = loans.isnull().sum(axis=0)
print(null_counts)
# Output:
#title                    10
#revol_util               50
#last_credit_pull_d        2
#pub_rec_bankruptcies    697

# Drop the column with many nulls first, then every row that still
# contains a null.
loans = loans.drop('pub_rec_bankruptcies', axis=1).dropna(axis=0)
# Count how many columns there are of each dtype.
dtype_counts = loans.dtypes.value_counts()
print(dtype_counts)
# Output:
#object     11
#float64    10
#int64       1

# Select the object-dtype columns and look at their first row
object_columns_df = loans.select_dtypes(include='object')
first_object_row = object_columns_df.iloc[0]
print(first_object_row)

# Explore the value frequencies of the categorical columns
cols = [
    'home_ownership',
    'verification_status',
    'emp_length',
    'term',
    'addr_state',
]
for c in cols:
    counts = loans[c].value_counts()
    print(counts)

# Numeric code for each emp_length text value.
mapping_dict = {
    "emp_length": {
        "n/a": 0,
        "< 1 year": 0,
        "1 year": 1,
        "2 years": 2,
        "3 years": 3,
        "4 years": 4,
        "5 years": 5,
        "6 years": 6,
        "7 years": 7,
        "8 years": 8,
        "9 years": 9,
        "10+ years": 10,
    }
}
loans = loans.drop(
    ['last_credit_pull_d', 'addr_state', 'title', 'earliest_cr_line'],
    axis=1,
)
# Percentage strings: strip the trailing '%' and convert to float64.
for pct_col in ('int_rate', 'revol_util'):
    loans[pct_col] = loans[pct_col].str.rstrip('%').astype('float64')
loans = loans.replace(mapping_dict)

cat_columns = [
    "home_ownership",
    "verification_status",
    "emp_length",
    "purpose",
    "term",
]
dummy_df = pd.get_dummies(loans[cat_columns])
# Append the dummy columns first and only then drop the originals. The
# order matters: drop() removes every column carrying one of these names.
# NOTE(review): if "emp_length" is numeric at this point, get_dummies may
# pass it through under the same name and it would then be dropped too;
# confirm that this is intended.
loans = pd.concat([loans, dummy_df], axis=1).drop(cat_columns, axis=1)

>>Making predictions

针对正负样本极不均匀的问题（正样本是负样本6倍）：

1.可以对正样本进行采样，或者构造一些负样本

Use oversampling and undersampling to ensure that the classifier gets input that has a balanced number of each class.

2.还有一种方式是在模型选择时，对不同的label赋予不同的权值：setting the class_weight parameter to balanced

Tell the classifier to penalize misclassifications of the less prevalent class more than the other class

import pandas as pd
loans=pd.read_csv('cleaned_loans_2007.csv')
# DataFrame.info() prints its report itself and returns None, so wrapping
# it in print() only added a stray "None" line to the output.
loans.info()

# Compute TP, TN, FP, FN
import pandas as pd
actual = loans['loan_status']
# Each count is the number of rows where both conditions hold.
tn = int(((predictions == 0) & (actual == 0)).sum())
tp = int(((predictions == 1) & (actual == 1)).sum())
fn = int(((predictions == 0) & (actual == 1)).sum())
fp = int(((predictions == 1) & (actual == 0)).sum())

# Choose the evaluation metric according to the class distribution and the
# actual use case.
# Baseline: predict that all loans will be paid off on time.
predictions = pd.Series(numpy.ones(len(loans)))
predicted_paid = predictions == 1
predicted_default = predictions == 0
actually_paid = loans['loan_status'] == 1
actually_default = loans['loan_status'] == 0
fp = int((predicted_paid & actually_default).sum())
tp = int((predicted_paid & actually_paid).sum())
fn = int((predicted_default & actually_paid).sum())
tn = int((predicted_default & actually_default).sum())
# False positive rate and true positive rate.
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)
print(fpr, tpr)

from sklearn.linear_model import LogisticRegression
# Every column except the label is a feature.
features = loans.drop('loan_status', axis="columns")
target = loans['loan_status']
# fit() returns the fitted estimator itself.
lr = LogisticRegression().fit(features, target)
# Predictions on the same data the model was fitted on.
predictions = lr.predict(features)

# Cross-validation
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation has been removed from scikit-learn; the same
# tools now live in sklearn.model_selection.
from sklearn.model_selection import cross_val_predict, KFold
lr = LogisticRegression()
# The legacy KFold(n, random_state=1) produced 3 unshuffled folds (the seed
# had no effect without shuffling); KFold(n_splits=3) is the equivalent in
# the current API.
kf = KFold(n_splits=3)
# Out-of-fold prediction for every row.
predictions=cross_val_predict(lr,features,target,cv=kf)
tp=sum((predictions==1)&(target==1))
tn=sum((predictions==0)&(target==0))
fp=sum((predictions==1)&(target==0))
fn=sum((predictions==0)&(target==1))
# True positive rate and false positive rate.
tpr=tp/(tp+fn)
fpr=fp/(fp+tn)
print(fpr,tpr)

# Adjust the parameters to counter the class imbalance by penalising
# mistakes on the negative class more heavily
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation has been removed from scikit-learn; the same
# tools now live in sklearn.model_selection.
from sklearn.model_selection import cross_val_predict, KFold
lr=LogisticRegression(class_weight='balanced')
# The legacy KFold(n, random_state=1) produced 3 unshuffled folds (the seed
# had no effect without shuffling); KFold(n_splits=3) is the equivalent in
# the current API.
kf=KFold(n_splits=3)
# Out-of-fold prediction for every row.
predictions=cross_val_predict(lr,features,target,cv=kf)
tp=sum((predictions==1)&(target==1))
tn=sum((predictions==0)&(target==0))
fp=sum((predictions==1)&(target==0))
fn=sum((predictions==0)&(target==1))
# True positive rate and false positive rate.
tpr=tp/(tp+fn)
fpr=fp/(fp+tn)
print(fpr,tpr)

sodleave

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Dataquest学习总结[10]

Step 6: Machine Learning Decision Trees>>Introduction to Decision Trees构建决策树时将类别型特征转换为数值型数据：用到pandas的categorical，使用Categorical.from_array方法numpy.bincount 计算array中各值出现的频次，类似于pandas的value_c
复制链接

扫一扫