3-2 特征二值化

特征二值化

请参考《数据准备和特征工程》中的相关章节,调试如下代码。


基础知识

import pandas as pd

# Load pm2.csv into a DataFrame and preview its first rows.
pm25 = pd.read_csv("/home/aistudio/data/data20505/pm2.csv")
pm25.head()
RANKCITY_IDCITY_NAMEExposed days
01594拉萨2
12579玉溪7
23263厦门8
34267泉州9
45271漳州10
import numpy as np

# Binarize the "Exposed days" feature using its mean as the threshold:
# values strictly greater than the mean become 1, all others become 0.
pm25['bdays'] = np.where(pm25["Exposed days"] > pm25["Exposed days"].mean(), 1, 0)
# Show 10 randomly sampled rows to inspect the new column.
pm25.sample(10)
RANKCITY_IDCITY_NAMEExposed daysbdays
174197598宝鸡1281
260283364安阳2461
137152183南通1030
249272358开封2101
546244朔州580
217240315东营1701
869686抚顺780
7786303上饶710
190213608商洛1381
180203438岳阳1321
from sklearn.preprocessing import Binarizer

# Create a Binarizer whose threshold is the mean of "Exposed days".
bn = Binarizer(threshold=pm25["Exposed days"].mean())   

# Single brackets, pm25["Exposed days"], select a 1-D column; print its type
# for comparison with the double-bracket selection used below.
print(type(pm25["Exposed days"]))
# Fit and transform in one call. Double brackets, pm25[["Exposed days"]],
# select a one-column, 2-D DataFrame instead of a 1-D Series.
result = bn.fit_transform(pm25[["Exposed days"]])   

# Store the binarized values as a new column and inspect 10 random rows.
pm25['sk-bdays'] = result
pm25.sample(10)
RANKCITY_IDCITY_NAMEExposed daysbdayssk-bdays
92103147双鸭山8200
1213273南平1800
175198545遂宁12911
157179449怀化11911
6775507钦州6500
404588丹东4900
4146462珠海4900
121133230衢州9400
7078510玉林6700
176199199泰州13011
# Double brackets select a DataFrame, so the shape is 2-D.
pm25[["Exposed days"]].shape
(264, 1)
# Single brackets select a single column, so the shape is 1-D.
pm25["Exposed days"].shape
(264,)
# Reshaping the underlying array to (-1, 1) gives a single 2-D column;
# -1 lets NumPy infer the number of rows.
pm25["Exposed days"].values.reshape((-1, 1)).shape
(264, 1)
from sklearn.preprocessing import binarize

# binarize() is a plain function, not a fitted model: it returns the
# binarized array directly, using the column mean as the threshold.
fbin = binarize(pm25[['Exposed days']], threshold=pm25['Exposed days'].mean())

# Inspect the binarized values at a few row positions.
fbin[[1, 50, 100, 150, 200]]
array([[0],
       [0],
       [0],
       [1],
       [1]])
# Draw 100 samples from a normal distribution with mean 0 (loc) and standard
# deviation 1.0 (scale); a larger scale gives a flatter, wider curve.
gau = np.random.normal(loc=0, scale=1.0, size=100)
gau
array([-0.16138569, -1.12381876,  1.08345071, -0.3374515 , -0.4377176 ,
       -0.18485122, -0.47717794, -0.15147513,  1.34975203, -0.06388386,
        1.2794776 , -0.67413457, -2.03388881,  1.77891998,  3.45445178,
       -0.93258988,  0.39723041, -1.23677885,  1.87841988, -0.48846415,
        0.23898558,  0.08322678,  0.50841094, -0.59189042, -0.86218771,
        0.13808454, -1.420791  , -0.52815037,  0.37716549,  0.55944191,
       -0.81171679,  0.26489442, -0.62432789, -0.43654577,  0.226915  ,
       -1.01346821, -1.42727242, -0.45127134, -0.18215018,  1.11537106,
        0.30099939, -1.2661621 , -0.11895918,  1.69860201, -0.67702066,
       -1.00599679, -0.69448062, -0.94056253, -0.25849202, -0.08299086,
        1.48469908, -1.19206442,  0.95998195, -1.800488  , -0.04798554,
       -0.77909029,  0.9659936 , -0.38096705,  1.52306246, -2.033555  ,
       -1.82498521, -0.4745455 , -0.20866822,  2.01935722,  0.46819346,
        0.37152816, -0.20247084, -1.14168624,  1.04413851,  0.98376221,
        0.27129983, -0.66495964, -0.99604697,  0.31477433,  1.14606679,
        0.92117707,  0.91663896,  0.96625631, -2.00554469,  1.02536304,
       -0.63002324,  1.71252177, -0.65706596, -1.33159033,  0.08011075,
        1.62804803, -1.63617324,  1.42729399, -2.14112983,  0.95559999,
       -0.74515346,  1.29242505,  0.03208948, -0.45625835,  1.24445081,
       -1.53939509,  0.40075234, -0.97061926,  0.39624106, -0.14267309])
# Binarizer() is used with its default threshold of 0. reshape(-1, 1) turns
# the 1-D array into a single column (-1 lets NumPy infer the number of rows),
# which is the 2-D layout passed to fit_transform.
gau_bin = Binarizer().fit_transform(gau.reshape(-1, 1))   
# Flatten the result back to 1-D for display.
gau_bin.reshape(1,-1) [0]
array([0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1.,
       0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0.,
       1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
       0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0.,
       1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0., 1.,
       1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 0.])

项目案例

%matplotlib inline
import matplotlib.pyplot as plt
import cv2

# 写一个专门在Jupyter中显示图片的函数
def show_img(img):
    """Display an OpenCV image inline with matplotlib.

    A 3-dimensional array is treated as a BGR colour image and its channels
    are reordered to RGB before plotting; any other array is drawn with a
    grayscale colour map. Axes are hidden in both cases.
    """
    is_color = len(img.shape) == 3
    if is_color:
        # Split into single-channel planes, then merge them back in RGB order
        # (merge only accepts single-channel inputs).
        blue, green, red = cv2.split(img)
        plt.imshow(cv2.merge([red, green, blue]))
    else:
        plt.imshow(img, cmap="gray")
    plt.axis("off")
    plt.show()

# Read the image from disk and display it.
laoqi = cv2.imread("work/images/laoqi.png")
show_img(laoqi)

在这里插入图片描述

# Convert the BGR image to grayscale and display it.
gray_laoqi = cv2.cvtColor(laoqi, cv2.COLOR_BGR2GRAY)
show_img(gray_laoqi)

在这里插入图片描述

# cv2.threshold(img, threshold, maxval, type)
# threshold: the cutoff value, 127 here
# maxval: the value (255 here) assigned to pixels that pass the threshold test
# type: the binarization mode; with cv2.THRESH_BINARY, pixels greater than 127
#       are set to 255 and all other pixels are set to 0
ret,thr = cv2.threshold(gray_laoqi, 127, 255, cv2.THRESH_BINARY)
# thr is the thresholded image; display it.
show_img(thr)

在这里插入图片描述

动手练习

# Preview the first rows of marathon.csv before any conversion.
pd.read_csv("/home/aistudio/data/data20512/marathon.csv").head()
agegendersplitfinal
033M01:05:3802:08:51
132M01:06:2602:09:28
231M01:06:4902:10:42
338M01:06:1602:13:45
431M01:06:3202:13:59
import datetime

# In the marathon data, the "split" column holds the half-marathon time.
def convert_time(s):
    """Parse an ``"H:M:S"`` string into a ``datetime.timedelta``.

    Raises ``ValueError`` if the string does not consist of exactly three
    colon-separated integer fields.
    """
    hours, minutes, seconds = (int(field) for field in s.split(":"))
    return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)

# Load the marathon data, parsing the "split" and "final" columns into
# timedeltas with convert_time while reading.
marathon = pd.read_csv("/home/aistudio/data/data20512/marathon.csv",
                      converters={"split":convert_time, 
                                  "final":convert_time})
# Print the column dtypes to confirm the conversion, then preview the data.
print(marathon.dtypes)
marathon.head()
age                 int64
gender             object
split     timedelta64[ns]
final     timedelta64[ns]
dtype: object
agegendersplitfinal
033M0 days 01:05:380 days 02:08:51
132M0 days 01:06:260 days 02:09:28
231M0 days 01:06:490 days 02:10:42
338M0 days 01:06:160 days 02:13:45
431M0 days 01:06:320 days 02:13:59
# Convert the timedelta columns to elapsed seconds (float).
# `.dt.total_seconds()` replaces the manual nanosecond scaling
# (`astype(int) * 1e-9`). Unlike that form, it raises if this cell is run a
# second time on the already-converted columns, instead of silently
# rescaling the values again.
marathon['split'] = marathon['split'].dt.total_seconds()
marathon['final'] = marathon['final'].dt.total_seconds()

marathon.head()
agegendersplitfinal
033M3938.07731.0
132M3986.07768.0
231M4009.07842.0
338M3976.08025.0
431M3992.08039.0
# frac = (final - 2 * split) / final: the difference between the second-half
# and first-half times as a fraction of the total time. A positive value
# means the second half took longer than the first.
marathon['frac'] = 1 - 2 * marathon['split'] / marathon["final"]
marathon.head()
agegendersplitfinalfrac
033M3938.07731.0-0.018756
132M3986.07768.0-0.026262
231M4009.07842.0-0.022443
338M3976.08025.00.009097
431M3992.08039.00.006842
# Binarize frac: 0 where frac is positive, 1 otherwise.
marathon['split_frac'] = np.where(marathon['frac']>0, 0, 1)
# Show 10 randomly sampled rows to inspect the new column.
marathon.sample(10)
agegendersplitfinalfracsplit_frac
1019943M6524.014839.00.1206950
985336W6906.014724.00.0619400
138539M5304.011185.00.0515870
292351M5957.012162.00.0203910
3551342M9818.024291.00.1916350
281232M5390.012090.00.1083540
2148260M7353.017853.00.1762730
3182559M8110.021184.00.2343280
2518938W7816.018852.00.1708040
1595922W7438.016395.00.0926500
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

绿洲213

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值