特征二值化
请参考《数据准备和特征工程》中的相关章节,调试如下代码。
基础知识
import pandas as pd
# Load the PM2.5 exposure table and preview its first rows.
pm25 = pd.read_csv(filepath_or_buffer="/home/aistudio/data/data20505/pm2.csv")
pm25.head(n=5)
| RANK | CITY_ID | CITY_NAME | Exposed days |
---|
0 | 1 | 594 | 拉萨 | 2 |
---|
1 | 2 | 579 | 玉溪 | 7 |
---|
2 | 3 | 263 | 厦门 | 8 |
---|
3 | 4 | 267 | 泉州 | 9 |
---|
4 | 5 | 271 | 漳州 | 10 |
---|
import numpy as np
# Binarize "Exposed days": 1 where the value is strictly above the column
# mean, 0 otherwise.
pm25['bdays'] = np.where(
    pm25["Exposed days"].gt(pm25["Exposed days"].mean()),
    1,
    0,
)
pm25.sample(n=10)
| RANK | CITY_ID | CITY_NAME | Exposed days | bdays |
---|
174 | 197 | 598 | 宝鸡 | 128 | 1 |
---|
260 | 283 | 364 | 安阳 | 246 | 1 |
---|
137 | 152 | 183 | 南通 | 103 | 0 |
---|
249 | 272 | 358 | 开封 | 210 | 1 |
---|
54 | 62 | 44 | 朔州 | 58 | 0 |
---|
217 | 240 | 315 | 东营 | 170 | 1 |
---|
86 | 96 | 86 | 抚顺 | 78 | 0 |
---|
77 | 86 | 303 | 上饶 | 71 | 0 |
---|
190 | 213 | 608 | 商洛 | 138 | 1 |
---|
180 | 203 | 438 | 岳阳 | 132 | 1 |
---|
from sklearn.preprocessing import Binarizer
# Same binarization via scikit-learn: values strictly greater than the
# threshold (here the column mean) map to 1, the rest to 0.
bn = Binarizer(threshold=pm25["Exposed days"].mean())
# A single-bracket selection is a 1-D Series; the transformer is fed the
# 2-D single-column frame selected below instead.
print(type(pm25["Exposed days"]))
result = bn.fit_transform(pm25.loc[:, ["Exposed days"]])
pm25['sk-bdays'] = result
pm25.sample(n=10)
| RANK | CITY_ID | CITY_NAME | Exposed days | bdays | sk-bdays |
---|
92 | 103 | 147 | 双鸭山 | 82 | 0 | 0 |
---|
12 | 13 | 273 | 南平 | 18 | 0 | 0 |
---|
175 | 198 | 545 | 遂宁 | 129 | 1 | 1 |
---|
157 | 179 | 449 | 怀化 | 119 | 1 | 1 |
---|
67 | 75 | 507 | 钦州 | 65 | 0 | 0 |
---|
40 | 45 | 88 | 丹东 | 49 | 0 | 0 |
---|
41 | 46 | 462 | 珠海 | 49 | 0 | 0 |
---|
121 | 133 | 230 | 衢州 | 94 | 0 | 0 |
---|
70 | 78 | 510 | 玉林 | 67 | 0 | 0 |
---|
176 | 199 | 199 | 泰州 | 130 | 1 | 1 |
---|
# Double brackets select a one-column DataFrame, so the shape is 2-D.
pm25[["Exposed days"]].shape
(264, 1)
# Single brackets select a Series, so the shape is 1-D.
pm25["Exposed days"].shape
(264,)
# Reshaping the underlying 1-D array to (-1, 1) yields a single-column 2-D array.
pm25["Exposed days"].values.reshape((-1, 1)).shape
(264, 1)
from sklearn.preprocessing import binarize
# Functional form of the same binarization; no estimator object is needed.
fbin = binarize(
    pm25.loc[:, ['Exposed days']],
    threshold=pm25['Exposed days'].mean(),
)
# Spot-check a handful of rows by position.
fbin[[1, 50, 100, 150, 200]]
array([[0],
[0],
[0],
[1],
[1]])
# Draw 100 samples from a standard normal distribution (mean 0, std 1).
gau = np.random.normal(0, 1.0, 100)
gau
array([-0.16138569, -1.12381876, 1.08345071, -0.3374515 , -0.4377176 ,
-0.18485122, -0.47717794, -0.15147513, 1.34975203, -0.06388386,
1.2794776 , -0.67413457, -2.03388881, 1.77891998, 3.45445178,
-0.93258988, 0.39723041, -1.23677885, 1.87841988, -0.48846415,
0.23898558, 0.08322678, 0.50841094, -0.59189042, -0.86218771,
0.13808454, -1.420791 , -0.52815037, 0.37716549, 0.55944191,
-0.81171679, 0.26489442, -0.62432789, -0.43654577, 0.226915 ,
-1.01346821, -1.42727242, -0.45127134, -0.18215018, 1.11537106,
0.30099939, -1.2661621 , -0.11895918, 1.69860201, -0.67702066,
-1.00599679, -0.69448062, -0.94056253, -0.25849202, -0.08299086,
1.48469908, -1.19206442, 0.95998195, -1.800488 , -0.04798554,
-0.77909029, 0.9659936 , -0.38096705, 1.52306246, -2.033555 ,
-1.82498521, -0.4745455 , -0.20866822, 2.01935722, 0.46819346,
0.37152816, -0.20247084, -1.14168624, 1.04413851, 0.98376221,
0.27129983, -0.66495964, -0.99604697, 0.31477433, 1.14606679,
0.92117707, 0.91663896, 0.96625631, -2.00554469, 1.02536304,
-0.63002324, 1.71252177, -0.65706596, -1.33159033, 0.08011075,
1.62804803, -1.63617324, 1.42729399, -2.14112983, 0.95559999,
-0.74515346, 1.29242505, 0.03208948, -0.45625835, 1.24445081,
-1.53939509, 0.40075234, -0.97061926, 0.39624106, -0.14267309])
# With the default threshold of 0, positive samples become 1 and the rest 0.
# The transformer is given a column vector, hence the reshape to (-1, 1).
gau_column = gau.reshape(-1, 1)
gau_bin = Binarizer().fit_transform(gau_column)
# Flatten the (100, 1) result back to 1-D for display.
gau_bin.ravel()
array([0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1.,
0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0.,
1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0.,
1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0., 1.,
1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 0.])
项目案例
%matplotlib inline
import matplotlib.pyplot as plt
import cv2
def show_img(img):
    """Display an image with matplotlib, without axes.

    A 3-dimensional array is split into three channels taken as B, G, R and
    re-merged in R, G, B order before display; any other array is shown with
    the gray colormap.
    """
    is_color = len(img.shape) == 3
    if is_color:
        blue, green, red = cv2.split(img)
        plt.imshow(cv2.merge([red, green, blue]))
    else:
        plt.imshow(img, cmap="gray")
    plt.axis("off")
    plt.show()
# cv2.imread signals a missing or unreadable file by returning None rather
# than raising, so fail here with a clear message instead of hitting an
# attribute error later inside show_img.
laoqi = cv2.imread("work/images/laoqi.png")
if laoqi is None:
    raise FileNotFoundError("Could not read image: work/images/laoqi.png")
show_img(laoqi)

# Collapse the BGR image to a single-channel grayscale image.
gray_laoqi = cv2.cvtColor(laoqi, cv2.COLOR_BGR2GRAY)
show_img(gray_laoqi)

# Image binarization: pixels above 127 become 255, all others become 0.
ret, thr = cv2.threshold(gray_laoqi, 127, 255, cv2.THRESH_BINARY)
show_img(thr)
动手练习
# Preview the raw marathon data before any parsing or conversion is applied.
pd.read_csv("/home/aistudio/data/data20512/marathon.csv").head()
| age | gender | split | final |
---|
0 | 33 | M | 01:05:38 | 02:08:51 |
---|
1 | 32 | M | 01:06:26 | 02:09:28 |
---|
2 | 31 | M | 01:06:49 | 02:10:42 |
---|
3 | 38 | M | 01:06:16 | 02:13:45 |
---|
4 | 31 | M | 01:06:32 | 02:13:59 |
---|
import datetime
def convert_time(s):
    """Parse an ``"H:M:S"`` string into a ``datetime.timedelta``.

    The string is split on ``":"`` and must yield exactly three integer
    fields; anything else raises ``ValueError``.
    """
    hours, minutes, seconds = (int(field) for field in s.split(":"))
    return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
# Reload the marathon data, parsing both time columns into timedeltas
# while the file is being read.
marathon = pd.read_csv(
    "/home/aistudio/data/data20512/marathon.csv",
    converters=dict.fromkeys(["split", "final"], convert_time),
)
print(marathon.dtypes)
marathon.head()
age int64
gender object
split timedelta64[ns]
final timedelta64[ns]
dtype: object
| age | gender | split | final |
---|
0 | 33 | M | 0 days 01:05:38 | 0 days 02:08:51 |
---|
1 | 32 | M | 0 days 01:06:26 | 0 days 02:09:28 |
---|
2 | 31 | M | 0 days 01:06:49 | 0 days 02:10:42 |
---|
3 | 38 | M | 0 days 01:06:16 | 0 days 02:13:45 |
---|
4 | 31 | M | 0 days 01:06:32 | 0 days 02:13:59 |
---|
# Convert both timedelta columns to elapsed seconds as floats.
# .dt.total_seconds() replaces `astype(int) * 1e-9`: it does not depend on
# the platform's default integer width, and it needs no manual nanosecond
# scaling. Re-running this cell on the already-converted float columns now
# raises an AttributeError instead of silently shrinking the values by
# another factor of 1e-9.
marathon['split'] = marathon['split'].dt.total_seconds()
marathon['final'] = marathon['final'].dt.total_seconds()
marathon.head()
| age | gender | split | final |
---|
0 | 33 | M | 3938.0 | 7731.0 |
---|
1 | 32 | M | 3986.0 | 7768.0 |
---|
2 | 31 | M | 4009.0 | 7842.0 |
---|
3 | 38 | M | 3976.0 | 8025.0 |
---|
4 | 31 | M | 3992.0 | 8039.0 |
---|
# Split fraction: 1 - 2 * split / final. It is positive when the first half
# took less than half of the final time, and negative when it took more.
marathon['frac'] = 1 - marathon['split'].mul(2).div(marathon["final"])
marathon.head()
| age | gender | split | final | frac |
---|
0 | 33 | M | 3938.0 | 7731.0 | -0.018756 |
---|
1 | 32 | M | 3986.0 | 7768.0 | -0.026262 |
---|
2 | 31 | M | 4009.0 | 7842.0 | -0.022443 |
---|
3 | 38 | M | 3976.0 | 8025.0 | 0.009097 |
---|
4 | 31 | M | 3992.0 | 8039.0 | 0.006842 |
---|
# Binarize the split fraction: 0 where frac is strictly positive, 1 otherwise.
marathon['split_frac'] = np.where(
    marathon['frac'].gt(0),
    0,
    1,
)
marathon.sample(n=10)
| age | gender | split | final | frac | split_frac |
---|
10199 | 43 | M | 6524.0 | 14839.0 | 0.120695 | 0 |
---|
9853 | 36 | W | 6906.0 | 14724.0 | 0.061940 | 0 |
---|
1385 | 39 | M | 5304.0 | 11185.0 | 0.051587 | 0 |
---|
2923 | 51 | M | 5957.0 | 12162.0 | 0.020391 | 0 |
---|
35513 | 42 | M | 9818.0 | 24291.0 | 0.191635 | 0 |
---|
2812 | 32 | M | 5390.0 | 12090.0 | 0.108354 | 0 |
---|
21482 | 60 | M | 7353.0 | 17853.0 | 0.176273 | 0 |
---|
31825 | 59 | M | 8110.0 | 21184.0 | 0.234328 | 0 |
---|
25189 | 38 | W | 7816.0 | 18852.0 | 0.170804 | 0 |
---|
15959 | 22 | W | 7438.0 | 16395.0 | 0.092650 | 0 |
---|