第七章数据清洗和处理

最新推荐文章于 2023-12-22 09:30:00 发布

热爱学习的小鲁同学

最新推荐文章于 2023-12-22 09:30:00 发布

阅读量2.5k

点赞数

分类专栏：《利用python进行数据分析》笔记文章标签： python

本文链接：https://blog.csdn.net/m0_45055763/article/details/124202695

版权

《利用python进行数据分析》笔记专栏收录该内容

7 篇文章 0 订阅

订阅专栏

7.1处理缺失数据

NaN（Not a Number）在pandas中表示缺失值

import pandas as pd
import numpy as np
string_data=pd.Series(['aardvark','artichoke',np.nan,'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

#判断缺失数据
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

#None在Series中也会被视为缺失值（与NaN一样被isnull识别）
string_data[0]=None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

dropna:删除缺失数据
fillna:用指定值或插值方法（如ffill、bfill）填充缺失数据
isnull: 返回布尔值，表明哪些是缺失值
notnull :isnull的反面

滤除缺失数据

from numpy import nan as NA
data=pd.Series([1,NA,3.5,NA,7])
data.dropna()#dropna返回含有非空数据和索引值的series

0    1.0
2    3.5
4    7.0
dtype: float64

data[data.notnull()]#与上面结果相同

0    1.0
2    3.5
4    7.0
dtype: float64

data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data

	0	1	2
0	1.0	6.5	3.0
1	1.0	NaN	NaN
2	NaN	NaN	NaN
3	NaN	6.5	3.0

cleaned=data.dropna()#dropna默认丢弃含有缺失值的行
cleaned

	0	1	2
0	1.0	6.5	3.0

data.dropna(how='all')#只丢弃全为NA的行

	0	1	2
0	1.0	6.5	3.0
1	1.0	NaN	NaN
3	NaN	6.5	3.0

data[4]=NA
data#第4列全为NA

	0	1	2	4
0	1.0	6.5	3.0	NaN
1	1.0	NaN	NaN	NaN
2	NaN	NaN	NaN	NaN
3	NaN	6.5	3.0	NaN

data.dropna(axis=1,how='all')#丢弃全为NA的列

	0	1	2
0	1.0	6.5	3.0
1	1.0	NaN	NaN
2	NaN	NaN	NaN
3	NaN	6.5	3.0

df=pd.DataFrame(np.random.randn(7,3))
df.iloc[:4,1]=NA
df.iloc[:2,2]=NA
df

	0	1	2
0	1.230124	NaN	NaN
1	-0.671868	NaN	NaN
2	-0.596658	NaN	0.002418
3	-1.061044	NaN	-0.246041
4	-0.677290	-1.394329	-1.870510
5	-0.313459	0.133874	-1.172282
6	-0.495465	-0.954127	0.150156

df.dropna()

	0	1	2
4	-0.677290	-1.394329	-1.870510
5	-0.313459	0.133874	-1.172282
6	-0.495465	-0.954127	0.150156

#thresh=n 表示保留至少含有n个非缺失值的行
df.dropna(thresh=2)

	0	1	2
2	-0.596658	NaN	0.002418
3	-1.061044	NaN	-0.246041
4	-0.677290	-1.394329	-1.870510
5	-0.313459	0.133874	-1.172282
6	-0.495465	-0.954127	0.150156

填充缺失数据

df.fillna(0)#以0替换缺失值

	0	1	2
0	1.230124	0.000000	0.000000
1	-0.671868	0.000000	0.000000
2	-0.596658	0.000000	0.002418
3	-1.061044	0.000000	-0.246041
4	-0.677290	-1.394329	-1.870510
5	-0.313459	0.133874	-1.172282
6	-0.495465	-0.954127	0.150156

#使用字典对不同列填值
df.fillna({1:0.5,2:0})

	0	1	2
0	1.230124	0.500000	0.000000
1	-0.671868	0.500000	0.000000
2	-0.596658	0.500000	0.002418
3	-1.061044	0.500000	-0.246041
4	-0.677290	-1.394329	-1.870510
5	-0.313459	0.133874	-1.172282
6	-0.495465	-0.954127	0.150156

_=df.fillna(0,inplace=True)#inplace=True会就地修改df并返回None；赋值给_只是为了不显示返回值，语句本身无论是否赋值都会执行
df

	0	1	2
0	1.230124	0.000000	0.000000
1	-0.671868	0.000000	0.000000
2	-0.596658	0.000000	0.002418
3	-1.061044	0.000000	-0.246041
4	-0.677290	-1.394329	-1.870510
5	-0.313459	0.133874	-1.172282
6	-0.495465	-0.954127	0.150156

df=pd.DataFrame(np.random.randn(6,3))

df.iloc[2:,1]=NA

df.iloc[4:,2]=NA

df

	0	1	2
0	0.536292	-0.231305	-0.944116
1	-0.216595	1.808402	1.086082
2	-0.457510	NaN	-0.617013
3	-0.163709	NaN	0.450099
4	0.969959	NaN	NaN
5	1.136978	NaN	NaN

df.fillna(method='ffill')#ffill:用前一个非缺失值去填充该缺失值

	0	1	2
0	0.536292	-0.231305	-0.944116
1	-0.216595	1.808402	1.086082
2	-0.457510	1.808402	-0.617013
3	-0.163709	1.808402	0.450099
4	0.969959	1.808402	0.450099
5	1.136978	1.808402	0.450099

df.fillna(method='ffill',limit=2)#最多连续填充NAN的数目是两个

	0	1	2
0	0.536292	-0.231305	-0.944116
1	-0.216595	1.808402	1.086082
2	-0.457510	1.808402	-0.617013
3	-0.163709	1.808402	0.450099
4	0.969959	NaN	0.450099
5	1.136978	NaN	0.450099

data=pd.Series([1., NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

data.fillna(data.mean())#使用平均值填充

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

fillna参数：

value:用于填充缺失值的标量值或字典对象
method：插值方式（'ffill'向前填充、'bfill'向后填充），默认为None，即不插值而按value填充
axis：默认为axis=0
inplace：如果为true，对原件更改
limit：向前/后可以连续填充最大数目

7.2数据转换

移除重复数据

data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
       'k2': [1, 1, 2, 3, 3, 4, 4]})
data

	k1	k2
0	one	1
1	two	1
2	one	2
3	two	3
4	one	3
5	two	4
6	two	4

#duplicated返回布尔型Series，表示各行是否与前面出现过的行重复
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

data.drop_duplicates()#返回dataframe，重复行会被移除

	k1	k2
0	one	1
1	two	1
2	one	2
3	two	3
4	one	3
5	two	4

data['v1']=range(7)
data

	k1	k2	v1
0	one	1	0
1	two	1	1
2	one	2	2
3	two	3	3
4	one	3	4
5	two	4	5
6	two	4	6

data.drop_duplicates(['k1'])#根据k1列判断重复项

	k1	k2	v1
0	one	1	0
1	two	1	1

#duplicated和drop_duplicates默认保留第一个出现的值，传入keep='last'则保留最后一个
data.drop_duplicates(['k1','k2'],keep='last')

	k1	k2	v1
0	one	1	0
1	two	1	1
2	one	2	2
3	two	3	3
4	one	3	4
6	two	4	6

利用函数或映射进行数据转换

data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

	food	ounces
0	bacon	4.0
1	pulled pork	3.0
2	bacon	12.0
3	Pastrami	6.0
4	corned beef	7.5
5	Bacon	8.0
6	pastrami	3.0
7	honey ham	5.0
8	nova lox	6.0

#编写一个映射,说明肉来自哪个动物
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}
meat_to_animal

{'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'}

lowercased=data['food'].str.lower()#使用series的str.lower(）方法小写
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

#Series.map接受函数或字典型对象，例如：Bacon→（小写）bacon→（映射）pig
data['animal']=lowercased.map(meat_to_animal)
data

	food	ounces	animal
0	bacon	4.0	pig
1	pulled pork	3.0	pig
2	bacon	12.0	pig
3	Pastrami	6.0	cow
4	corned beef	7.5	cow
5	Bacon	8.0	pig
6	pastrami	3.0	cow
7	honey ham	5.0	pig
8	nova lox	6.0	salmon

data['food']

0          bacon
1    pulled pork
2          bacon
3       Pastrami
4    corned beef
5          Bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

data['food'].str.lower().map(meat_to_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

data['food'].map(lambda x:meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

替换值

data=pd.Series([1,-999,2,-999,-1000,3])
data

0       1
1    -999
2       2
3    -999
4   -1000
5       3
dtype: int64

#用nan替换-999
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

#替换多个值：输入替换值的列表
data.replace([-999,-1000],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

#每个值有不同的替换值
data.replace([-999,-1000],[np.nan,0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

#传入字典
data.replace({-999:np.nan,-1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

重新命名轴索引

data = pd.DataFrame(np.arange(12).reshape((3, 4)),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'])
data

	one	two	three	four
Ohio	0	1	2	3
Colorado	4	5	6	7
New York	8	9	10	11

data.index

Index(['Ohio', 'Colorado', 'New York'], dtype='object')

transform=lambda x: x[:4].upper()
data.index.map(transform)#x[:4]是对索引中的每个元素（字符串）切片，而不是对整个索引切片

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

data.rename(index=str.title,columns=str.upper)

	ONE	TWO	THREE	FOUR
Ohio	0	1	2	3
Colorado	4	5	6	7
New York	8	9	10	11

#rename+字典型实现对轴标签更新
data.rename(index={'Ohio':'INDIANA'},
           columns={'three':'peekaboo'})

	one	two	peekaboo	four
INDIANA	0	1	2	3
Colorado	4	5	6	7
New York	8	9	10	11

#就地修改某个数据
data.rename(index={'Ohio':'INDIANA'},inplace=True)

data

	one	two	three	four
INDIANA	0	1	2	3
Colorado	4	5	6	7
New York	8	9	10	11

离散化和面元划分

假设有一组数据划分到不同的年龄组，如何操作？

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

bins=[18,25,35,60,100]#划分区间
cats=pd.cut(ages,bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

cats.codes#获得分组标签

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

pd.value_counts(cats)#每个间隔有几个

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

#更改闭端
pd.cut(ages,[18,26,36,61,100],right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

#传入列表或数组，设置每个面元名称(bin)
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

pd.cut(ages,bins,labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

#如果cut输入面元数量:根据最小值和最大值计算等长面元
data=np.random.rand(20)
pd.cut(data,4,precision=2)#precision=2，将面元边界的小数位数限定为2位

[(0.7, 0.92], (0.7, 0.92], (0.065, 0.28], (0.7, 0.92], (0.065, 0.28], ..., (0.7, 0.92], (0.49, 0.7], (0.28, 0.49], (0.7, 0.92], (0.065, 0.28]]
Length: 20
Categories (4, interval[float64]): [(0.065, 0.28] < (0.28, 0.49] < (0.49, 0.7] < (0.7, 0.92]]

#qcut根据样本分位数划分面元，使每个面元含有数量大致相同的数据点
data=np.random.randn(1000)
cats=pd.qcut(data,4)#4分位
cats

[(-3.5669999999999997, -0.673], (-3.5669999999999997, -0.673], (-0.673, -0.039], (-3.5669999999999997, -0.673], (-0.039, 0.631], ..., (-3.5669999999999997, -0.673], (-3.5669999999999997, -0.673], (-0.039, 0.631], (-3.5669999999999997, -0.673], (-3.5669999999999997, -0.673]]
Length: 1000
Categories (4, interval[float64]): [(-3.5669999999999997, -0.673] < (-0.673, -0.039] < (-0.039, 0.631] < (0.631, 3.121]]

pd.value_counts(cats)

(0.631, 3.121]                   250
(-0.039, 0.631]                  250
(-0.673, -0.039]                 250
(-3.5669999999999997, -0.673]    250
dtype: int64

#自定义分位数
pd.qcut(data,[0,0.1,0.5,0.9,1])

[(-1.294, -0.039], (-1.294, -0.039], (-1.294, -0.039], (-3.5669999999999997, -1.294], (-0.039, 1.256], ..., (-1.294, -0.039], (-1.294, -0.039], (-0.039, 1.256], (-1.294, -0.039], (-1.294, -0.039]]
Length: 1000
Categories (4, interval[float64]): [(-3.5669999999999997, -1.294] < (-1.294, -0.039] < (-0.039, 1.256] < (1.256, 3.121]]

检测和过滤异常值

import pandas as pd
import numpy as np

#产生正态分布数据
data=pd.DataFrame(np.random.randn(1000,4))
data.describe()

	0	1	2	3
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	0.007752	-0.032028	-0.037349	-0.036083
std	0.976770	0.974456	0.983429	1.015825
min	-3.357288	-3.298192	-2.813273	-3.235629
25%	-0.663744	-0.662803	-0.703169	-0.758496
50%	0.011468	-0.026073	-0.087028	-0.063262
75%	0.662235	0.581248	0.639237	0.642911
max	3.320222	2.833708	3.536139	2.816898

#找出第三列（列标签为2）中绝对值大于3的值
col=data[2]
col[np.abs(col)>3]

992    3.536139
Name: 2, dtype: float64

#选出data中含有绝对值大于3的值的所有行
data[(np.abs(data)>3).any(1)]

	0	1	2	3
61	0.502113	-3.298192	-1.445427	0.728776
643	0.430629	-3.060744	0.731826	-1.039144
678	0.115404	0.017918	0.058429	-3.235629
718	3.320222	0.486255	0.686823	0.966785
750	-3.206397	-1.836857	1.102002	-0.180903
813	-3.357288	-0.662363	-1.293561	-1.962479
824	-0.950342	2.208761	-0.203996	-3.059786
992	0.920673	-0.688196	3.536139	0.528149

data[np.abs(data)>3]=np.sign(data)*3
data.describe()#可以看出mean改变

	0	1	2	3
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	0.007996	-0.031669	-0.037885	-0.035788
std	0.973907	0.973313	0.981623	1.014933
min	-3.000000	-3.000000	-2.813273	-3.000000
25%	-0.663744	-0.662803	-0.703169	-0.758496
50%	0.011468	-0.026073	-0.087028	-0.063262
75%	0.662235	0.581248	0.639237	0.642911
max	3.000000	2.833708	3.000000	2.816898

#np.sign根据值的正负生成1和-1
np.sign(data).head()

	0	1	2	3
0	1.0	1.0	1.0	-1.0
1	-1.0	-1.0	-1.0	-1.0
2	1.0	-1.0	-1.0	-1.0
3	1.0	1.0	-1.0	1.0
4	1.0	1.0	-1.0	1.0

排列和随机采样

df=pd.DataFrame(np.arange(5*4).reshape(5,4))
df

	0	1	2	3
0	0	1	2	3
1	4	5	6	7
2	8	9	10	11
3	12	13	14	15
4	16	17	18	19

sampler=np.random.permutation(5)#产生一个随机排列
sampler

array([2, 1, 3, 0, 4])

df.take(sampler)#take按照给定位置索引

	0	1	2	3
2	8	9	10	11
1	4	5	6	7
3	12	13	14	15
0	0	1	2	3
4	16	17	18	19

df.sample(n=3)#在axis=0，随机抽样抽3个

	0	1	2	3
2	8	9	10	11
0	0	1	2	3
1	4	5	6	7

choices = pd.Series([5, 7, -1, 6, 4])

#replace=True表示有放回抽样（同一个值可以被重复抽到），因此n可以大于样本总数
draws=choices.sample(n=10,replace=True)
draws

0    5
1    7
4    4
1    7
0    5
2   -1
3    6
0    5
2   -1
4    4
dtype: int64

计算指标/哑变量

将分类变量转化为“哑变量”/指标矩阵

df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
     'data1': range(6)})
df

	key	data1
0	b	0
1	b	1
2	a	2
3	c	3
4	a	4
5	b	5

pd.get_dummies(df['key'])

	a	b	c
0	0	1	0
1	0	1	0
2	1	0	0
3	0	0	1
4	1	0	0
5	0	1	0

pd.get_dummies:pandas 实现one hot encode的方式

one-hot的基本思想：将离散型特征的每一种取值都看成一种状态，若你的这一特征中有N个不相同的取值，那么我们就可以将该特征抽象成N种不同的状态，one-hot编码保证了每一个取值只会使得一种状态处于“激活态”，也就是说这N种状态中只有一个状态位值为1，其他状态位都是0。

引用：https://zhuanlan.zhihu.com/p/139144355

#参数prefix：在列名前面加前缀
dummies = pd.get_dummies(df['key'], prefix='key')
dummies

	key_a	key_b	key_c
0	0	1	0
1	0	1	0
2	1	0	0
3	0	0	1
4	1	0	0
5	0	1	0

如果输入字符串'data1'，得到的结果为Series

df['data1']

0    0
1    1
2    2
3    3
4    4
5    5
Name: data1, dtype: int64

如果输入列表[‘data1’],则返回DataFrame

df[['data1']]

	data1
0	0
1	1
2	2
3	3
4	4
5	5

df_with_dummy=df[['data1']].join(dummies)#拼接两个dataframe
df_with_dummy

	data1	key_a	key_b	key_c
0	0	0	1	0
1	1	0	1	0
2	2	1	0	0
3	3	0	0	1
4	4	1	0	0
5	5	0	1	0

#多分类的处理

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('pydata-book/datasets/movielens/movies.dat', sep='::',
         header=None, names=mnames)
movies

E:\Anaconda\lib\site-packages\pandas\io\parsers.py:765: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  return read_csv(**locals())

	movie_id	title	genres
0	1	Toy Story (1995)	Animation\|Children's\|Comedy
1	2	Jumanji (1995)	Adventure\|Children's\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama
4	5	Father of the Bride Part II (1995)	Comedy
...	...	...	...
3878	3948	Meet the Parents (2000)	Comedy
3879	3949	Requiem for a Dream (2000)	Drama
3880	3950	Tigerland (2000)	Drama
3881	3951	Two Family House (2000)	Drama
3882	3952	Contender, The (2000)	Drama\|Thriller

3883 rows × 3 columns

movies.shape

(3883, 3)

#抽取不同的电影类型

all_genres=[]

for x in movies.genres:
    all_genres.extend(x.split('|'))
    
all_genres

['Animation',
 "Children's",
 'Comedy',
 'Adventure',
 "Children's",
 'Fantasy',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Comedy',
 'Action',
 'Crime',
 'Thriller',
 'Comedy',
 'Romance',
 'Adventure',
 "Children's",
 'Action',
 'Action',
 'Adventure',
 'Thriller',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Horror',
 'Animation',
 "Children's",
 'Drama',
 'Action',
 'Adventure',
 'Romance',
 'Drama',
 'Thriller',
 'Drama',
 'Romance',
 'Thriller',
 'Comedy',
 'Action',
 'Action',
 'Comedy',
 'Drama',
 'Crime',
 'Drama',
 'Thriller',
 'Thriller',
 'Drama',
 'Sci-Fi',
 'Drama',
 'Romance',
 'Drama',
 'Drama',
 'Romance',
 'Adventure',
 'Sci-Fi',
 'Drama',
 'Drama',
 'Drama',
 'Sci-Fi',
 'Adventure',
 'Romance',
 "Children's",
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Drama',
 'Documentary',
 'Comedy',
 'Comedy',
 'Romance',
 'Drama',
 'Drama',
 'War',
 'Action',
 'Crime',
 'Drama',
 'Drama',
 'Action',
 'Adventure',
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Crime',
 'Thriller',
 'Animation',
 "Children's",
 'Musical',
 'Romance',
 'Drama',
 'Romance',
 'Crime',
 'Thriller',
 'Action',
 'Drama',
 'Thriller',
 'Comedy',
 'Drama',
 "Children's",
 'Comedy',
 'Drama',
 'Adventure',
 "Children's",
 'Fantasy',
 'Drama',
 'Drama',
 'Romance',
 'Drama',
 'Mystery',
 'Adventure',
 "Children's",
 'Fantasy',
 'Drama',
 'Thriller',
 'Drama',
 'Comedy',
 'Comedy',
 'Romance',
 'Comedy',
 'Sci-Fi',
 'Thriller',
 'Drama',
 'Comedy',
 'Romance',
 'Comedy',
 'Action',
 'Comedy',
 'Crime',
 'Horror',
 'Thriller',
 'Action',
 'Comedy',
 'Drama',
 'Drama',
 'Musical',
 'Drama',
 'Romance',
 'Comedy',
 'Drama',
 'Sci-Fi',
 'Thriller',
 'Documentary',
 'Drama',
 'Drama',
 'Thriller',
 'Drama',
 'Crime',
 'Drama',
 'Romance',
 'Drama',
 'Drama',
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Adventure',
 'Drama',
 "Children's",
 'Comedy',
 'Comedy',
 'Action',
 'Thriller',
 'Drama',
 'Drama',
 'Thriller',
 'Comedy',
 'Romance',
 'Drama',
 'Action',
 'Thriller',
 'Comedy',
 'Drama',
 'Action',
 'Thriller',
 'Documentary',
 'Drama',
 'Thriller',
 'Comedy',
 'Comedy',
 'Thriller',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Drama',
 'Adventure',
 "Children's",
 'Comedy',
 'Musical',
 'Documentary',
 'Comedy',
 'Action',
 'Drama',
 'War',
 'Drama',
 'Thriller',
 'Action',
 'Adventure',
 'Crime',
 'Drama',
 'Mystery',
 'Drama',
 'Comedy',
 'Documentary',
 'Crime',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Drama',
 'Comedy',
 'Romance',
 'Drama',
 'Mystery',
 'Romance',
 'Drama',
 'Comedy',
 'Adventure',
 "Children's",
 'Fantasy',
 'Drama',
 'Documentary',
 'Comedy',
 'Romance',
 'Drama',
 'Drama',
 'Romance',
 'Thriller',
 'Comedy',
 'Drama',
 'Documentary',
 'Comedy',
 'Documentary',
 'Documentary',
 'Drama',
 'Action',
 'Drama',
 'Drama',
 'Romance',
 'Comedy',
 'Drama',
 'Drama',
 'Comedy',
 'Action',
 'Adventure',
 "Children's",
 'Drama',
 'Drama',
 'Crime',
 'Drama',
 'Thriller',
 'Drama',
 'Drama',
 'Romance',
 'War',
 'Horror',
 'Action',
 'Adventure',
 'Comedy',
 'Crime',
 'Drama',
 'Drama',
 'War',
 'Comedy',
 'Comedy',
 'War',
 'Adventure',
 "Children's",
 'Drama',
 'Action',
 'Adventure',
 'Mystery',
 'Sci-Fi',
 'Drama',
 'Thriller',
 'War',
 'Documentary',
 'Action',
 'Romance',
 'Thriller',
 'Crime',
 'Film-Noir',
 'Mystery',
 'Thriller',
 'Action',
 'Thriller',
 'Comedy',
 'Drama',
 'Drama',
 'Action',
 'Adventure',
 'Drama',
 'Romance',
 'Adventure',
 "Children's",
 'Drama',
 'Action',
 'Crime',
 'Thriller',
 'Comedy',
 'Action',
 'Sci-Fi',
 'Thriller',
 'Action',
 'Adventure',
 'Sci-Fi',
 'Comedy',
 'Drama',
 'Comedy',
 'Horror',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Action',
 "Children's",
 'Drama',
 'Romance',
 'Thriller',
 'Drama',
 'Sci-Fi',
 'Thriller',
 'Comedy',
 'Comedy',
 'Horror',
 'Comedy',
 'Thriller',
 'Drama',
 'Documentary',
 'Drama',
 'Drama',
 'Comedy',
 'Drama',
 'Romance',
 'Horror',
 'Sci-Fi',
 'Drama',
 'Action',
 'Crime',
 'Sci-Fi',
 'Drama',
 'Musical',
 'Thriller',
 'Drama',
 'Drama',
 'Romance',
 'Comedy',
 'Action',
 'Comedy',
 'Drama',
 'Documentary',
 'Drama',
 'Romance',
 'Action',
 'Adventure',
 'Drama',
 'Western',
 'Drama',
 'Comedy',
 'Drama',
 'Drama',
 'Drama',
 'Romance',
 'Comedy',
 'Drama',
 'Thriller',
 'Comedy',
 'Drama',
 'Drama',
 'Horror',
 'Drama',
 'Romance',
 'Comedy',
 'Comedy',
 'Drama',
 'Romance',
 'Drama',
 'Thriller',
 'Thriller',
 'Action',
 'Comedy',
 'Drama',
 'Thriller',
 'Drama',
 'Thriller',
 'Comedy',
 'Comedy',
 'Drama',
 'Drama',
 'Comedy',
 'Comedy',
 'Drama',
 'Comedy',
 'Romance',
 'Comedy',
 'Romance',
 'Adventure',
 "Children's",
 'Animation',
 "Children's",
 'Comedy',
 'Romance',
 'Thriller',
 "Children's",
 'Drama',
 'Drama',
 'Musical',
 'Comedy',
 'Animation',
 "Children's",
 'Crime',
 'Drama',
 'Documentary',
 'Drama',
 'Fantasy',
 'Romance',
 'Thriller',
 'Comedy',
 'Drama',
 'Romance',
 "Children's",
 'Comedy',
 'Action',
 'Comedy',
 'Romance',
 'Drama',
 'Horror',
 'Drama',
 'Comedy',
 'Comedy',
 'Sci-Fi',
 'Mystery',
 'Thriller',
 'Adventure',
 "Children's",
 'Comedy',
 'Fantasy',
 'Romance',
 'Crime',
 'Drama',
 'Thriller',
 'Action',
 'Adventure',
 'Fantasy',
 'Sci-Fi',
 'Drama',
 "Children's",
 'Drama',
 'Drama',
 'Drama',
 'Drama',
 'Romance',
 'Drama',
 'Romance',
 'War',
 'Western',
 'Comedy',
 'Drama',
 'Drama',
 'Drama',
 'Romance',
 'Drama',
 'Drama',
 'Drama',
 'Horror',
 'Comedy',
 'Comedy',
 'Comedy',
 'Romance',
 'Drama',
 'Comedy',
 'Drama',
 'Drama',
 'Thriller',
 'Drama',
 'Drama',
 'Crime',
 'Drama',
 'Action',
 'Crime',
 'Drama',
 'Horror',
 'Action',
 'Sci-Fi',
 'Thriller',
 'Comedy',
 'Romance',
 'Action',
 'Thriller',
 'Comedy',
 'Romance',
 'Crime',
 'Drama',
 'Thriller',
 'Action',
 'Drama',
 'Thriller',
 'Crime',
 'Drama',
 'Romance',
 'Thriller',
 'Comedy',
 'Romance',
 'Comedy',
 'Romance',
 'Crime',
 'Drama',
 'Drama',
 'Comedy',
 'Drama',
 'Drama',
 'Drama',
 'Romance',
 'Drama',
 'Romance',
 'Action',
 'Adventure',
 'Western',
 'Comedy',
 'Drama',
 'Comedy',
 'Drama',
 'Drama',
 'Drama',
 'Drama',
 'Comedy',
 'Horror',
 'Thriller',
 'Comedy',
 'Animation',
 "Children's",
 'Drama',
 'Action',
 'Action',
 'Adventure',
 'Sci-Fi',
 "Children's",
 'Comedy',
 'Fantasy',
 'Drama',
 'Thriller',
 'Film-Noir',
 'Thriller',
 'Drama',
 'Comedy',
 'Drama',
 'Comedy',
 'Comedy',
 'Drama',
 'Action',
 'Comedy',
 'Musical',
 'Sci-Fi',
 'Horror',
 'Action',
 'Adventure',
 'Sci-Fi',
 'Comedy',
 'Horror',
 'Drama',
 'Horror',
 'Sci-Fi',
 'Comedy',
 'Drama',
 'Mystery',
 'Thriller',
 'Drama',
 'War',
 'Drama',
 'Sci-Fi',
 'Thriller',
 'Comedy',
 'Romance',
 'Adventure',
 'Drama',
 'Drama',
 'Comedy',
 'Romance',
 "Children's",
 'Comedy',
 'Comedy',
 'Drama',
 'Drama',
 'Musical',
 'Drama',
 'Comedy',
 'Action',
 'Adventure',
 'Thriller',
 'Drama',
 'Mystery',
 'Thriller',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Action',
 'Romance',
 'Thriller',
 'Drama',
 "Children's",
 'Comedy',
 'Comedy',
 'Romance',
 'War',
 'Comedy',
 'Romance',
 'Drama',
 'Comedy',
 'Drama',
 'Romance',
 'Action',
 'Comedy',
 'Drama',
 'Romance',
 'Adventure',
 "Children's",
 'Romance',
 'Documentary',
 'Animation',
 "Children's",
 'Musical',
 'Drama',
 'Horror',
 'Comedy',
 'Crime',
 'Fantasy',
 'Action',
 'Comedy',
 'Western',
 'Drama',
 'Comedy',
 'Comedy',
 'Drama',
 'Comedy',
 'Drama',
 'Thriller',
 "Children's",
 'Comedy',
 'Drama',
 'Action',
 'Thriller',
 'Action',
 'Romance',
 'Thriller',
 'Comedy',
 'Romance',
 'Action',
 'Sci-Fi',
 'Action',
 'Adventure',
 'Comedy',
 'Romance',
 'Drama',
 'Drama',
 'Horror',
 'Western',
 'Action',
 'Drama',
 'Drama',
 'Action',
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'War',
 'Action',
 'Comedy',
 'Drama',
 'Crime',
 'Drama',
 'Adventure',
 "Children's",
 'Action',
 'Action',
 'Drama',
 'Drama',
 'Horror',
 'Documentary',
 'Drama',
 'Drama',
 'Action',
 'Thriller',
 'Comedy',
 'Comedy',
 'Crime',
 'Drama',
 'Documentary',
 'Action',
 'Sci-Fi',
 'Drama',
 'Horror',
 'Thriller',
 'Drama',
 'Drama',
 'Comedy',
 'Comedy',
 'Drama',
 'Comedy',
 'Comedy',
 'Comedy',
 'Thriller',
 'Western',
 'Comedy',
 'Romance',
 'Drama',
 'Comedy',
 'Action',
 'Comedy',
 'Adventure',
 "Children's",
 'Thriller',
 'Action',
 'Thriller',
 'Drama',
 'Drama',
 'Romance',
 'Horror',
 'Sci-Fi',
 'Thriller',
 'Mystery',
 'Romance',
 'Thriller',
 'Drama',
 'Comedy',
 'Drama',
 'Crime',
 'Drama',
 'Comedy',
 'Western',
 'Comedy',
 'Action',
 'Adventure',
 'Crime',
 'Comedy',
 'Sci-Fi',
 'Drama',
 'Thriller',
 'Comedy',
 'Action',
 'Comedy',
 'Drama',
 'Comedy',
 'Romance',
 'Comedy',
 'Action',
 'Sci-Fi',
 'Documentary',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Romance',
 'Drama',
 'Comedy',
 'Comedy',
 'Drama',
 'Drama',
 'Mystery',
 'Romance',
 'Drama',
 'Comedy',
 'Drama',
 'Thriller',
 'Adventure',
 "Children's",
 'Drama',
 'Drama',
 'Action',
 'Thriller',
 'Drama',
 'Western',
 'Action',
 'Comedy',
 'Drama',
 'Romance',
 'Action',
 'Adventure',
 'Crime',
 'Drama',
 'Thriller',
 'Action',
 'Adventure',
 'Crime',
 'Thriller',
 'Action',
 'Drama',
 'War',
 'Action',
 'Comedy',
 'War',
 'Comedy',
 'Comedy',
 'Romance',
 'Drama',
 'Romance',
 'Comedy',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Comedy',
 'War',
 'Action',
 'Thriller',
 'Drama',
 'Comedy',
 'Drama',
 'Drama',
 'Comedy',
 'Action',
 'Action',
 'Adventure',
 'Sci-Fi',
 'Drama',
 'Thriller',
 'Thriller',
 'Drama',
 'Adventure',
 "Children's",
 'Action',
 'Comedy',
 'Comedy',
 'Comedy',
 'Western',
 'Drama',
 'Comedy',
 'Thriller',
 'Drama',
 'Comedy',
 'Mystery',
 'Action',
 'Crime',
 'Drama',
 'Action',
 'Thriller',
 'Drama',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Romance',
 'Drama',
 'Romance',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Action',
 "Children's",
 'Drama',
 'Action',
 'Sci-Fi',
 'Comedy',
 'Drama',
 'Action',
 'Drama',
 'Drama',
 'Drama',
 'Romance',
 'Drama',
 'Action',
 'Drama',
 'Horror',
 'Sci-Fi',
 'Comedy',
 'Mystery',
 'Romance',
 'Comedy',
 'Drama',
 'Comedy',
 'Drama',
 'War',
 'Action',
 'Drama',
 'Mystery',
 'Comedy',
 'Sci-Fi',
 'Thriller',
 'Comedy',
 'Crime',
 'Thriller',
 'Action',
 'Drama',
 'Drama',
 'Drama',
 'Drama',
 'Drama',
 'Drama',
 'War',
 'Drama',
 'Drama',
 'Drama',
 "Children's",
 'Drama',
 'Comedy',
 'Crime',
 'Horror',
 'Action',
 'Drama',
 'Romance',
 'Drama',
 'Drama',
 'Comedy',
 'Drama',
 'Drama',
 'Comedy',
 'Romance',
 'Thriller',
 'Film-Noir',
 'Sci-Fi',
 'Comedy',
 'Comedy',
 'Romance',
 'Thriller',
 'Action',
 'Drama',
 'Action',
 'Adventure',
 "Children's",
 'Sci-Fi',
 'Action',
 'Adventure',
 'Thriller',
 'Action',
 'Documentary',
 'Comedy',
 'Romance',
 "Children's",
 'Comedy',
 'Musical',
 'Action',
 'Adventure',
 'Comedy',
 'Western',
 'Thriller',
 'Action',
 'Crime',
 'Romance',
 'Documentary',
 'Drama',
 'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Fantasy',
 'Comedy',
 'Drama',
 'Thriller',
 'Comedy',
 'Drama',
 'Drama',
 'Comedy',
 'Horror',
 'Comedy',
 'Romance',
 'Drama',
 'Comedy',
 'Drama',
 "Children's",
 'Comedy',
 'Comedy',
 'Drama',
 'Drama',
 'Drama',
 'Drama',
 'Comedy',
 'Drama',
 "Children's",
 'Comedy',
 'Comedy',
 'Adventure',
 "Children's",
 'Drama',
 'Mystery',
 'Thriller',
 'Drama',
 'Documentary',
 'Comedy',
 'Comedy',
 'Drama',
 'Drama',
 'Comedy',
 "Children's",
 'Comedy',
 'Comedy',
 'Romance',
 'Thriller',
 'Animation',
 "Children's",
 'Comedy',
 'Musical',
 'Action',
 'Sci-Fi',
 'Thriller',
 'Adventure',
 ...]

extend方法：把另一个列表（或其他可迭代对象）中的每个元素逐个追加到列表尾部，就地修改原列表。

split函数：通过指定分隔符对字符串进行切片

genres=pd.unique(all_genres)
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

#构建指标dataframe方法之一是从一个全零dataframe开始

zero_matrix=np.zeros((len(movies),len(genres)))#构建零矩阵
dummies=pd.DataFrame(zero_matrix,columns=genres)#每行为一个样本，每列为一个电影类型
dummies

	Animation	Children's	Comedy	Adventure	Fantasy	Romance	Drama	Action	Crime	Thriller	Horror	Sci-Fi	Documentary	War	Musical	Mystery	Film-Noir	Western
0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
3878	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3879	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3880	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3881	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3882	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

3883 rows × 18 columns

gen = movies.genres[0]
gen

"Animation|Children's|Comedy"

gen.split('|')

['Animation', "Children's", 'Comedy']

dummies.columns.get_indexer(gen.split('|'))#在dummies的列索引

array([0, 1, 2], dtype=int64)

pandas.Index.get_indexer(target, method=None, limit=None, tolerance=None)

作用：确定target的值在给的pandas的index的位置
target:输入的索引
method:选的方法，包括ffill，bfill
returns:由0到n - 1的整数组成的数组，表示target中各值在索引中的位置；在索引中找不到的值用-1标记。

#每个样本使用one-hot编码
for i ,gen in enumerate(movies.genres):
    indices=dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i,indices]=1
    
dummies

	Animation	Children's	Comedy	Adventure	Fantasy	Romance	Drama	Action	Crime	Thriller	Horror	Sci-Fi	Documentary	War	Musical	Mystery	Film-Noir	Western
0	1.0	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	0.0	1.0	0.0	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	1.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	1.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
3878	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3879	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3880	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3881	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3882	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

3883 rows × 18 columns

#将one-hot和原始列表拼起来

movies_windic=movies.join(dummies.add_prefix('Genre_'))
movies_windic

	movie_id	title	genres	Genre_Animation	Genre_Children's	Genre_Comedy	Genre_Adventure	Genre_Fantasy	Genre_Romance	Genre_Drama	...	Genre_Crime	Genre_Thriller	Genre_Horror	Genre_Sci-Fi	Genre_Documentary	Genre_War	Genre_Musical	Genre_Mystery	Genre_Film-Noir	Genre_Western
0	1	Toy Story (1995)	Animation\|Children's\|Comedy	1.0	1.0	1.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	2	Jumanji (1995)	Adventure\|Children's\|Fantasy	0.0	1.0	0.0	1.0	1.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	3	Grumpier Old Men (1995)	Comedy\|Romance	0.0	0.0	1.0	0.0	0.0	1.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	4	Waiting to Exhale (1995)	Comedy\|Drama	0.0	0.0	1.0	0.0	0.0	0.0	1.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	5	Father of the Bride Part II (1995)	Comedy	0.0	0.0	1.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
3878	3948	Meet the Parents (2000)	Comedy	0.0	0.0	1.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3879	3949	Requiem for a Dream (2000)	Drama	0.0	0.0	0.0	0.0	0.0	0.0	1.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3880	3950	Tigerland (2000)	Drama	0.0	0.0	0.0	0.0	0.0	0.0	1.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3881	3951	Two Family House (2000)	Drama	0.0	0.0	0.0	0.0	0.0	0.0	1.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3882	3952	Contender, The (2000)	Drama\|Thriller	0.0	0.0	0.0	0.0	0.0	0.0	1.0	...	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

3883 rows × 21 columns

movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western                                  0
Name: 0, dtype: object

7.3字符串操作

字符串对象方法

#以逗号分隔的字符串可用split分隔
val = 'a,b, guido'
val.split(',')

['a', 'b', ' guido']

val= 'a,b, guido '
pieces=[x.strip() for x in val.split(',')]
pieces#guido后面空格消失

['a', 'b', 'guido']

#使用加法将字符串以双冒号为分隔符连接
first,second,third=pieces
first+'::'+second+'::'+third

'a::b::guido'

语法： ‘sep’.join(seq)

参数说明
sep：分隔符。可以为空
seq：要连接的元素序列、字符串、元组、字典
上面的语法即：以sep作为分隔符，将seq所有的元素合并成一个新的字符串

返回值：返回一个以分隔符sep连接各个元素后生成的字符串

#join方法传入列表/元组
'::'.join(pieces)

'a::b::guido'

#in关键字判断子串是否在字符串中
'guido' in val

True

index() 方法查找指定值的首次出现。

如果找不到该值，index() 方法将引发异常。

index() 方法与 find() 方法几乎相同，唯一的区别是，如果找不到该值，则 find() 方法将返回 -1。

val.index(',')

val.find(':')

-1

val.count(',')#计算子串出现次数

#使用replace替换
val.replace(',','::')

'a::b:: guido '

在这里插入图片描述

正则表达式

import re
text = "foo bar\t baz \tqux"

re.split('\s+',text)#\s+表示匹配一个或多个连续的空白字符

['foo', 'bar', 'baz', 'qux']

#re.compile自己编译regex以得到一个可重用的regex对象：
regex=re.compile('\s+')

regex.split(text)

['foo', 'bar', 'baz', 'qux']

#返回的是字符串中所有的匹配项(可以理解为找符合'\s+'样式的)
regex.findall(text)

[' ', '\t ', ' \t']

pandas的矢量化字符串函数

data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
         'Rob': 'rob@gmail.com', 'Wes': np.nan}

data=pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

#通过str.contains函数：测试pattern或regex是否包含在Series或Index的字符串中。
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

热爱学习的小鲁同学

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
第七章数据清洗和处理

7.1处理缺失数据Nan（not a number ）在pandas表示缺失值import pandas as pdimport numpy as npstring_data=pd.Series(['aardvark','artichoke',np.nan,'avocado'])string_data0 aardvark1 artichoke2 NaN3 avocadodtype: object#判断缺失数据string_data.isn
复制链接

扫一扫