import pandas as pd
# 读取movie数据集,选取所有演员名和其Facebook likes
movie = pd.read_csv('data/movie.csv')
actor = movie[['movie_title', 'actor_1_name', 'actor_2_name', 'actor_3_name',
'actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes']]
actor.head()
movie_title | actor_1_name | actor_2_name | actor_3_name | actor_1_facebook_likes | actor_2_facebook_likes | actor_3_facebook_likes |
---|
0 | Avatar | CCH Pounder | Joel David Moore | Wes Studi | 1000.0 | 936.0 | 855.0 |
---|
1 | Pirates of the Caribbean: At World's End | Johnny Depp | Orlando Bloom | Jack Davenport | 40000.0 | 5000.0 | 1000.0 |
---|
2 | Spectre | Christoph Waltz | Rory Kinnear | Stephanie Sigman | 11000.0 | 393.0 | 161.0 |
---|
3 | The Dark Knight Rises | Tom Hardy | Christian Bale | Joseph Gordon-Levitt | 27000.0 | 23000.0 | 23000.0 |
---|
4 | Star Wars: Episode VII - The Force Awakens | Doug Walker | Rob Walker | NaN | 131.0 | 12.0 | NaN |
---|
# 创建一个自定义函数,用来改变列名。wide_to_long要求分组的变量要有相同的数字结尾:
# 使用wide_to_long函数时,要求 1,2,3 这样的顺序信息在列名的最后,并用分隔符隔开。
# 加载数据,调整名字:
def change_col_name(col_name):
col_name = col_name.replace('_name', '')#把_name换成空白
if 'facebook' in col_name:
fb_idx = col_name.find('facebook')
col_name = col_name[:5] + col_name[fb_idx -1:] + col_name[5:fb_idx-1]
#print(fb_idx -1) #7,第二个数值从第7位开始取值最后。
return col_name
actor2 = actor.rename(columns=change_col_name)
actor2.head()
movie_title | actor_1 | actor_2 | actor_3 | actor_facebook_likes_1 | actor_facebook_likes_2 | actor_facebook_likes_3 |
---|
0 | Avatar | CCH Pounder | Joel David Moore | Wes Studi | 1000.0 | 936.0 | 855.0 |
---|
1 | Pirates of the Caribbean: At World's End | Johnny Depp | Orlando Bloom | Jack Davenport | 40000.0 | 5000.0 | 1000.0 |
---|
2 | Spectre | Christoph Waltz | Rory Kinnear | Stephanie Sigman | 11000.0 | 393.0 | 161.0 |
---|
3 | The Dark Knight Rises | Tom Hardy | Christian Bale | Joseph Gordon-Levitt | 27000.0 | 23000.0 | 23000.0 |
---|
4 | Star Wars: Episode VII - The Force Awakens | Doug Walker | Rob Walker | NaN | 131.0 | 12.0 | NaN |
---|
# 使用wide_to_long函数,同时stack两列actor和Facebook
stubs = ['actor', 'actor_facebook_likes']
actor2_tidy = pd.wide_to_long(actor2, stubnames=stubs, i=['movie_title'],j='actor_num',sep='_').reset_index()
actor2_tidy.head()
| movie_title | actor_num | actor | actor_facebook_likes |
---|
0 | Avatar | 1 | CCH Pounder | 1000.0 |
---|
1 | Pirates of the Caribbean: At World's End | 1 | Johnny Depp | 40000.0 |
---|
2 | Spectre | 1 | Christoph Waltz | 11000.0 |
---|
3 | The Dark Knight Rises | 1 | Tom Hardy | 27000.0 |
---|
4 | Star Wars: Episode VII - The Force Awakens | 1 | Doug Walker | 131.0 |
---|
#wide_to_long()基本语法如下:
'''
pd.wide_to_long(df, stubnames(提取以指定字符串开头的列),
i(用作索引的列),
j(提取开头后剩余的部分会成一列,在此指定列名),
sep(分隔符),
suffix(捕获正则表达式匹配的后缀)
)
'''
# 加载数据
df = pd.read_csv('data/stackme.csv')
df
| State | Country | a1 | b2 | Test | d | e |
---|
0 | TX | US | 0.45 | 0.3 | Test1 | 2 | 6 |
---|
1 | MA | US | 0.03 | 1.2 | Test2 | 9 | 7 |
---|
2 | ON | CAN | 0.70 | 4.2 | Test3 | 4 | 2 |
---|
# 对列重命名
df2 = df.rename(columns = {'a1':'group1_a1', 'b2':'group1_b2',
'd':'group2_a1', 'e':'group2_b2'})
df2
| State | Country | group1_a1 | group1_b2 | Test | group2_a1 | group2_b2 |
---|
0 | TX | US | 0.45 | 0.3 | Test1 | 2 | 6 |
---|
1 | MA | US | 0.03 | 1.2 | Test2 | 9 | 7 |
---|
2 | ON | CAN | 0.70 | 4.2 | Test3 | 4 | 2 |
---|
# 设定stubnames=['group1', 'group2'],对任何数字都起作用
pd.wide_to_long(df2, stubnames=['group1', 'group2'],
i=['State', 'Country', 'Test'],
j='Label',
suffix='.+',
sep='_')
| | | | group1 | group2 |
---|
State | Country | Test | Label | | |
---|
TX | US | Test1 | a1 | 0.45 | 2 |
---|
b2 | 0.30 | 6 |
---|
MA | US | Test2 | a1 | 0.03 | 9 |
---|
b2 | 1.20 | 7 |
---|
ON | CAN | Test3 | a1 | 0.70 | 4 |
---|
b2 | 4.20 | 2 |
---|