Table of Contents
问题背景
最近在某互联网公司实习,leader提了需求要处理一个数据,原始数据长这个样子(注:为了数据隐私,已进行加密处理) 需求是这个样子的:
需求1 :将interests_news进行切分开成一条一条的,先处理一种情况:如果反斜杠只取反斜杠后面的标签,也就是二级标签。处理完成之后,如果是逗号连接,具体如下:每一个标签形成一行,后面对应三个变量保持一致。处理ok后的数据结构如下: 需求2 :根据上述结果进行分组计算,也即根据兴趣点标签进行groupby 算users、ret_users的总和 以及两个变量的比值最终结果如下:
总结遇到的坑
字符串处理:用 replace 的时候,单引号要用双引号去引,不能用单引号再引单引号。groupby 的时候如果少了元素,而方法本身没问题,那就是数据的坑——比如字符串有的多了空格、有的没有空格,看起来一样其实不一样。要考虑情况更加全面一点。
导入相应的库+定义函数
import pandas as pd
import copy
import time
def f1(x):
    """Split a comma-separated tag string into a list of tags.

    For any tag written as "level1/level2", keep only the part after
    the slash (the second-level tag).

    Parameters
    ----------
    x : str
        Comma-separated tags, e.g. "娱乐/明星八卦,娱乐".

    Returns
    -------
    list of str
        One entry per comma-separated tag.
    """
    # Comprehension replaces the original index loop; output is identical.
    return [t.split('/')[1] if '/' in t else t for t in x.split(',')]
处理流程
读入数据
# Load the raw interest-tag sheet and drop any row with missing values.
data = pd.read_excel('./兴趣点.xlsx', sheet_name='Sheet2').dropna()
print(data.shape)
data.head()
(42934, 5)
day interests_news users ret_users 次留 1 20190311 ["娱乐/明星八卦"] 363.829957 2087 0.521120 2 20190311 ["健康/养生"] 27.469583 602 0.398086 3 20190311 ["娱乐/明星八卦","娱乐"] 411.894976 475 0.025857 4 20190311 ["体育/NBA"] 47.831449 432 0.058352 5 20190311 ["社会"] 183.316248 401 0.103651
筛选数据
# Keep only rows whose user count exceeds 5, then renumber the index from 0.
keep = data['users'] > 5
data_10 = data.loc[keep].reset_index(drop=True)
print(data_10.shape)
data_10.head()
(403, 5)
day interests_news users ret_users 次留 0 20190311 ["娱乐/明星八卦"] 363.829957 2087 0.521120 1 20190311 ["健康/养生"] 27.469583 602 0.398086 2 20190311 ["娱乐/明星八卦","娱乐"] 411.894976 475 0.025857 3 20190311 ["体育/NBA"] 47.831449 432 0.058352 4 20190311 ["社会"] 183.316248 401 0.103651
字符串处理
# Work on a deep copy so data_10 itself stays untouched.
df = copy.deepcopy(data_10)

def _strip_wrappers(text, quote):
    # Remove the list brackets plus the given quote character.
    return text.replace('[', '').replace(']', '').replace(quote, '')

# 1) drop the ["..."] decoration, 2) cut tags down to second level via f1
#    (str() turns the resulting list back into "'a', 'b'" form), then
# 3) drop the brackets/single quotes that str() re-introduced.
df['interests_news'] = df['interests_news'].apply(lambda x: _strip_wrappers(x, '"'))
df['interests_news'] = df['interests_news'].map(f1).map(str)
df['interests_news'] = df['interests_news'].apply(lambda x: _strip_wrappers(x, "'"))
print(df.shape)
df.head()
(403, 5)
day interests_news users ret_users 次留 0 20190311 明星八卦 363.829957 2087 0.521120 1 20190311 养生 27.469583 602 0.398086 2 20190311 明星八卦, 娱乐 411.894976 475 0.025857 3 20190311 NBA 47.831449 432 0.058352 4 20190311 社会 183.316248 401 0.103651
对兴趣点变量进行拆分
思路梳理:
其实就是一个DataFrame的拼接,定义几个空list 然后对应的内容放进去 每次都形成一个新的DataFrame 形成过程先把数值不同的放在一起 把所有的DataFrame都放到一个list 最后进行concat
# Explode each comma-joined tag string into one row per tag, copying the
# metric columns unchanged onto every new row, then concatenate the pieces.
data_need = []
for i in range(len(df)):
    # The original copied s into int_news element by element; splitting
    # already yields the list we need.
    tags = df['interests_news'][i].split(',')
    y = pd.DataFrame({'interests_news': tags})
    # Scalar assignment broadcasts the row's metrics to every tag row.
    y['users'] = df['users'][i]
    y['ret_users'] = df['ret_users'][i]
    y['次留'] = df['次留'][i]
    data_need.append(y)
data_new = pd.concat(data_need, axis=0)
print(data_new.shape)
data_new.head()
(934, 4)
interests_news users ret_users 次留 0 明星八卦 363.829957 2087 0.521120 0 养生 27.469583 602 0.398086 0 明星八卦 411.894976 475 0.025857 1 娱乐 411.894976 475 0.025857 0 NBA 47.831449 432 0.058352
分组计算
思路梳理:
先进行 groupby,然后对分组后的结果进行循环,再对每个子块进行求和等计算;结果都放到一个 list 里面去,然后构建字典形成 DataFrame,最后进行一个合并。
# Fresh 0..n index, then strip stray spaces around tags so that "娱乐" and
# " 娱乐" fall into the same group (a real pitfall in this data).
data_new = data_new.reset_index(drop=True)
data_new['interests_news'] = data_new['interests_news'].apply(lambda x: x.strip())

# Vectorized replacement for the original per-group accumulation loop:
# sum both metrics per tag (groupby sorts keys, matching the old output
# order), then derive the ratio column from the sums.
y = data_new.groupby('interests_news', as_index=False)[['users', 'ret_users']].sum()
y['ratio'] = y['ret_users'] / y['users']

y.to_csv('大于5的用户数分组计算结果.csv', index=False, encoding='gbk')
print(y.shape)
y.head()
(102, 4)
interests_news users ret_users ratio 0 CBA 173.707081 256 1.473745 1 NBA 392.799830 884 2.250510 2 三农 87.064592 91 1.045201 3 世界史 67.310528 48 0.713113 4 两性 421.034683 405 0.961916
封装成函数
import pandas as pd
import copy
import time
def f1(x):
    """Split comma-joined tags; for "a/b"-style tags keep only "b"."""
    tags = x.split(',')
    # Rewrite multi-level tags in place, keeping single-level ones as-is.
    for idx, tag in enumerate(tags):
        if '/' in tag:
            tags[idx] = tag.split('/')[1]
    return tags
def Cleandata(path='./兴趣点.xlsx', sheet_name='Sheet2', min_users=5,
              out_path='大于5的用户数分组计算结果.csv'):
    """End-to-end cleaning pipeline for the interest-tag sheet.

    Steps: load + dropna -> keep rows with users > min_users -> strip the
    ["a/b", ...] decoration and keep second-level tags -> explode into one
    row per tag -> group by tag, summing users/ret_users and computing
    their ratio -> write the result to out_path and return its head.

    Parameters
    ----------
    path : str
        Excel workbook to read (default matches the original script).
    sheet_name : str
        Sheet inside the workbook.
    min_users : numeric
        Keep only rows whose 'users' value is strictly greater than this.
    out_path : str
        Destination CSV, written with gbk encoding.

    Returns
    -------
    pandas.DataFrame
        The first five rows of the grouped result.
    """
    t0 = time.time()

    # --- load & filter -------------------------------------------------
    data = pd.read_excel(path, sheet_name=sheet_name).dropna()
    print(data.shape)
    data_10 = data[data['users'] > min_users].reset_index(drop=True)
    print(data_10.shape)

    # --- string cleaning ----------------------------------------------
    df = copy.deepcopy(data_10)
    df['interests_news'] = df['interests_news'].apply(
        lambda x: x.replace('[', '').replace(']', '').replace('"', ''))
    # f1 keeps only the part after '/'; str() flattens the list so a
    # second replace pass can drop the re-introduced brackets/quotes.
    df['interests_news'] = df['interests_news'].map(f1).map(str)
    df['interests_news'] = df['interests_news'].apply(
        lambda x: x.replace('[', '').replace(']', '').replace("'", ''))

    # --- explode: one row per tag -------------------------------------
    parts = []
    for i in range(len(df)):
        tags = df['interests_news'][i].split(',')
        part = pd.DataFrame({'interests_news': tags})
        # Scalar assignment broadcasts the row's metrics to every tag row.
        part['users'] = df['users'][i]
        part['ret_users'] = df['ret_users'][i]
        part['次留'] = df['次留'][i]
        parts.append(part)
    data_new = pd.concat(parts, axis=0)
    print(data_new.shape)
    data_new = data_new.reset_index(drop=True)

    # --- group & aggregate --------------------------------------------
    # Strip stray spaces first so identical tags group together.
    data_new['interests_news'] = data_new['interests_news'].apply(lambda x: x.strip())
    y = data_new.groupby('interests_news', as_index=False)[['users', 'ret_users']].sum()
    y['ratio'] = y['ret_users'] / y['users']
    y.to_csv(out_path, index=False, encoding='gbk')

    t1 = time.time()
    print('数据清洗完毕,所需时间为 %.2f s' % (t1 - t0))
    return y.head()

Cleandata()
(42934, 5)
(403, 5)
(934, 4)
数据清洗完毕,所需时间为 3.58 s
interests_news users ret_users ratio 0 CBA 173.707081 256 1.473745 1 NBA 392.799830 884 2.250510 2 三农 87.064592 91 1.045201 3 世界史 67.310528 48 0.713113 4 两性 421.034683 405 0.961916
数据
上述代码所用到的数据
有兴趣的小伙伴可以自己尝试一波哦~