分开拼接,split and join

万年搞不懂的分开拼接,split and join。必须线上线下同时总结一波,反击吧小涵!

#!/usr/bin/env python
# -*- coding:utf8 -*-
# @TIME  :2018/11/2 15:23
# @Author:Yolanda
# @File  :pinjie.py

#反击啊!少女
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# part1: 单值变多值的拼接 (joining several single-value columns into one string column)

# Approach 1: cast both columns to str at once, collect each row's values
# into a list, then space-join them into a new column C.
data = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]})
print(data)
#    A  B
# 0  1  5
# 1  2  6
# 2  3  7
# 3  4  8
print(data['A'].dtype)  # int64
# Cast just columns A and B to str in one shot.
# (data.astype(str) would cast every column; assigning
# data = data[['A', 'B']].astype(str) would drop any other columns.)
data[['A', 'B']] = data[['A', 'B']].astype(str)
# dtype now reports "object" — pandas stores str columns as object dtype.
# Each row of data[['A', 'B']] becomes a ['a', 'b'] list...
data['C'] = data[['A', 'B']].values.tolist()
# ...which is then joined with a single space.
data['C'] = data['C'].apply(' '.join)
print(data)
#    A  B    C
# 0  1  5  1 5
# 1  2  6  2 6
# 2  3  7  3 7
# 3  4  8  4 8

# Approach 2: same result as approach 1, but cast each column to str
# individually before joining.
data = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]})
print(data)
#    A  B
# 0  1  5
# 1  2  6
# 2  3  7
# 3  4  8
print(data['A'].dtype)  # int64
# Cast column by column; dtype becomes "object" (how pandas shows str).
for column in ('A', 'B'):
    data[column] = data[column].astype(str)
# Row values -> list, then join the pieces with a space.
data['C'] = data[['A', 'B']].values.tolist()
data['C'] = data['C'].apply(' '.join)
print(data)
#    A  B    C
# 0  1  5  1 5
# 1  2  6  2 6
# 2  3  7  3 7
# 3  4  8  4 8

# part2: 纵向拼接 (vertical concatenation — stacking columns on top of each other)

# Stack the first shape[1] columns (here: A and B from the block above)
# vertically into one Series.
# NOTE(review): `data` must already exist with str columns A and B at this
# point — it is produced by the previous block.
stacked_columns = [data.iloc[:, pos] for pos in range(data[['A', 'B']].shape[1])]
data1 = pd.concat(stacked_columns)
print(data1)
# 0    1
# 1    2
# 2    3
# 3    4
# 0    5
# 1    6
# 2    7
# 3    8
# dtype: object
# The original row indices repeat after concat; replace them with 0..n-1.
data1.index = np.arange(data1.shape[0])
print(data1)
# 0    1
# 1    2
# 2    3
# 3    4
# 4    5
# 5    6
# 6    7
# 7    8
# dtype: object
print(type(data1))
# <class 'pandas.core.series.Series'>
# Promote the Series to a single-column DataFrame named 'D'.
data2 = (pd.DataFrame(data1)
         .reset_index(drop=True)
         .rename(columns={0: 'D'}))
print(data2)
#    D
# 0  1
# 1  2
# 2  3
# 3  4
# 4  5
# 5  6
# 6  7
# 7  8

# part3: 分开与拼接 (splitting a delimited string column and re-joining it)

# Split comma-delimited tag strings apart, then re-join them with spaces
# (the whitespace-token form CountVectorizer expects).
data = pd.DataFrame(
    {'A': [1, 2, 3], 'user_tags': ['41,21,11', '51,31,21', '61,41,31']},
    index=['a', 'b', 'c'],
)
print(data)
#    A user_tags
# a  1     41,21,11
# b  2     51,31,21
# c  3     61,41,31
# Strip a single leading comma, if any (guards against ',41,...' inputs).
data['user_tags'] = data['user_tags'].apply(
    lambda tags: tags[1:] if tags.startswith(',') else tags)
print(data)
#    A user_tags
# a  1     41,21,11
# b  2     51,31,21
# c  3     61,41,31
# How many comma-separated tags each row holds.
data['user_tag_length'] = data['user_tags'].apply(lambda tags: len(tags.split(',')))
print(data)
#    A user_tags  user_tag_length
# a  1     41,21,11                3
# b  2     51,31,21                3
# c  3     61,41,31                3
# Split each string on commas -> a list of tags per row.
data['user_tags'] = data['user_tags'].apply(lambda tags: tags.split(','))
print(data)
#    A  user_tags  user_tag_length
# a  1  [41, 21, 11]                3
# b  2  [51, 31, 21]                3
# c  3  [61, 41, 31]                3
# Re-join each tag list with spaces: one "document" string per row.
apps = data['user_tags'].apply(' '.join).tolist()
print(apps)
# ['41 21 11', '51 31 21', '61 41 31']


# Turn the space-joined tag documents in `apps` into a bag-of-words matrix.
vectorizer = CountVectorizer()
# NOTE: CountVectorizer's default token_pattern is r"(?u)\b\w\w+\b", which
# silently drops single-character tokens (and CJK text, which has no \b word
# boundaries). To keep 1-char / Chinese tokens, construct it as:
#   CountVectorizer(token_pattern='[\u4e00-\u9fa5_a-zA-Z0-9]{1,}')
# (token_pattern=None does not work — the vectorizer always reads it).
transformer = TfidfTransformer()
cntTf = vectorizer.fit_transform(apps)

# Vocabulary learned from the documents, in column order.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 —
# on modern versions use get_feature_names_out() instead.
word = vectorizer.get_feature_names()
print(word)
# ['11', '21', '31', '41', '51', '61']
# Dense document-term count matrix (rows = documents, cols = vocabulary).
weight = cntTf.toarray()
print(weight)
# [[1 1 0 1 0 0]
#  [0 1 1 0 1 0]
#  [0 0 1 1 0 1]]
df_weight = pd.DataFrame(weight)
print(df_weight)
#    0  1  2  3  4  5
# 0  1  1  0  1  0  0
# 1  0  1  1  0  1  0
# 2  0  0  1  1  0  1
# Label the columns with the vocabulary terms.
df_weight.columns = word
print(word)
# ['11', '21', '31', '41', '51', '61']

# Total frequency of every tag across all documents, highest first.
temp_df = (pd.DataFrame(df_weight.sum())
           .sort_values(by=[0], ascending=False)
           .reset_index()
           .rename(columns={0: 'user_tags_freq'}))
print(temp_df)
#   index  user_tags_freq
# 0    21               2
# 1    31               2
# 2    41               2
# 3    11               1
# 4    51               1
# 5    61               1

加油,把知识拼接起来吧,小涵!

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值