import pandas as pd
import numpy as np
# Build a frame whose column B holds several values per cell. Each cell is one
# comma-separated string (e.g. '4,2,1'), so str.split() works on it. Storing
# real lists ([[...], [...], [...]]) would not work: a list has no split().
temp = pd.DataFrame(
    {'A': [1, 2, 3], 'B': ['4,2,1', '5,3,2', '6,4,3']},
    index=['a', 'b', 'c'],
)
print(temp)
#    A      B
# a  1  4,2,1
# b  2  5,3,2
# c  3  6,4,3

# iterrows() yields one (index label, row) pair per row; each row is a Series.
# The comments below list what the three successive iterations print.
for index, row in temp[['A', 'B']].iterrows():
    print(index)
    # a
    # b
    # c
    print(row)  # each row is a pandas Series (exact column spacing may vary)
    # A        1
    # B    4,2,1
    # Name: a, dtype: object
    # A        2
    # B    5,3,2
    # Name: b, dtype: object
    # A        3
    # B    6,4,3
    # Name: c, dtype: object
    print(row['A'])
    # 1
    # 2
    # 3
    print(row['B'])
    # 4,2,1
    # 5,3,2
    # 6,4,3
# Count word frequencies.
# Approach 1 (arguably the simpler one): let a defaultdict supply the initial 0.
from collections import defaultdict

# int() returns 0, so a word that has not been seen yet starts counting from 0.
back = defaultdict(int)
for index, row in temp[['A', 'B']].iterrows():
    # Column B is comma-separated, hence split(','). For whitespace-separated
    # text, call split() with no argument instead.
    word_list = row['B'].split(',')
    for word in word_list:
        # Reading back[word] here, before the update, would give 0 for a new
        # word because of the default factory configured above.
        back[word] += 1
        print(back[word])
        # Printed across all iterations:
        # 1
        # 1
        # 1
        # 1
        # 1
        # 2
        # 1
        # 2
        # 2
print(back)
# Keys appear in insertion order on Python 3.7+:
# defaultdict(<class 'int'>, {'4': 2, '2': 2, '1': 1, '5': 1, '3': 2, '6': 1})
# Approach 2: a plain dict, handling the first occurrence of a word explicitly.
back = {}
for index, row in temp[['A', 'B']].iterrows():
    word_list = row['B'].split(',')
    for word in word_list:
        # A plain dict has no default: reading back[word] for a word that has
        # not been stored yet raises KeyError, so the first sighting is handled
        # in the except branch. Only KeyError is caught so that unrelated
        # failures are not silently turned into a count of 1.
        try:
            back[word] = back[word] + 1
        except KeyError:
            back[word] = 1
        print(back[word])
        # Printed across all iterations:
        # 1
        # 1
        # 1
        # 1
        # 1
        # 2
        # 1
        # 2
        # 2
print(back)
# Keys appear in insertion order on Python 3.7+:
# {'4': 2, '2': 2, '1': 1, '5': 1, '3': 2, '6': 1}