1. 频率分析
from prettytable import PrettyTable
from collections import Counter
# Print a table of the ten most frequent items for each token collection.
# NOTE: `words`, `screen_names` and `hashtags` are not defined in this
# snippet; they must already exist when this loop runs.
for label, data in (('Word', words),
                    ('Screen Name', screen_names),
                    ('Hashtag', hashtags)):
    pt = PrettyTable(field_names=[label, 'Count'])
    # most_common(10) yields (item, count) pairs, highest count first,
    # without sorting the whole frequency map before slicing.
    for kv in Counter(data).most_common(10):
        pt.add_row(kv)
    pt.align[label], pt.align['Count'] = 'l', 'r'  # Set column alignment
    # The parenthesized form prints the same under Python 2 and Python 3.
    print(pt)
# A function for computing lexical diversity
def lexical_diversity(tokens):
    """Return the ratio of distinct tokens to the total number of tokens.

    Args:
        tokens: A sized collection of hashable items (it is passed to both
            ``len`` and ``set``).

    Returns:
        ``len(set(tokens)) / len(tokens)`` as a float, or ``0.0`` when
        ``tokens`` is empty (instead of raising ZeroDivisionError).
    """
    total = len(tokens)
    if total == 0:
        return 0.0
    # float() keeps true division under Python 2 as well as Python 3.
    return float(len(set(tokens))) / total
# A function for computing the average number of words per tweet
def average_words(statuses):
    """Return the mean number of whitespace-separated words per status.

    Args:
        statuses: A sized collection of strings; each one is split with
            ``str.split()`` to count its words.

    Returns:
        Total word count divided by the number of statuses, as a float,
        or ``0.0`` when ``statuses`` is empty (instead of raising
        ZeroDivisionError).
    """
    status_count = len(statuses)
    if status_count == 0:
        return 0.0
    total_words = sum(len(s.split()) for s in statuses)
    # float() keeps true division under Python 2 as well as Python 3.
    return float(total_words) / status_count
3. 可视化处理
# Draw one histogram per token collection showing how often items occur.
# NOTE: `plt` (presumably matplotlib.pyplot -- confirm the import) and the
# `words`, `screen_names`, `hashtags` lists are not defined in this snippet.
for label, data in (('Words', words),
                    ('Screen Names', screen_names),
                    ('Hashtags', hashtags)):
    # Build a frequency map for each set of data
    # and plot the values
    c = Counter(data)
    # On Python 3, values() is a view rather than a list; materialize it so
    # hist() receives a plain sequence of counts on either Python version.
    plt.hist(list(c.values()))

    # Add a title and axis labels ...
    plt.title(label)
    plt.ylabel("Number of items in bin")
    plt.xlabel("Bins (number of times an item appeared)")

    # ... and display the figure for this collection
    plt.show()
4. 聚类分析
a. 数据清理
b. 相似性度量
5. 信息检索
a. TF-IDF
b. 余弦相似性
c. 词汇搭配检测