import pandas as pd
import numpy as np
import re
import nltk #pip install nltk#jieba
# 构造一个文本数据集 — construct a toy text dataset
# Toy corpus: six short documents, each hand-labeled as weather- or
# animal-themed, wrapped in a DataFrame for display and later joins.
corpus = np.array([
    'The sky is blue and beautiful.',
    'Love this blue and beautiful sky!',
    'The quick brown fox jumps over the lazy dog.',
    'The brown fox is quick and the blue dog is lazy!',
    'The sky is very blue and the sky is very beautiful today',
    'The dog is lazy but the brown fox is quick!',
])
labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']
corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels},
                         columns=['Document', 'Category'])
corpus_df
Document
Category
0
The sky is blue and beautiful.
weather
1
Love this blue and beautiful sky!
weather
2
The quick brown fox jumps over the lazy dog.
animals
3
The brown fox is quick and the blue dog is lazy!
animals
4
The sky is very blue and the sky is very beaut...
weather
5
The dog is lazy but the brown fox is quick!
animals
# 基本预处理 — basic preprocessing
# Fetch only the corpus this script actually uses; a bare nltk.download()
# opens the interactive downloader GUI instead of downloading anything.
nltk.download('stopwords')
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
True
# 词频与停用词 — term frequency & stopword filtering
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words)


def normalize_document(doc):
    """Normalize one document: strip punctuation, lower-case, drop stopwords.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        Space-joined tokens with English stopwords removed.
    """
    # BUG FIX: the original called re.sub(pattern, '', doc, re.I), passing
    # re.I (== 2) as the positional `count` argument, which silently capped
    # the substitution at 2 replacements. Pass it as flags= instead.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I)
    # Lower case and trim surrounding whitespace.
    doc = doc.lower().strip()
    # Tokenize, then filter stopwords out of the document.
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # Re-create the document from the filtered tokens.
    return ' '.join(filtered_tokens)


# Apply the normalizer element-wise over the corpus array.
normalize_corpus = np.vectorize(normalize_document)
norm_corpus = normalize_corpus(corpus)
norm_corpus
#The sky is blue and beautiful.
array(['sky blue beautiful', 'love blue beautiful sky',
'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',
'sky blue sky beautiful today', 'dog lazy brown fox quick'],
dtype='<U30')
# Cluster the documents into two groups with K-Means.
# NOTE(review): `similarity_df` is not defined in this chunk — presumably a
# pairwise document-similarity matrix built in an earlier cell; verify.
# NOTE(review): no random_state is set, so cluster label assignment (which
# cluster gets id 0 vs 1) may vary between runs.
from sklearn.cluster import KMeans
km = KMeans(n_clusters=2)
km.fit_transform(similarity_df)
# Attach each document's cluster id alongside its true category for comparison.
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)
Document
Category
ClusterLabel
0
The sky is blue and beautiful.
weather
0
1
Love this blue and beautiful sky!
weather
0
2
The quick brown fox jumps over the lazy dog.
animals
1
3
The brown fox is quick and the blue dog is lazy!
animals
1
4
The sky is very blue and the sky is very beaut...
weather
0
5
The dog is lazy but the brown fox is quick!
animals
1
# 主题模型 — topic model
# Fit a 2-topic LDA model over the term matrix.
# NOTE(review): `tv_matrix` (a vectorizer's document-term matrix) comes from
# an earlier cell not visible in this chunk — confirm it is defined upstream.
from sklearn.decomposition import LatentDirichletAllocation

# `n_topics` was renamed to `n_components` in scikit-learn 0.19 and removed
# in 0.21 (the deprecation warning in the original output confirms this).
# Pin learning_method='online' to keep the pre-0.20 default the recorded
# topic weights were produced with.
lda = LatentDirichletAllocation(n_components=2, max_iter=100,
                                learning_method='online', random_state=42)
dt_matrix = lda.fit_transform(tv_matrix)
# Per-document topic mixture: one column per topic.
features = pd.DataFrame(dt_matrix, columns=['T1', 'T2'])
features
e:\ProgramData\Anaconda3\lib\site-packages\sklearn\decomposition\online_lda.py:294: DeprecationWarning: n_topics has been renamed to n_components in version 0.19 and will be removed in 0.21
DeprecationWarning)
e:\ProgramData\Anaconda3\lib\site-packages\sklearn\decomposition\online_lda.py:536: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
DeprecationWarning)
T1
T2
0
0.190615
0.809385
1
0.176860
0.823140
2
0.846148
0.153852
3
0.815229
0.184771
4
0.180563
0.819437
5
0.839140
0.160860
# 主题和词的权重 — topic-word weights
# Print, for each topic, the vocabulary terms weighted above 0.6,
# sorted by descending weight. (Original lines were paste-mangled —
# `inzip`, fused print statements — reconstructed here.)
# NOTE(review): `vocab` (the vectorizer's feature names) is defined in an
# earlier cell not shown in this chunk — confirm upstream.
tt_matrix = lda.components_
for topic_weights in tt_matrix:
    topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]
    # Sort by weight, highest first.
    topic = sorted(topic, key=lambda x: -x[1])
    # Keep only terms that matter to this topic.
    topic = [item for item in topic if item[1] > 0.6]
    print(topic)
    print()
e:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:9: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
if __name__ == '__main__':
import pandas as pdimport numpy as npimport reimport nltk #pip install nltk#jieba构造一个文本数据集corpus = ['The sky is blue and beautiful.', 'Love this blue and beautiful sky!', 'T...