# -*- coding: utf-8 -*-
"""
Created on Wed Oct 26 21:35:31 2016
@author: sirius
test word2word
"""
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
vectorizer = CountVectorizer(min_df=1)
"""
hash处理,把所有的词全部列出来,然后根据有几句话分为几行,
这句话中出现该单词则标为1,不是则为0
"""
corpus = [
    'this is the first document',
    'this is the second document',
    'And the third one',
    'what the hell is that']
x = vectorizer.fit_transform(corpus).toarray()
"""
结果如下:
[[0 1 1 0 1 0 0 0 1 0 1 0]
[0 1 0 0 1 0 1 0 1 0 1 0]
[1 0 0 0 0 1 0 0 1 1 0 0]
[0 0 0 1 1 0 0 1 1 0 0 1]]
"""
"""
-----------------------n-gram----------------
在中文中, 我讨厌你 你讨厌我 用上述的方法是区别不出来的
而在n-gram中,是把“我讨厌你”分词为 [我、讨厌、你、我讨厌、讨厌你],
既含有一元的,也含有二元的(两两相连),这里用到了2-gram,
如果是三个三个相连,则可以用3-gram等等。
"""
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                    token_pattern=r'\b\w+\b', min_df=1)
x2 = bigram_vectorizer.fit_transform(corpus).toarray()
"""
结果如下:
[[0 0 1 1 1 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0]
[0 0 1 0 0 0 0 1 0 1 0 1 1 0 1 0 0 1 0 0 0 1 1 0 0]
[1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0]
[0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1]]
"""
analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-gram are cool!') == (
    ['bi', 'gram', 'are', 'cool',
     'bi gram', 'gram are', 'are cool'])  # True
"""
>>> analyze('Bi-gram are cool!')==(
... ['bi','gram','are','cool',
... 'bi gram','gram are','are cool'])
True
>>> bigram_vectorizer=CountVectorizer(ngram_range=(1,2),
... token_pattern=r'\b\w+\b',min_df=1)
>>> analyze=bigram_vectorizer.build_analyzer()
>>> analyze('Bi-gram are cool!')==(
... ['bi','gram','are','cool',
... 'bi gram','gram are','are'])
False
"""