import jieba
from collections import Counter
data = '北京大学和清华大学是中国的顶尖大学'
print('单词统计')  # word counts
words = list(jieba.cut(data))
print(Counter(words))
print('字符统计')  # character counts
print(Counter(list(data)))
Building prefix dict from the default dictionary ...
单词统计
Dumping model to file cache C:\Users\Pc\AppData\Local\Temp\jieba.cache
Loading model cost 1.128 seconds.
Prefix dict has been built succesfully.
Counter({'北京大学': 1, '和': 1, '清华大学': 1, '是': 1, '中国': 1, '的': 1, '顶尖': 1, '大学': 1})
字符统计
Counter({'大': 3, '学': 3, '北': 1, '京': 1, '和': 1, '清': 1, '华': 1, '是': 1, '中': 1, '国': 1, '的': 1, '顶': 1, '尖': 1})
2. Language Models
2.1 The n-gram model (takes the order of words in a sentence into account)
When n is 1, 2, or 3, the n-gram model is called a unigram, bigram, or trigram language model, respectively.
Unigram (one-character) segmentation splits a sentence into individual characters.
Bigram (two-character) segmentation slides over the sentence from start to end, grouping every two adjacent characters into a token.
Trigram (three-character) segmentation slides over the sentence from start to end, grouping every three adjacent characters into a token.
For example, 西安交通大学 (Xi'an Jiaotong University):
unigram form: 西/安/交/通/大/学
bigram form: 西安/安交/交通/通大/大学
trigram form: 西安交/安交通/交通大/通大学
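To make these forms concrete, here is a minimal sketch of character-level n-gram extraction (char_ngrams is a hypothetical helper name, not from the original code):

def char_ngrams(text, n):
    # Collect every window of n consecutive characters
    return [text[i:i + n] for i in range(len(text) - n + 1)]

print(char_ngrams('西安交通大学', 1))  # ['西', '安', '交', '通', '大', '学']
print(char_ngrams('西安交通大学', 2))  # ['西安', '安交', '交通', '通大', '大学']
print(char_ngrams('西安交通大学', 3))  # ['西安交', '安交通', '交通大', '通大学']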
2.2 Bag-of-words model (ignores the order of words in a sentence)
All words are thrown into one bag, disregarding morphology and word order; every word is treated as independent.
Example sentences:
Sentence 1: Jane wants to go to Shenzhen.
Sentence 2: Bob wants to go to Shanghai.
Build an array that maps each word to an index: [Jane, wants, to, go, Shenzhen, Bob, Shanghai]
Bag-of-words vectors:
Sentence 1: [1, 1, 2, 1, 1, 0, 0]
Sentence 2: [0, 1, 2, 1, 0, 1, 1]
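As a hedged illustration, the same counts can be reproduced with scikit-learn's CountVectorizer; note that it lowercases tokens and orders the vocabulary alphabetically, so the columns come out in a different order than the hand-built array above:

from sklearn.feature_extraction.text import CountVectorizer

sentences = [
    'Jane wants to go to Shenzhen.',
    'Bob wants to go to Shanghai.',
]
vec = CountVectorizer()
bow = vec.fit_transform(sentences)
print(vec.get_feature_names_out())  # use get_feature_names() on scikit-learn < 1.0
# ['bob' 'go' 'jane' 'shanghai' 'shenzhen' 'to' 'wants']
print(bow.toarray())
# [[0 1 1 0 1 2 1]
#  [1 1 0 1 0 2 1]]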
3. Text-to-matrix conversion (requirement: use a bag-of-words model at the word level)
import jieba
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Read the stop-word list, one word per line
def read_stopword(filename):
    stopword = []
    with open(filename, 'r', encoding='utf-8') as fp:
        for line in fp.readlines():
            stopword.append(line.replace('\n', ''))
    return stopword

# Segment each document with jieba and remove stop words
def cut_data(data, stopword):
    words = []
    for content in data['content']:
        word = list(jieba.cut(content))
        # Remove every occurrence of each stop word found in this document
        for w in list(set(word) & set(stopword)):
            while w in word:
                word.remove(w)
        words.append(' '.join(word))
    data['content'] = words
    return data

# Collect the corpus vocabulary (unique tokens)
def word_list(data):
    all_word = []
    for word in data['content']:
        all_word.extend(word.split())
    all_word = list(set(all_word))
    return all_word

# Build the bag-of-words document-term matrix
def text_vec(data):
    count_vec = CountVectorizer(max_features=300, min_df=2)
    fea_vec = count_vec.fit_transform(data['content']).toarray()
    return fea_vec

if __name__ == '__main__':
    data = pd.read_csv('./data/cnews/cnews.test.txt', names=['title', 'content'], sep='\t')  # (10000, 2)
    data = data.head(50)
    stopword = read_stopword('./data/stopword.txt')
    data = cut_data(data, stopword)
    fea_vec = text_vec(data)
    print(fea_vec)
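One caveat worth knowing: CountVectorizer's default token_pattern, r'(?u)\b\w\w+\b', silently drops single-character tokens, so one-character Chinese words that survive stop-word removal never reach the matrix. A hedged tweak, if those words matter for your task:

# Relax the token pattern so single-character tokens are kept as features
count_vec = CountVectorizer(max_features=300, min_df=2, token_pattern=r'(?u)\b\w+\b')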