TF 1.x: Building a Bag of Words with VocabularyProcessor

Approach 1: pass a custom tokenizer to VocabularyProcessor. The tokenizer_fn is called on the whole iterable of documents and must yield one list of tokens per document; here jieba does the Chinese word segmentation.

import jieba
import numpy as np
from tensorflow.contrib import learn

DOCUMENTS = [
    '这是一条测试1',
    '这是一条测试2',
    '这是一条测试3',
]

def chinese_tokenizer(docs):
    # yield one token list per document, segmented by jieba
    for doc in docs:
        yield list(jieba.cut(doc))

# arguments: max document length, min token frequency, tokenizer function
vocab = learn.preprocessing.VocabularyProcessor(10, 0, tokenizer_fn=chinese_tokenizer)
x = list(vocab.fit_transform(DOCUMENTS))
print(np.array(x))
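The result is a 3 x 10 integer matrix: one row per document, one word id per column, padded with 0 up to max_document_length (id 0 is also the reserved unknown-token slot). If you want to inspect what was learned, a minimal sketch follows; it assumes the snippet above has already run, and the attribute and method names come from tf.contrib.learn's VocabularyProcessor / CategoricalVocabulary API rather than from the original post.

# number of distinct entries in the vocabulary (including the reserved <UNK> slot at id 0)
print(len(vocab.vocabulary_))

# map ids back to tokens; reverse() yields one space-joined string per document
print(list(vocab.reverse(x)))

# the fitted processor can be saved and restored later for inference
vocab.save('vocab.pickle')
vocab_restored = learn.preprocessing.VocabularyProcessor.restore('vocab.pickle')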
Approach 2: segment the text yourself, join the tokens with spaces, and let VocabularyProcessor's default tokenizer split them back. This also makes it easy to compute max_document_length from the data instead of hard-coding it.

import jieba
import numpy as np
from tensorflow.contrib import learn

DOCUMENTS = [
    '这是一条测试1',
    '这是一条测试2',
    '这是一条测试3',
]

def chinese_tokenizer(documents):
    # return each document as a single space-separated string of jieba tokens
    # (joining avoids the trailing space that plain string concatenation would leave,
    #  which would otherwise add a spurious empty token when splitting)
    return [' '.join(jieba.cut(doc)) for doc in documents]

x_text = chinese_tokenizer(DOCUMENTS)
print(x_text)

# length of the longest document, measured in tokens
max_document_length = max(len(text.split(' ')) for text in x_text)

vocab = learn.preprocessing.VocabularyProcessor(max_document_length, 0)
x = list(vocab.fit_transform(x_text))
print(np.array(x))
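One point worth noting with this variant: the default tokenizer is a simple regex over word characters, so the space-joined jieba output is split back into the same tokens, and after fit_transform the vocabulary is frozen, so words not seen during fitting map to id 0 when you later transform new text. A small usage sketch, assuming the snippet above has run; the example sentence is mine, not from the original post.

# transform an unseen, pre-segmented document; out-of-vocabulary words become 0
new_doc = ' '.join(jieba.cut('这是一条新的测试'))
print(list(vocab.transform([new_doc])))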