from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn2pmml.pipeline import PMMLPipeline

# Read the input file, one sentence per line. The text is expected to be
# pre-segmented, with tokens separated by spaces.
# NOTE(review): `label` is not defined in the visible part of this file; it
# must be assigned before this statement runs, otherwise it raises NameError.
with open('data/segment{}.txt'.format(label), 'r', encoding='utf8') as f:
    # readlines() keeps the trailing newline on every line.
    sentences = f.readlines()

# Build the pipeline: TF-IDF vectorization followed by K-Means clustering.
# Note on TfidfVectorizer here: 1) it must not rely on regex-based
# tokenization; 2) pass the tokenizer explicitly as tokenizer=Splitter().
pipeline = PMMLPipeline([
    ("td_vector", TfidfVectorizer(max_df=0.7, min_df=0.01, tokenizer=Splitter(), norm=None)),
    ("km", KMeans(n_clusters=100, random_state=1000)),
])

# Fit on the raw sentences (no target is passed), then write the fitted
# pipeline out as a PMML document.
pipeline.fit(sentences)
sklearn2pmml(pipeline, "hzd.pmml")