FastText
文本预处理
# Text preprocessing: turn the raw `text` into one training line of the form
# "<space-separated tokens>\t__label__<e>\n".
# NOTE(review): this appears to target Python 2 str/unicode semantics -- confirm.
# Decode and re-encode as UTF-8; valid UTF-8 bytes come back unchanged, while
# the decode step raises on input that is not valid UTF-8.
text = text.decode("utf-8").encode("utf-8")
seg_text = jieba.cut(text.replace("\t"," ").replace("\n"," ")) # replace \t and \n with spaces, then segment the text with jieba
outline = " ".join(seg_text) # join the segments into a single space-separated line
# `e` is defined outside this snippet -- presumably the category name of this sample; TODO confirm.
outline = outline.encode("utf-8") + "\t__label__" + e + "\n" # append the label and the line terminator
模型训练
# Train the model: input file "news_fasttext_train.txt", output model "news_fasttext.model.bin", label prefix "__label__".
# NOTE(review): the output name passed below has no ".bin" suffix; presumably the library appends it -- confirm.
# NOTE(review): `fasttext.supervised` looks like the legacy fasttext Python binding API -- confirm against the installed version.
classifier = fasttext.supervised("news_fasttext_train.txt","news_fasttext.model",label_prefix="__label__")
输入样式: 训练文件每行一条样本,格式为 "词1 词2 ... 词N<TAB>__label__类别"(即文本预处理一节生成的 outline)
模型加载
# Load a trained model: 'news_fasttext.model.bin' is the model path; label_prefix='__label__' is the label prefix.
classifier = fasttext.load_model('news_fasttext.model.bin', label_prefix='__label__')
模型预测
# Per-text prediction: each input text gets its own result (texts are handled individually).
# The four calls below are alternatives; each one rebinds `labels`, so only the last result is kept.
# NOTE(review): the result shapes in the trailing comments come from the original note, not from code visible here -- confirm against the library.
texts = ['example very long text 1', 'example very longtext 2']
labels = classifier.predict(texts) # expected: [label1, label2]
labels = classifier.predict_proba(texts) # expected: [(label1, prob_label1), (label2, prob_label2)]
labels = classifier.predict(texts, k=3) # expected: top-3 labels per text
labels = classifier.predict_proba(texts, k=3) # expected: top-3 labels per text with probabilities
# Whole-file evaluation (all texts processed together): only overall recall and precision are available.
result = classifier.test('test.txt')
# Python 2 print statements: each prints the text label followed by the value read from `result`.
print 'P@1:', result.precision
print 'R@1:', result.recall
print 'Number of examples:', result.nexamples