用朴素贝叶斯完成一个语种检测的分类器
语料库为 Twitter 数据,包含 English、French、German、Spanish、Italian 和 Dutch 6 种语言。数据示例如下(每行格式为“文本,语言标签”):
1 december wereld aids dag voorlichting in zuidafrika over bieten taboes en optimisme,nl
1 millón de afectados ante las inundaciones en sri lanka unicef está distribuyendo ayuda de emergencia srilanka,es
1 millón de fans en facebook antes del 14 de febrero y paty miki dani y berta se tiran en paracaídas qué harías tú porunmillondefans,es
1 satellite galileo sottoposto ai test presso lesaestec nl galileo navigation space in inglese,it
10 der welt sind bei,de
。。。
每行末尾逗号之后的两个字母是语言标签,分别表示:
en:English
fr:French
de:German
es:Spanish
it:Italian
nl:Dutch
Python 3.6 代码:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
class LanguageDetector():
    """Detect the language of short texts from n-gram count features.

    Each document is stripped of URLs, @mentions and #hashtags, lowercased,
    turned into 1-gram and 2-gram counts by a ``CountVectorizer`` and handed
    to a classifier (multinomial naive Bayes unless another one is supplied).
    """

    # Compiled once for the whole class instead of once per document.
    # Raw string so that \S and \w reach the regex engine untouched.
    _NOISE_PATTERN = re.compile(r"http\S+|\@\w+|\#\w+")

    def __init__(self, classifier=None):
        """Create an unfitted detector.

        Args:
            classifier: Estimator exposing ``fit``, ``predict`` and ``score``.
                When omitted, a new ``MultinomialNB`` (which takes repeated
                terms into account) is created for this instance.
        """
        # Build the default here, not in the signature: a default written as
        # ``MultinomialNB()`` is evaluated once and would be shared, and
        # re-fitted, by every detector created without an argument.
        self.classifier = MultinomialNB() if classifier is None else classifier
        # Extract 1-gram and 2-gram features, keep at most 1000 of them, and
        # clean every document with _remove_noise before tokenizing.
        self.vectorizer = CountVectorizer(
            ngram_range=(1, 2),
            max_features=1000,
            preprocessor=self._remove_noise,
        )

    def _remove_noise(self, document):
        """Return ``document`` lowercased, without URLs, @mentions, #hashtags.

        Lowercasing is done here because a custom ``preprocessor`` replaces
        CountVectorizer's own preprocessing stage (which is where it would
        normally lowercase), so without it 'This' and 'this' would be
        counted as different features.
        """
        return self._NOISE_PATTERN.sub("", document).lower()

    def features(self, X):
        """Return the count-feature matrix for the documents in ``X``."""
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        """Learn the vocabulary from ``X`` and train the classifier on ``y``."""
        self.vectorizer.fit(X)  # builds the term -> column mapping
        self.classifier.fit(self.features(X), y)

    def predict(self, x):
        """Return the predicted label array for the single document ``x``."""
        # The vectorizer expects an iterable of documents, hence the list.
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):
        """Return the classifier's score on documents ``X`` with labels ``y``."""
        return self.classifier.score(self.features(X), y)
# Load the corpus. The encoding is given explicitly because the tweets
# contain non-ASCII characters and the platform default may not be UTF-8;
# the context manager closes the file even if reading fails.
with open('data.csv', encoding='utf-8') as in_f:
    lines = in_f.readlines()

# Each line is "<text>,<label>". Split on the LAST comma so commas inside the
# text are kept, and skip blank or label-less lines instead of turning them
# into empty samples.
dataset = []
for line in lines:
    text, sep, label = line.strip().rpartition(',')
    if sep:
        dataset.append((text, label))

# Unzip the (text, label) pairs into two parallel tuples.
x, y = zip(*dataset)

# Split with scikit-learn's helper (75% train / 25% test by default);
# random_state fixes the shuffling seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

language_detector = LanguageDetector()
language_detector.fit(x_train, y_train)
print(language_detector.predict('This is an English sentence'))
print(language_detector.score(x_test, y_test))
输出
['en']
0.977062196736