# -*- coding: utf-8 -*-
"""
Created on Sun Mar 25 22:02:48 2018
@author: Administrator
"""
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
class LanguageDetector():
# MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
def __init__(self, classifier=None):
    """Set up the classifier and the character n-gram vectorizer.

    Args:
        classifier: Estimator stored on ``self.classifier``. When omitted
            (``None``), a fresh ``MultinomialNB()`` is created for this
            instance.
    """
    # A ``MultinomialNB()`` default in the signature would be evaluated only
    # once, at definition time, so every detector built without an explicit
    # classifier would share -- and re-fit -- the very same estimator object.
    if classifier is None:
        classifier = MultinomialNB()
    self.classifier = classifier
    self.vectorizer = CountVectorizer(
        # NOTE(review): scikit-learn documents that a custom ``preprocessor``
        # overrides the built-in lowercase stage -- confirm that
        # ``_remove_noise`` lowercases the text itself if that is required.
        lowercase=True,  # lowercase the text
        analyzer='char_wb',  # tokenise by character n-grams
        ngram_range=(1, 2),  # use n-grams of size 1 and 2
        max_features=1000,  # keep the most common 1000 n-grams
        # ``_remove_noise`` is not visible in this section of the file.
        preprocessor=self._remove_noise,
    )
#模型要有好效果&#