C++ Outlier Detection

ai.hpp

#pragma once

#include <vector>
#include <cmath>
#include <cstdlib>        // std::rand
#include <ctime>          // ::time
#include <string>
#include <list>
#include <unordered_map>
#include <tuple>
#include <algorithm>
#include <numeric>        // std::accumulate
#include <random>         // std::default_random_engine
#include <memory>
#include <fstream>
#include <sstream>
#include <chrono>

typedef std::vector<float> Vector;

// Cvt: character-encoding shim; the std::string specialization is a UTF-8 pass-through.
template <typename T> struct Cvt;

template <> struct Cvt<std::string> {
    static const std::string& to_utf8(const std::string& s) { return s; }
    static const std::string& from_utf8(const std::string& s) { return s; }
};

namespace v {
	// LightVector: a non-owning view over a contiguous float range [first, second).
	struct LightVector: public std::pair<float *, float *>
	{
	  using Parent = std::pair<float *, float *>;
	  template <typename... Arg> LightVector(Arg&& ... arg): Parent(std::forward<Arg>(arg) ...) {}

	  float *data() const { return first; }
	  size_t size() const { return std::distance(first, second); }
	  bool empty() const  { return first == second; }

	  float& operator[](size_t i) { return *(first + i); }
	  float operator[](size_t i) const { return *(first + i); }
	};

	template <class Vector1, class Vector2> inline float dot(const Vector1&x, const Vector2& y) { 
		int m = x.size(); const float *xd = x.data(), *yd = y.data();
		float sum = 0.0;
		while (--m >= 0) sum += (*xd++) * (*yd++);
		return sum;
	}

	// saxpy: x += g * y
	inline void saxpy(Vector& x, float g, const Vector& y) {
		int m = x.size(); float *xd = x.data(); const float *yd = y.data();
		while (--m >= 0) (*xd++) += g * (*yd++);
	}
	
	// unit: normalize x in place to unit length (no-op for the zero vector).
	inline void unit(Vector& x) {
		float len = std::sqrt(dot(x, x));
		if (len == 0) return;

		int m = x.size(); float *xd = x.data();
		while (--m >= 0) (*xd++) /= len;
	}
}
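
// A minimal usage sketch for the helpers above (values are illustrative):
//   Vector a{3.0f, 4.0f}, b{4.0f, 3.0f};
//   v::unit(a); v::unit(b);      // a = (0.6, 0.8), b = (0.8, 0.6)
//   float cos = v::dot(a, b);    // cosine similarity = 0.96
//   v::saxpy(a, 0.5f, b);        // a += 0.5 * b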

namespace ai
{
    template <class String = std::string>
    struct Word2Vec
    {
        struct Word
        {
            int32_t index_;
            String text_;
            uint32_t count_;
            Word *left_, *right_;
        
            std::vector<uint8_t> codes_;
            std::vector<uint32_t> points_;
        
            Word(int32_t index, String text, uint32_t count, Word *left = 0, Word *right = 0) : index_(index), text_(text), count_(count), left_(left), right_(right) {}
            Word(const Word&) = delete;
            const Word& operator = (const Word&) = delete;
        };
        typedef std::shared_ptr<Word> WordP;
        
        struct Sentence
        {
            std::vector<Word *> words_;	
            std::vector<String> tokens_;
        };
        typedef std::shared_ptr<Sentence> SentenceP;

        std::vector<Vector> syn0_, syn1_;
        std::vector<Vector> syn0norm_;

        std::unordered_map<String, WordP> vocab_;
        std::vector<Word *> words_;

        int layer1_size_;
        int window_;

        // subsampling threshold for frequent words (0 disables subsampling)
        float sample_;

        int min_count_;

        float alpha_, min_alpha_;

        bool phrase_;
        float phrase_threshold_;

        // `negative` is accepted for API familiarity but unused: only hierarchical
        // softmax training is implemented.
        Word2Vec(int size = 100, int window = 5, float sample = 0.001, int min_count = 5, int negative = 0, float alpha = 0.025, float min_alpha = 0.0001)
            :layer1_size_(size), window_(window), sample_(sample), min_count_(min_count), alpha_(alpha), min_alpha_(min_alpha), phrase_(false), phrase_threshold_(50)
        {}

        bool has(const String& w) const { return vocab_.find(w) != vocab_.end(); }

        int build_vocab(std::vector<SentenceP>& sentences) {
            size_t count = 0;
            std::unordered_map<String, int> vocab;

            for (auto& sentence: sentences) {
                ++count;

                String last_token;
                for (auto& token: sentence->tokens_) {
                    vocab[token] += 1;
                    // add bigram phrases
                    if (phrase_) {
                        if(!last_token.empty()) vocab[last_token + Cvt<String>::from_utf8("_") + token] += 1;
                        last_token = token;
                    }
                }
            }

            if (phrase_) {
                count = 0;
                int total_words = std::accumulate(vocab.begin(), vocab.end(), 0, [](int x, const std::pair<String, int>& v) { return x + v.second; });

                std::unordered_map<String, int> phrase_vocab;

                for (auto& sentence: sentences) {
                    ++count;
                    
                    std::vector<String> phrase_tokens;
                    String last_token;
                    uint32_t pa = 0, pb = 0, pab = 0;
                    for (auto& token: sentence->tokens_) {
                        pb = vocab[token];
                        if (!last_token.empty()) {
                            String phrase = last_token + Cvt<String>::from_utf8("_") + token;	
                            pab = vocab[phrase];
                            float score = 0;
                            if (pa >= min_count_ && pb >= min_count_ && pab >= min_count_)
                                score = (pab - min_count_ ) / (float(pa) * pb) * total_words;
                            if (score > phrase_threshold_) {
                                phrase_tokens.emplace_back(phrase);
                                token.clear();
                                phrase_vocab[phrase] += 1;
                            }
                            else {
                                phrase_tokens.emplace_back(last_token);
                                phrase_vocab[last_token] += 1;
                            }
                        }
                        last_token = token;
                        pa = pb;
                    }

                    if (!last_token.empty()) {
                        phrase_tokens.emplace_back(last_token);
                        phrase_vocab[last_token] += 1;
                    }
                    sentence->tokens_.swap(phrase_tokens);
                }
                
                vocab.swap(phrase_vocab);
            }

            int n_words = vocab.size();
            if (n_words <= 1) return -1;

            words_.reserve(n_words);
            auto comp = [](Word *w1, Word *w2) { return w1->count_ > w2->count_; };

            for (auto& p: vocab) {
                uint32_t count = p.second;
                if (count < (uint32_t)min_count_) continue;   // keep words seen at least min_count_ times

                // emplace reports whether the key was new, so no linear scan of words_ is needed
                auto r = vocab_.emplace(p.first, WordP(new Word{0, p.first, count}));
                if (r.second) words_.emplace_back(r.first->second.get());
            }
            std::sort(words_.begin(), words_.end(), comp);

            int index = 0;
            for (auto& w: words_) w->index_ = index++;

            n_words = words_.size();
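
            // Build a Huffman tree over word frequencies: repeatedly merge the two
            // least-frequent nodes; internal nodes receive indexes n_words .. 2*n_words-2.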
            
            std::vector<Word *> heap = words_;
            std::make_heap(heap.begin(), heap.end(), comp);

            std::vector<WordP> tmp;
            for (int i=0; i<n_words-1; ++i) {
                std::pop_heap(heap.begin(), heap.end(), comp);
                auto min1 = heap.back(); heap.pop_back();
                std::pop_heap(heap.begin(), heap.end(), comp);
                auto min2 = heap.back(); heap.pop_back();
                tmp.emplace_back(WordP(new Word{i + n_words, Cvt<String>::from_utf8(""), min1->count_ + min2->count_, min1, min2}));

                heap.emplace_back(tmp.back().get());
                std::push_heap(heap.begin(), heap.end(), comp);
            }
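
            // Walk the tree from the root to assign every leaf word its binary
            // Huffman code (codes_) and the indexes of the inner nodes along its
            // path (points_); these drive the hierarchical-softmax updates.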

            int max_depth = 0;
            std::list<std::tuple<Word *, std::vector<uint32_t>, std::vector<uint8_t>>> stack;
            stack.emplace_back(std::make_tuple(heap[0], std::vector<uint32_t>(), std::vector<uint8_t>()));
            count = 0;
            while (!stack.empty()) {
                auto t = stack.back();
                stack.pop_back();

                Word *word = std::get<0>(t);
                if (word->index_ < n_words) {
                    word->points_ = std::get<1>(t);
                    word->codes_ = std::get<2>(t);
                    max_depth = std::max((int)word->codes_.size(), max_depth);
                }
                else {
                    auto points = std::get<1>(t);
                    points.emplace_back(word->index_ - n_words);
                    auto codes1 = std::get<2>(t);
                    auto codes2 = codes1;
                    codes1.emplace_back(0); codes2.emplace_back(1);
                    stack.emplace_back(std::make_tuple(word->left_, points, codes1));
                    stack.emplace_back(std::make_tuple(word->right_, points, codes2));
                }
            }
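
            // Weight init: input embeddings (syn0_) uniform in [-0.5, 0.5] / layer1_size_;
            // hidden-layer weights (syn1_) start at zero.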

            syn0_.resize(n_words);
            syn1_.resize(n_words);

            std::default_random_engine eng(::time(NULL));
            std::uniform_real_distribution<float> rng(0.0, 1.0);
            for (auto& s: syn0_) { 
                s.resize(layer1_size_);
                for (auto& x: s) x = (rng(eng) - 0.5) / layer1_size_;
            }
            for (auto& s: syn1_) s.resize(layer1_size_);

            return 0;
        }

        int train(std::vector<SentenceP>& sentences) {
            int total_words = std::accumulate(vocab_.begin(), vocab_.end(), 0, 
                [](int x, const std::pair<String, WordP>& p) { return (int)(x + p.second->count_); });
            int current_words = 0;
            float alpha0 = alpha_, min_alpha = min_alpha_;

            std::default_random_engine eng(::time(NULL));
            std::uniform_real_distribution<float> rng(0.0, 1.0);

            size_t n_sentences = sentences.size();

            size_t last_words = 0;
            auto cstart = std::chrono::high_resolution_clock::now();

            #pragma omp parallel for
            for (size_t i=0; i < n_sentences; ++i) {
                auto sentence = sentences[i].get();
                if (sentence->tokens_.empty()) 
                    continue;
                size_t len = sentence->tokens_.size();
                for (size_t j=0; j<len; ++j) {
                    auto it = vocab_.find(sentence->tokens_[j]);
                    if (it == vocab_.end()) continue;
                    Word *word = it->second.get();
                    // subsampling: keep a frequent word with the standard word2vec
                    // probability sqrt(t/f) + t/f, for word frequency f and threshold t
                    if (sample_ > 0) {
                        float rnd = (std::sqrt(word->count_ / (sample_ * total_words)) + 1) * (sample_ * total_words) / word->count_;
                        if (rnd < rng(eng)) continue;
                    }
                    sentence->words_.emplace_back(word);
                }

                float alpha = std::max(min_alpha, float(alpha0 * (1.0 - 1.0 * current_words / total_words)));
                Vector work(layer1_size_);
                size_t words = train_sentence(*sentence, alpha, work);

                #pragma omp atomic
                current_words += words;

                // progress bookkeeping (approximate under OpenMP; only feeds the logging below)
                if (current_words - last_words > 1024 * 100 || i == n_sentences - 1) {
                    auto cend = std::chrono::high_resolution_clock::now();
                    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(cend - cstart).count();
                    // printf("training alpha: %.4f progress: %.2f%% words per sec: %.3fK\n", alpha, current_words * 100.0/total_words, (current_words - last_words) * 1000.0 / duration);
                    last_words = current_words;
                    cstart = cend;
                }
            }

            syn0norm_ = syn0_;
            for (auto& v: syn0norm_) v::unit(v);

            return 0;
        }

        int save(const std::string& file) const {
            if (syn0_.empty()) return -1;   // nothing trained or loaded yet

            std::ofstream out(file, std::ofstream::out);
            out << syn0_.size() << " " << syn0_[0].size() << std::endl;
            
            std::vector<Word *> words = words_;
            std::sort(words.begin(), words.end(), [](Word *w1, Word *w2) { return w1->count_ > w2->count_; });

            for (auto w: words) {
                out << Cvt<String>::to_utf8(w->text_);
                for (auto i: syn0_[w->index_]) out << " " << i;
                out << std::endl;
            }

            return 0;
        }

        int load(const std::string& file) {
            std::ifstream in(file);
            std::string line;
            if (! std::getline(in, line)) return -1;

            int n_words = 0, layer1_size = 0;
            std::istringstream iss(line);
            iss >> n_words >> layer1_size;

            syn0_.clear(); vocab_.clear(); words_.clear();
            syn0_.resize(n_words);
            for (int i=0; i<n_words; ++i) {
                if (! std::getline(in, line)) return -1;

                std::istringstream iss(line);
                std::string text;
                iss >> text;

                auto p = vocab_.emplace(Cvt<String>::from_utf8(text), WordP(new Word{i, Cvt<String>::from_utf8(text), 0}));
                words_.emplace_back(p.first->second.get());
                syn0_[i].resize(layer1_size);
                for(int j=0; j<layer1_size; ++j) {
                    iss >> syn0_[i][j];
                }
            }
            
            layer1_size_ = layer1_size;

            syn0norm_ = syn0_;
            for (auto& v: syn0norm_) v::unit(v);
            
            return 0;	
        }
        
        std::vector<std::pair<String,float>> most_similar(std::vector<String> positive, std::vector<String> negative, int topn) {
            if ((positive.empty() && negative.empty()) || syn0norm_.empty()) return std::vector<std::pair<String,float>>{};

            Vector mean(layer1_size_);
            std::vector<int> all_words;
            auto add_word = [&mean, &all_words, this](const String& w, float weight) {
                auto it = vocab_.find(w);		
                if (it == vocab_.end()) return;

                Word& word = *it->second;
                v::saxpy(mean, weight, syn0norm_[word.index_]);

                all_words.push_back(word.index_);
            };

            for (auto& w: positive) add_word(w, 1.0);
            for (auto& w: negative) add_word(w, -1.0);

            v::unit(mean);

            Vector dists;
            std::vector<int> indexes;
            int i=0;

            dists.reserve(syn0norm_.size());
            indexes.reserve(syn0norm_.size());
            for (auto &x: syn0norm_) {
                dists.push_back(v::dot(x, mean));		
                indexes.push_back(i++);
            }

            auto comp = [&dists](int i, int j) { return dists[i] > dists[j]; };

            // Select the k best candidates, with extra slots so the query words
            // themselves can be skipped below.
            int k = std::min(int(topn + all_words.size()), int(indexes.size()));
            std::partial_sort(indexes.begin(), indexes.begin() + k, indexes.end(), comp);

            std::vector<std::pair<String,float>> results;
            for(int i=0, j=0; i<k; ++i) {
                if (std::find(all_words.begin(), all_words.end(), indexes[i]) != all_words.end())
                    continue;
                results.push_back(std::make_pair(words_[indexes[i]]->text_, dists[indexes[i]]));
                if (++j >= topn) break;
            }

            return results;
        }

        const Vector& word_vector(const String& w) const {
            static Vector nil;
            auto it = vocab_.find(w);
            if (it == vocab_.end()) return nil;
            
            return syn0_[it->second->index_];
        }

        size_t word_vector_size() const { return layer1_size_; }

        // Element-wise average of two vectors; if the sizes differ (e.g. one side
        // is still empty), the non-empty side is returned unchanged.
        Vector calculate_words_average(const Vector& word_vec1, const Vector& word_vec2) const {
            size_t size = word_vec1.size();

            if (size != word_vec2.size()) {
                if (word_vec1.empty()) return word_vec2;
                return word_vec1;
            }

            Vector result(size);
            for (size_t i = 0; i < size; ++i) result[i] = (word_vec1[i] + word_vec2[i]) / 2.0f;

            return result;
        }

    private:
        int train_sentence(Sentence& sentence, float alpha, Vector& work) {
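            // Skip-gram with hierarchical softmax: for each (center, context) pair,
            // walk the center word's Huffman path and nudge the context embedding
            // (l1) and each inner-node weight vector (l2) toward each other.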
            const int max_size = 1000;
            const float max_exp = 6.0;
            // Precomputed logistic table: table[i] = sigmoid(x) for x evenly spaced
            // over [-max_exp, max_exp].
            const static std::vector<float> table = [&](){
                    std::vector<float> x(max_size);
                    for (int i=0; i<max_size; ++i) { float f = std::exp((i / float(max_size) * 2 - 1) * max_exp); x[i] = f / (f + 1); }
                    return x;
                }();

            int count = 0;
            int len = sentence.words_.size();
            int reduced_window = std::rand() % window_;   // randomly shrink the context window, as in word2vec
            for (int i=0; i<len; ++i) {
                const Word& current = *sentence.words_[i];
                size_t codelen = current.codes_.size();

                int j = std::max(0, i - window_ + reduced_window);
                int k = std::min(len, i + window_ + 1 - reduced_window);
                for (; j < k; ++j) {
                    const Word *word = sentence.words_[j];
                    if (j == i || word->codes_.empty()) 
                        continue;
                    int word_index = word->index_;
                    auto& l1 = syn0_[word_index];

                    std::fill(work.begin(), work.end(), 0);
                    for (size_t b=0; b<codelen; ++b) {
                        int idx = current.points_[b];
                        auto& l2 = syn1_[idx];
                    
                        float f = v::dot(l1, l2);
                        if (f <= -max_exp || f >= max_exp) 
                            continue;
            
                        int fi = int((f + max_exp) * (max_size / max_exp / 2.0));
                        
                        f = table[fi];
                        float g = (1 - current.codes_[b] - f) * alpha;

                        v::saxpy(work, g, l2);
                        v::saxpy(l2, g, l1);
                    }
                    v::saxpy(l1, 1.0, work);
                }
                ++count;
            }

            return count;
        }

        // Cosine similarity between two vocabulary words (uses the normalized
        // vectors); returns 0 when either word is out of vocabulary.
        float similarity(const String& w1, const String& w2) const {
            auto it1 = vocab_.find(w1), it2 = vocab_.find(w2);
            if (it1 != vocab_.end() && it2 != vocab_.end())
                return v::dot(syn0norm_[it1->second->index_], syn0norm_[it2->second->index_]);

            return 0;
        }
    };

    template <class Double = std::double_t>
    struct DBSCAN
    {
        struct Model {
            Vector features;   // feature vector
            int label;         // cluster label (0 = noise)
        };

        Double epsilon;   // neighborhood radius
        size_t minPts;    // minimum neighborhood size for a core point

        // `knn_k` is accepted but currently unused.
        DBSCAN(Double epsilon = 0.6, size_t minPts = 1, size_t knn_k = 4) :epsilon(epsilon), minPts(minPts) {}

        // Label semantics: -1 = unvisited (transient), 0 = noise/outlier, 1.. = cluster id.
        std::vector<int> dbscan(const std::vector<Vector>& sentences) {
            int n = sentences.size();
            std::vector<int> labels(n, -1);
            int cluster = 0;

            for (int i = 0; i < n; ++i) {
                if (labels[i] != -1) continue;

                // epsilon-neighborhood of point i (scan all other points so that
                // neighborhoods are symmetric)
                std::vector<int> neighbors;
                for (int j = 0; j < n; ++j) {
                    if (j == i) continue;
                    Double distance = calculate_distance(sentences[i], sentences[j]);
                    if (distance <= epsilon) neighbors.emplace_back(j);
                }
                if (neighbors.size() < minPts) {
                    labels[i] = 0;   // provisionally noise; a later cluster may absorb it as a border point
                } else {
                    ++cluster;
                    labels[i] = cluster;

                    // expand the cluster; neighbors can grow while we iterate,
                    // so re-read its size on every pass
                    for (size_t j = 0; j < neighbors.size(); ++j) {
                        int neighborIndex = neighbors[j];
                        if (labels[neighborIndex] == 0) {
                            labels[neighborIndex] = cluster;   // border point: claimed but not expanded
                        }

                        if (labels[neighborIndex] != -1) continue;

                        labels[neighborIndex] = cluster;

                        std::vector<int> newNeighbors;
                        for (int k = 0; k < n; ++k) {
                            if (k == neighborIndex) continue;
                            Double distance = calculate_distance(sentences[neighborIndex], sentences[k]);
                            if (distance <= epsilon) newNeighbors.emplace_back(k);
                        }

                        if (newNeighbors.size() >= minPts) neighbors.insert(neighbors.end(), newNeighbors.begin(), newNeighbors.end());
                    }
                }
            }

            return labels;
        }
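
        // predict(): match a new vector against the labeled baseline; returning 0
        // means no cluster claims the point, i.e. it is treated as an outlier.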

        int predict(const std::vector<Model>& model, const Vector& sentence) {
            size_t n = model.size();

            for (size_t i = 0; i < n; ++i) {
                Double distance = calculate_distance(model[i].features, sentence);
                if (distance <= epsilon - 2) {   // deliberately stricter radius than the clustering epsilon
                    if (model[i].label != 0) {
                        return model[i].label;
                    }
                }
            }

            return 0;
        }

        std::vector<Model> create_model(const std::vector<Vector>& baseline, const std::vector<int>& labels) {
            std::vector<Model> model;
            size_t numSamples = std::min(baseline.size(), labels.size());

            for (size_t i = 0; i < numSamples; ++i) {
                Model m;
                m.features = baseline[i];
                m.label = labels[i];
                model.emplace_back(m);
            }

            return model;
        }

        ~DBSCAN() {}

    private:
        // L1 (Manhattan) distance; assumes both vectors have the same length.
        Double calculate_distance(const Vector& v1, const Vector& v2) {
            Double distance = 0.0;

            size_t size = v1.size();
            for (size_t i = 0; i < size; ++i) {
                Double diff = v1[i] - v2[i];
                distance += std::abs(diff);
            }

            return distance;
        }
    };
} // namespace ai

main.cpp

// OpenMP is required.
// g++-4.8 -o w2v -fopenmp -std=c++0x -Ofast -march=native -funroll-loops main.cpp -lpthread

#include "ai.hpp"
#include <fstream>
#include <iostream>
#include <sstream>
#include <list>
#include <set>

#define MODEL_SIZE 1000

std::vector<ai::Word2Vec<std::string>::SentenceP> get_sentences (const std::string& filename) {
	using Sentence = ai::Word2Vec<std::string>::Sentence;
	using SentenceP = ai::Word2Vec<std::string>::SentenceP;

	SentenceP sentence(new Sentence);

	std::ifstream ifs;
	ifs.open(filename, std::ios::in);
	std::string line;
	std::list<SentenceP> sentence_list;

	// one whitespace-tokenized sentence per line; blank lines are skipped
	while (std::getline(ifs, line)) {
		std::string token;
		std::istringstream iss(line);

		while (iss >> token) {
			sentence->tokens_.push_back(std::move(token));
		}

		if (!sentence->tokens_.empty()) {
			sentence_list.push_back(std::move(sentence));
			sentence.reset(new Sentence);
		}
	}

	std::vector<SentenceP> sentences(sentence_list.begin(), sentence_list.end());

	return sentences;
}

int train_w2v(ai::Word2Vec<std::string> &w2v, std::vector<ai::Word2Vec<std::string>::SentenceP>& sentences) {
	if (w2v.build_vocab(sentences) != 0) return -1;
	if (w2v.train(sentences) != 0) return -1;
	if (w2v.save("word2vec") != 0) return -1;

	return 0;
}

std::vector<Vector> get_sentences_vector(ai::Word2Vec<std::string> &w2v, std::vector<ai::Word2Vec<std::string>::SentenceP>& sentences) {
	std::vector<Vector> sentences_vector;
	for (auto sentence = sentences.begin(); sentence != sentences.end(); ++sentence) {
		Vector sentence_vector;
		// Running pairwise average of the token vectors. Note this weights later
		// tokens more heavily than a true mean, and a sentence with no
		// in-vocabulary tokens produces no vector at all.
		for (auto token = (*sentence)->tokens_.begin(); token != (*sentence)->tokens_.end(); ++token)
			if (w2v.has(*token)) sentence_vector = w2v.calculate_words_average(sentence_vector, w2v.word_vector(*token));

		if (!sentence_vector.empty()) sentences_vector.push_back(std::move(sentence_vector));
	}

	return sentences_vector;
}
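
// Pipeline: train word2vec on the corpus, average token vectors into sentence
// vectors, run DBSCAN over successive blocks of MODEL_SIZE vectors, and report
// the points no cluster claims as outliers (two passes).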

int main(int argc, const char *argv[])
{
	std::shared_ptr<ai::Word2Vec<std::string>> w2v = std::make_shared<ai::Word2Vec<std::string>>(32);
	w2v->sample_ = 0;
	w2v->min_count_= 1;
	// w2v->window_ = 10;

	std::vector<ai::Word2Vec<std::string>::SentenceP> sentences = get_sentences("sentences.txt");
	if (train_w2v((*w2v), sentences) != 0) return -1;

	std::vector<Vector> sentences2vector = get_sentences_vector((*w2v), sentences);
	std::cout << "sentence vector size:" << sentences2vector.size() << std::endl;

	ai::DBSCAN<std::double_t> dbscan;
	dbscan.epsilon = 5.03;
	dbscan.minPts = 3;

	std::vector<Vector> second_data;
	std::vector<ai::Word2Vec<std::string>::SentenceP> second_sentences_;

	int num = 0;
	size_t size1 = sentences2vector.size();
	for (size_t i = 0; i < 50; i += 2) {
		if ((i + 1) * MODEL_SIZE >= size1) break;   // not enough vectors for another block
		std::vector<Vector> baseline(sentences2vector.begin() + i*MODEL_SIZE, sentences2vector.begin() + (i+1)*MODEL_SIZE);
		std::vector<int> labels = dbscan.dbscan(baseline);

		std::unordered_map<int, Vector> baseline_map;

		// average the members of each cluster (used by the alternative model below)
		for (size_t m = 0; m < labels.size(); m++) {
			baseline_map[labels[m]] = w2v->calculate_words_average(baseline_map[labels[m]], baseline[m]);
		}

		std::vector<int> new_labels;
		std::vector<Vector> new_baseline;

		for (const auto& pair : baseline_map) {
			new_labels.push_back(pair.first);
			new_baseline.push_back(pair.second);
		}

		std::vector<ai::DBSCAN<std::double_t>::Model> model = dbscan.create_model(baseline, labels);
		// std::vector<ai::DBSCAN<std::double_t>::Model> model = dbscan.create_model(new_baseline, new_labels);

		// score the next block of MODEL_SIZE vectors against the model
		// (note: indexing `sentences` this way assumes one vector per sentence)
		for (int j = 0; j < MODEL_SIZE; j++) {
			if (j + (i+1)*MODEL_SIZE >= size1) goto over;
			Vector log = sentences2vector.at(j + (i+1)*MODEL_SIZE);

			if (!dbscan.predict(model, log)) {
				second_data.push_back(log);
				second_sentences_.push_back(sentences[j + (i+1)*MODEL_SIZE]);
				num++;
				std::cout << "first-pass outlier: ";
				const ai::Word2Vec<std::string>::SentenceP& sentence = sentences[j + (i+1)*MODEL_SIZE];
				for (const std::string& token : sentence->tokens_) std::cout << token << " ";
				std::cout << std::endl;
			}
		}
		std::cout << std::endl;
	}

over:
	std::cout << "first-pass outlier count: " << num << std::endl;

	// second pass: cluster the first-pass outliers themselves; whatever is still
	// labeled as noise is reported as a final outlier
	num = 0;
	std::vector<int> labels = dbscan.dbscan(second_data);
	for (size_t i = 0; i < labels.size(); i++) {
		if (labels[i] == 0) {
			num++;
			std::cout << "second-pass outlier: ";
			const ai::Word2Vec<std::string>::SentenceP& sentence = second_sentences_[i];
			for (const std::string& token : sentence->tokens_) std::cout << token << " ";
			std::cout << std::endl;
		}
	}
	std::cout << "second-pass outlier count: " << num << std::endl;
}

sentences.txt

abc def ghi sdff ghrio oongs
sdgg sdgf jong wongg lllg
jjogg weg egege geron osdg lsdg
...
Clustering is a common data-analysis technique that can surface patterns and regularities in data. Cluster-based outlier detection is a common approach that uses the clustering result to decide which points are likely outliers. Below is a brief sketch of how to implement it in C++.

In general, cluster-based outlier detection has two steps: clustering, then outlier detection. The clustering step can use any standard algorithm, such as K-means or hierarchical clustering; the detection step can use simple statistics such as Z-scores or box plots. Taking K-means as the example:

First cluster the data with K-means so that every point gets a cluster assignment. Then, for each cluster, compute the mean and standard deviation of its members. For each point, compute its distance to its cluster's mean (e.g. the Euclidean distance) and divide that distance by the cluster's standard deviation to get a Z-score. If the Z-score exceeds a threshold, the point is flagged as an outlier.

Here is a simple C++ example of cluster-based outlier detection:

```cpp
#include <iostream>
#include <vector>
#include <cmath>
#include <cstdlib>
#include <limits>

using namespace std;

// Euclidean distance between two points
double euclidean_distance(const vector<double>& a, const vector<double>& b) {
    double dist = 0.0;
    for (size_t i = 0; i < a.size(); ++i) {
        double diff = a[i] - b[i];
        dist += diff * diff;
    }
    return sqrt(dist);
}

// K-means clustering; returns a cluster label per data point
vector<int> kmeans(const vector<vector<double>>& data, int k) {
    // initialize the cluster centers with the first k points
    vector<vector<double>> centers(k);
    for (int i = 0; i < k; ++i) {
        centers[i] = data[i];
    }

    // iterate until the assignments stop changing
    vector<int> labels(data.size());
    while (true) {
        // assign every point to its nearest center
        bool updated = false;
        for (size_t i = 0; i < data.size(); ++i) {
            double min_dist = numeric_limits<double>::max();
            int label = 0;
            for (int j = 0; j < k; ++j) {
                double dist = euclidean_distance(data[i], centers[j]);
                if (dist < min_dist) {
                    min_dist = dist;
                    label = j;
                }
            }
            if (labels[i] != label) {
                labels[i] = label;
                updated = true;
            }
        }
        if (!updated) break;

        // recompute each center as the element-wise mean of its members
        for (int j = 0; j < k; ++j) {
            vector<double> new_center(data[0].size(), 0.0);
            int count = 0;
            for (size_t i = 0; i < data.size(); ++i) {
                if (labels[i] == j) {
                    for (size_t d = 0; d < data[i].size(); ++d) new_center[d] += data[i][d];
                    ++count;
                }
            }
            if (count > 0) {
                for (size_t d = 0; d < new_center.size(); ++d) new_center[d] /= count;
            }
            centers[j] = new_center;
        }
    }
    return labels;
}

// cluster-based outlier detection
vector<bool> outlier_detection(const vector<vector<double>>& data, int k, double threshold) {
    // step 1: clustering
    vector<int> labels = kmeans(data, k);

    // step 2: per-cluster mean and standard deviation
    vector<vector<double>> centers(k);
    vector<vector<double>> stds(k);
    for (int j = 0; j < k; ++j) {
        vector<double> center(data[0].size(), 0.0);
        int count = 0;
        for (size_t i = 0; i < data.size(); ++i) {
            if (labels[i] == j) {
                for (size_t d = 0; d < data[i].size(); ++d) center[d] += data[i][d];
                ++count;
            }
        }
        if (count > 0) {
            for (size_t d = 0; d < center.size(); ++d) center[d] /= count;
        }
        centers[j] = center;

        vector<double> std_dev(data[0].size(), 0.0);
        for (size_t i = 0; i < data.size(); ++i) {
            if (labels[i] == j) {
                for (size_t d = 0; d < data[i].size(); ++d) {
                    std_dev[d] += pow(data[i][d] - center[d], 2.0);
                }
            }
        }
        if (count > 1) {
            for (size_t d = 0; d < std_dev.size(); ++d) {
                std_dev[d] = sqrt(std_dev[d] / (count - 1));
            }
        }
        stds[j] = std_dev;
    }

    // step 3: flag points whose Z-score exceeds the threshold
    vector<bool> outliers(data.size(), false);
    for (size_t i = 0; i < data.size(); ++i) {
        int label = labels[i];
        double dist = euclidean_distance(data[i], centers[label]);
        double sigma = stds[label][0];  // assume all dimensions share the same std
        double z_score = (sigma > 0) ? dist / sigma : 0.0;
        if (z_score > threshold) outliers[i] = true;
    }
    return outliers;
}

int main() {
    // generate some random data
    constexpr int n = 100;
    constexpr int d = 2;
    vector<vector<double>> data(n, vector<double>(d));
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < d; ++j)
            data[i][j] = static_cast<double>(rand()) / RAND_MAX;

    // cluster-based outlier detection
    constexpr int k = 5;
    constexpr double threshold = 2.0;
    vector<bool> outliers = outlier_detection(data, k, threshold);

    // print the results
    for (size_t i = 0; i < data.size(); ++i) {
        cout << (outliers[i] ? "Outlier: " : "Inlier: ");
        for (int j = 0; j < d; ++j) cout << data[i][j] << " ";
        cout << endl;
    }
    return 0;
}
```
In this example we generate some random data and run cluster-based outlier detection over it: the data is partitioned into 5 clusters, each cluster's mean and standard deviation are computed, and each point's Z-score decides whether it is an outlier. A point whose Z-score exceeds 2.0 is reported as an outlier.
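
As a minimal worked instance of the thresholding step (the numbers are hypothetical):

```cpp
#include <iostream>

int main() {
    double dist  = 1.8;  // distance from a point to its cluster mean (hypothetical)
    double sigma = 0.6;  // that cluster's standard deviation (hypothetical)
    double z = dist / sigma;                               // z-score = 3.0
    std::cout << (z > 2.0 ? "outlier" : "inlier") << "\n"; // prints "outlier"
    return 0;
}
```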
