[Case Study] Bayes

Bayesian spell checker
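
The corrector below follows Peter Norvig's classic approach: for a typed word w, choose the candidate correction c that maximizes P(c | w). By Bayes' theorem, and dropping the constant P(w):

correct(w) = argmax over candidates c of P(c | w) = argmax over c of P(w | c) · P(c)

The prior P(c) is approximated by word counts from a large corpus (NWORDS below), and the error model P(w | c) by a simple preference order: known words are preferred to edit-distance-1 candidates, which are preferred to edit-distance-2 candidates.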

Full code

import re, collections


# Extract all words from the corpus text and lowercase them
def words(text):
    return re.findall('[a-z]+', text.lower())


# Build a word-frequency dictionary; unseen words default to a count of 1 (simple smoothing)
def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model


NWORDS = train(words(open('E:\\ai\\main\\big.txt').read()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'


# Return the set of all strings at edit distance 1 from the word (one insertion, deletion, transposition, or substitution)
def edits1(word):
    n = len(word)
    return set([word[0:i] + word[i + 1:] for i in range(n)] +  # deletion
               [word[0:i] + word[i + 1] + word[i] + word[i + 2:] for i in range(n - 1)] +  # transposition
               [word[0:i] + c + word[i + 1:] for i in range(n) for c in alphabet] +  # alteration
               [word[0:i] + c + word[i:] for i in range(n + 1) for c in alphabet])  # insertion


# Return the set of words at edit distance 2 from the word,
# keeping only those that actually appear in the corpus as candidates
def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)


# Keep only the candidates that appear in the corpus (known words)
def known(words):
    return set(w for w in words if w in NWORDS)


# Candidates in priority order: the word itself if known, else known edit-distance-1 words,
# else known edit-distance-2 words, else the original word; return the most frequent one
def correct(word):
    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
    return max(candidates, key=lambda w: NWORDS[w])
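
A quick sanity check of the corrector (assuming big.txt at the path above is a large English text corpus, e.g. Norvig's concatenation of public-domain texts):

print(correct('speling'))    # -> 'spelling' (one edit away)
print(correct('korrecter'))  # -> 'corrector' (two edits away), provided the word occurs in big.txt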

Bayesian news classification

Keyword extraction: TF-IDF

Applications of TF-IDF and cosine similarity (part 1): automatic keyword extraction - Ruan Yifeng's blog
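
TF-IDF scores a term t in a document d by how frequent it is within d and how rare it is across the corpus. A common formulation (roughly the one described in the article above):

TF(t, d) = (occurrences of t in d) / (total terms in d)
IDF(t) = log( N / (1 + number of documents containing t) ), where N is the total number of documents
TF-IDF(t, d) = TF(t, d) × IDF(t)

Terms with the highest TF-IDF scores are taken as the document's keywords; this is the scheme behind jieba.analyse.extract_tags used in the code further down.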

Similarity analysis

Applications of TF-IDF and cosine similarity (part 2): finding similar articles - Ruan Yifeng's blog
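
To compare two articles, each one is turned into a term-frequency vector, and the cosine of the angle between the two vectors measures their similarity (1 = same direction, 0 = no shared terms). A minimal sketch with illustrative word-count vectors over a shared vocabulary:

import numpy as np


def cosine_similarity(a, b):
    # cos(theta) = (a . b) / (|a| * |b|)
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))


print(cosine_similarity([1, 2, 2, 1, 1, 1, 0], [1, 2, 2, 1, 1, 2, 1]))  # ≈ 0.94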

LDA topic modeling

Implementing LDA with the gensim framework

Running a simple LDA model with gensim - Zhihu
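
Before the full pipeline, a toy illustration of what gensim's Dictionary and doc2bow produce (the two documents here are made-up token lists):

from gensim import corpora

docs = [['cat', 'sat', 'mat'], ['cat', 'ate', 'fish', 'fish']]
dictionary = corpora.Dictionary(docs)               # maps each token to an integer id
corpus = [dictionary.doc2bow(doc) for doc in docs]  # each document becomes a list of (token_id, count) pairs
print(dictionary.token2id)  # e.g. {'cat': 0, 'mat': 1, 'sat': 2, 'ate': 3, 'fish': 4}
print(corpus)               # e.g. [[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 2)]]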

Full code

import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib
import jieba.analyse
from gensim import corpora, models, similarities
import gensim
import logging

# Load the news dataset
df_news = pd.read_table('E:\\ai\\main\\data\\val.txt', names=['category', 'theme', 'URL', 'content'], encoding='utf-8')
df_news = df_news.dropna()
content = df_news.content.values.tolist()
# Tokenize each article with jieba
content_S = []
for line in content:
    current_segment = jieba.lcut(line)
    if len(current_segment) > 1 and current_segment != ['\r\n']:  # skip rows that are only a newline
        content_S.append(current_segment)
df_content = pd.DataFrame({'content_S': content_S})
# Load the stopword list
stopwords = pd.read_csv("E:\\ai\\main\\stopwords.txt", index_col=False, sep="\t", quoting=3, names=['stopword'],
                        encoding='utf-8')


def drop_stopwords(contents, stopwords):
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            line_clean.append(word)
            all_words.append(str(word))
        contents_clean.append(line_clean)
    return contents_clean, all_words


# Remove stopwords from the article contents
contents = df_content.content_S.values.tolist()
stopwords = stopwords.stopword.values.tolist()
# Returns each article's cleaned tokens and the flat list of all remaining words
contents_clean, all_words = drop_stopwords(contents, stopwords)

df_content = pd.DataFrame({'contents_clean': contents_clean})
df_all_words = pd.DataFrame({'all_words': all_words})

# Count how often each word appears
words_count = df_all_words.groupby(by=['all_words'])['all_words'].agg(count='size')
words_count = words_count.reset_index().sort_values(by=["count"], ascending=False)
# Word cloud of the 100 most frequent words across the corpus
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
wordcloud = WordCloud(font_path="E:\\ai\\main\\data\\simhei.ttf", background_color="white", max_font_size=80)
word_frequence = {x[0]: x[1] for x in words_count.head(100).values}
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()


# Extract the keywords of a single article with TF-IDF
index = 2400
content_S_str = "".join(content_S[index])
print("  ".join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))


# Build a gensim dictionary from the cleaned tokens
dictionary = corpora.Dictionary(contents_clean)
# Convert each article to a bag-of-words vector
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]
# Show training progress in the log
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# LDA topic model
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)  # num_topics is set by hand, like K in K-means
# Print the 20 topics with their top 5 words each
for topic in lda.print_topics(num_topics=20, num_words=5):
    print(topic[1])
# Print the inferred topic weights for each article
for e, values in enumerate(lda.inference(corpus)[0]):
    print(contents_clean[e])
    for ee, value in enumerate(values):
        print('\tTopic %d: inferred weight %.2f' % (ee, value))

# Split into training and test sets
df_train = pd.DataFrame({'contents_clean': contents_clean, 'label': df_news['category']})
label_mapping = {"汽车": 1, "财经": 2, "科技": 3, "健康": 4, "体育": 5, "教育": 6, "文化": 7, "军事": 8, "娱乐": 9, "时尚": 0}
df_train['label'] = df_train['label'].map(label_mapping)

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df_train['contents_clean'].values, df_train['label'].values,
                                                    random_state=1)
# Join each article's tokens into a space-separated string (the input format CountVectorizer expects)
words = []
for line_index in range(len(x_train)):
    words.append(' '.join(x_train[line_index]))
test_words = []
for line_index in range(len(x_test)):
    test_words.append(' '.join(x_test[line_index]))

# Bag-of-words features with CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(analyzer='word', max_features=4000, lowercase=False)
# Build the vocabulary (word -> column index)
vec.fit(words)

from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(vec.transform(words), y_train)
print(classifier.score(vec.transform(test_words), y_test))  # accuracy of bag-of-words + Naive Bayes

# TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer='word', max_features=4000, lowercase=False)
vectorizer.fit(words)

# Naive Bayes on TF-IDF features
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(vectorizer.transform(words), y_train)
print(classifier.score(vectorizer.transform(test_words), y_test))  # accuracy of TF-IDF + Naive Bayes
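
To classify a new article with the trained TF-IDF + MultinomialNB model, the text must go through the same preprocessing as the training data. A sketch (predict_category is a hypothetical helper, not part of the original code):

def predict_category(text):
    # Tokenize with jieba, drop stopwords, and join with spaces to match the training format
    tokens = [w for w in jieba.lcut(text) if w not in stopwords]
    features = vectorizer.transform([' '.join(tokens)])
    label_id = classifier.predict(features)[0]
    inverse_mapping = {v: k for k, v in label_mapping.items()}
    return inverse_mapping[label_id]


print(predict_category('国足在世界杯预选赛中取得胜利'))  # likely 体育 (sports)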

Notes on migrating Word2Vec code to gensim 4.x (relevant if word2vec features are added later):

size -> vector_size

AttributeError: 'Word2Vec' object has no attribute 'most_similar'

model.most_similar -> model.wv.most_similar

model.n_similarity -> model.wv.n_similarity
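
A minimal sketch of the current gensim 4.x Word2Vec API, assuming the tokenized training articles in x_train from the split above (this model is not part of the original pipeline):

from gensim.models import Word2Vec

# gensim >= 4.0: `size` became `vector_size`, and similarity queries live on model.wv (KeyedVectors)
w2v = Word2Vec(sentences=list(x_train), vector_size=100, window=5, min_count=5, workers=4)

some_word = w2v.wv.index_to_key[0]             # any word that made it into the vocabulary
print(w2v.wv.most_similar(some_word, topn=5))  # nearest neighbours by cosine similarity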

Data download link: https://pan.baidu.com/s/1gJMtEdY5Ag7UEHMvwHl0Jg (extraction code: 1234)
 
