I thought this would be so easy, but it ended up taking me a whole day. Writing it down so I can look back at it later, and hopefully it helps you too!
1. Prepare and preprocess the data
First you need a fairly large Chinese corpus; I used the Chinese Wikipedia dump.
The Chinese Wikipedia data is not that big; the compressed XML file is roughly 1 GB. First, process this compressed XML file with process_wiki_data.py by running:
python process_wiki_data.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.text
Here is the code for process_wiki_data.py:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# process_wiki_data.py: parse the XML dump and convert the wiki data to plain text
import logging
import os.path
import sys
from gensim.corpora import WikiCorpus

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = " "
    i = 0
    output = open(outp, 'w', encoding='utf-8')
    # note: the lemmatize argument was removed in gensim 4.0; drop it if you are on a newer version
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        # one article per line, tokens separated by spaces
        output.write(space.join(text) + "\n")
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished. Saved " + str(i) + " articles")
2. Use opencc to convert the traditional Chinese characters in the file above to simplified Chinese
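I did this step with the opencc command-line tool. A minimal sketch, assuming the step-1 output wiki.zh.text as input and a made-up output file name; note that the conversion config is called t2s.json in opencc 1.x, while older 0.4.x builds use zht2zhs.ini:
opencc -i wiki.zh.text -o wiki.zh.text.simplified -c t2s.json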
3. Extract the article text with a regular expression and segment it with jieba
Run the code below. A few notes first:
- The simplified-Chinese file produced by opencc goes under my local ./data/ directory
- Remember to change the file paths in the code to match your own setup
- The stop word list stopwords.txt sits in the same directory as the script
# -*- coding: UTF-8 -*-
# @IDE    : PyCharm
# @Author : dingjingjing
# @Date   : 2020/8/10
# @Desc   : train a word2vec model on the Chinese Wikipedia corpus
import logging, jieba, os, re

def get_stopwords():
    logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
    # load the stop word list (stopwords.txt, kept in the same directory as this script)
    stopword_set = set()
    with open("stopwords.txt", 'r', encoding="utf-8") as stopwords:
        for stopword in stopwords:
            stopword_set.add(stopword.strip("\n"))
    return stopword_set

def parse_zhwiki(read_file_path, save_file_path):
    # intended to match content lines rather than the <doc> / </doc> tag lines
    regex_str = "[^<doc.*>$]|[^</doc>$]"
    file = open(read_file_path, "r", encoding="utf-8")
    # output file for the segmented articles
    output = open(save_file_path, "w+", encoding="utf-8")
    content_line = file.readline()
    # load the stop word set
    stopwords = get_stopwords()
    # string holding the segmentation result of one article
    article_contents = ""
    while content_line:
        match_obj = re.match(regex_str, content_line)
        content_line = content_line.strip("\n")
        if len(content_line) > 0:
            if match_obj:
                # segment the line with jieba
                words = jieba.cut(content_line, cut_all=False)
                for word in words:
                    if word not in stopwords:
                        article_contents += word + " "
            else:
                if len(article_contents) > 0:
                    output.write(article_contents + "\n")
                    article_contents = ""
        content_line = file.readline()
    output.close()

def generate_corpus():
    # segment the Wikipedia corpus article by article
    zhwiki_path = "D:/addjj/DataProcess/chuanbingan/data/result.txt"  # path to the zhwiki text
    save_path = "D:/addjj/DataProcess/chuanbingan/data/wiki_corpus"   # path for the segmented output
    parse_zhwiki(zhwiki_path, save_path)

def merge_corpus():
    # merge the segmented files (only needed if the corpus was processed in several chunks)
    output = open("D:/addjj/DataProcess/chuanbingan/data/wiki_corpus", "w", encoding="utf-8")
    input_dir = "D:/addjj/DataProcess/chuanbingan/data/"
    for i in range(3):
        file_path = os.path.join(input_dir, "wiki_corpus0%s" % str(i))
        file = open(file_path, "r", encoding="utf-8")
        line = file.readline()
        while line:
            output.writelines(line)
            line = file.readline()
        file.close()
    output.close()

if __name__ == "__main__":
    generate_corpus()
    # merge_corpus()
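To get a feel for what parse_zhwiki writes out, here is the same jieba segmentation plus stop-word filtering applied to a single sentence (the sample sentence is just an example, and stopwords.txt is the same list used above):
# demo of the segmentation + stop-word filtering used in parse_zhwiki
import jieba

with open("stopwords.txt", encoding="utf-8") as f:
    stopword_set = {line.strip("\n") for line in f}

sentence = "数学是利用符号语言研究数量、结构、变化以及空间等概念的一门学科"
words = [w for w in jieba.cut(sentence, cut_all=False) if w not in stopword_set]
print(" ".join(words))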
4. Train word2vec on the segmented file
Here wiki.zh.text.seg is the segmented corpus produced in step 3 (point the command at your own output path). Run:
python train_word2vec_model.py wiki.zh.text.seg wiki.zh.text.model wiki.zh.text.vector
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# train_word2vec_model.py: train the word2vec model
import logging
import os.path
import sys
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # check and process input arguments
    if len(sys.argv) < 4:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]
    # note: in gensim >= 4.0 the `size` parameter is named `vector_size`
    model = Word2Vec(LineSentence(inp), size=400, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)
    model.save(outp1)
    # save the raw word vectors in the plain-text word2vec format
    model.wv.save_word2vec_format(outp2, binary=False)
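Once training is done you can load the model back and query it. A minimal sketch (the query words here are just examples and must exist in the vocabulary; recent gensim versions expose the vectors through model.wv):
# load the trained model and run a couple of quick queries
from gensim.models import Word2Vec

model = Word2Vec.load("wiki.zh.text.model")
# ten words most similar to "北京"
print(model.wv.most_similar("北京", topn=10))
# cosine similarity between two words
print(model.wv.similarity("男人", "女人"))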