Training a Chinese word2vec model

First you need a reasonably large Chinese corpus; Chinese Wikipedia is a good choice (the Sogou news corpus is another option). The Chinese Wikipedia dump can be downloaded from

https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2

The Chinese Wikipedia data set is not that large; the compressed XML file is roughly 1 GB. First process this compressed XML file with process_wiki_data.py by running: python process_wiki_data.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.text


    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # process_wiki_data.py: parse the XML wiki dump and convert it to plain text
    import logging
    import os.path
    import sys
    from gensim.corpora import WikiCorpus

    if __name__ == '__main__':
        program = os.path.basename(sys.argv[0])
        logger = logging.getLogger(program)
        logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
        logging.root.setLevel(level=logging.INFO)
        logger.info("running %s" % ' '.join(sys.argv))

        # check and process input arguments
        if len(sys.argv) < 3:
            print("Usage: python process_wiki_data.py <wiki dump file> <output text file>")
            sys.exit(1)
        inp, outp = sys.argv[1:3]
        space = " "
        i = 0

        # stream articles out of the compressed dump, one article per output line
        output = open(outp, 'w')
        wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
        for text in wiki.get_texts():
            output.write(space.join(text) + "\n")
            i = i + 1
            if (i % 10000 == 0):
                logger.info("Saved " + str(i) + " articles")
        output.close()
        logger.info("Finished saving " + str(i) + " articles")
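
To sanity-check the conversion, you can peek at the beginning of the first article in wiki.zh.text (a minimal sketch; it only assumes the output file produced by the command above):

    # Print the first 200 characters of the first converted article.
    import codecs

    with codecs.open('wiki.zh.text', 'r', encoding='utf8') as f:
        first_article = f.readline()
    print(first_article[:200])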

Next, use jieba in Python to segment the words, producing the segmented file wiki.zh.text.seg.
Segmentation script: seg.py
    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # seg.py: segment wiki.zh.text with jieba, one article per line
    import codecs
    import jieba

    f = codecs.open('wiki.zh.text', 'r', encoding="utf8")
    target = codecs.open("wiki.zh.text.seg", 'w', encoding="utf8")
    print('open files')

    line_num = 1
    line = f.readline()
    while line:
        print('---- processing', line_num, 'article ----------------')
        # join the segmented tokens with spaces so word2vec can split on whitespace
        line_seg = " ".join(jieba.cut(line))
        target.writelines(line_seg)
        line_num = line_num + 1
        line = f.readline()

    f.close()
    target.close()
Run: python seg.py
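
As a quick reference, this is roughly what jieba's default cut produces on a single sentence (a minimal sketch; the sample sentence is only an illustration, and the exact split may vary with the jieba version and dictionary):

    # Illustrate jieba's default (accurate-mode) segmentation on one sentence.
    import jieba

    sentence = u"足球是世界上最受欢迎的运动"
    print(" ".join(jieba.cut(sentence)))
    # Typically prints something like: 足球 是 世界 上 最 受欢迎 的 运动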
Then train with the word2vec tool:
    python train_word2vec_model.py wiki.zh.text.seg wiki.zh.text.model wiki.zh.text.vector
    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # train_word2vec_model.py: train the word2vec model
    import logging
    import os.path
    import sys
    import multiprocessing
    from gensim.models import Word2Vec
    from gensim.models.word2vec import LineSentence

    if __name__ == '__main__':
        program = os.path.basename(sys.argv[0])
        logger = logging.getLogger(program)
        logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
        logging.root.setLevel(level=logging.INFO)
        logger.info("running %s" % ' '.join(sys.argv))

        # check and process input arguments
        if len(sys.argv) < 4:
            print("Usage: python train_word2vec_model.py <segmented text> <model file> <vector file>")
            sys.exit(1)
        inp, outp1, outp2 = sys.argv[1:4]

        # 400-dimensional vectors, context window of 5, ignore words seen fewer than 5 times
        model = Word2Vec(LineSentence(inp), size=400, window=5, min_count=5,
                         workers=multiprocessing.cpu_count())

        # trim unneeded model memory = use (much) less RAM
        # model.init_sims(replace=True)
        model.save(outp1)
        model.save_word2vec_format(outp2, binary=False)
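
Besides the full model saved to wiki.zh.text.model, the text-format vectors written to wiki.zh.text.vector can be loaded on their own (a minimal sketch; in recent gensim the loader lives on KeyedVectors, while very old versions used Word2Vec.load_word2vec_format):

    # Load only the word vectors (no training state) from the text-format file.
    from gensim.models import KeyedVectors

    word_vectors = KeyedVectors.load_word2vec_format('wiki.zh.text.vector', binary=False)
    print(word_vectors.most_similar(u"足球", topn=5))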

Test the model:

     
      
    In [1]: import gensim
    In [2]: model = gensim.models.Word2Vec.load("wiki.zh.text.model")
    In [3]: model.most_similar(u"足球")
    Out[3]:
    [(u'\u8054\u8d5b', 0.6553816199302673),
     (u'\u7532\u7ea7', 0.6530429720878601),
     (u'\u7bee\u7403', 0.5967546701431274),
     (u'\u4ff1\u4e50\u90e8', 0.5872289538383484),
     (u'\u4e59\u7ea7', 0.5840631723403931),
     (u'\u8db3\u7403\u961f', 0.5560152530670166),
     (u'\u4e9a\u8db3\u8054', 0.5308005809783936),
     (u'allsvenskan', 0.5249762535095215),
     (u'\u4ee3\u8868\u961f', 0.5214947462081909),
     (u'\u7532\u7ec4', 0.5177896022796631)]
    In [4]: result = model.most_similar(u"足球")
    In [5]: for e in result:
       ...:     print e[0], e[1]
       ...:
    联赛 0.65538161993
    甲级 0.653042972088
    篮球 0.596754670143
    俱乐部 0.587228953838
    乙级 0.58406317234
    足球队 0.556015253067
    亚足联 0.530800580978
    allsvenskan 0.52497625351
    代表队 0.521494746208
    甲组 0.51778960228
    In [6]: result = model.most_similar(u"男人")
    In [7]: for e in result:
       ...:     print e[0], e[1]
       ...:
    女人 0.77537125349
    家伙 0.617369174957
    妈妈 0.567102909088
    漂亮 0.560832381248
    잘했어 0.540875017643
    谎言 0.538448691368
    爸爸 0.53660941124
    傻瓜 0.535608053207
    예쁘다 0.535151124001
    mc 0.529670000076
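
Beyond most_similar, the same model object supports other quick checks, for example pairwise similarity and analogy-style queries (a minimal sketch; the numbers depend entirely on your trained model, and in newer gensim these methods live on model.wv):

    # Cosine similarity between two words.
    print(model.similarity(u"足球", u"篮球"))

    # Analogy query: which words relate to 女人 the way 爸爸 relates to 男人?
    print(model.most_similar(positive=[u"女人", u"爸爸"], negative=[u"男人"], topn=3))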

