NLP16 - Summary, Part 1 [dict, tfidf, word2vec, keywords, simhash]

1. Introduction

I have been doing NLP work for a while now, and my code has grown messy, so I'd like to summarize a few things:
1. The model wrapper should be a singleton, so this post shows a Python singleton implementation.
2. Everything is controlled from the command line.
3. A single class holds the data loading, dictionary loading, and model training logic:
3.1 build a corpus dictionary;
3.2 train a TF-IDF model and extract keywords with it;
3.3 train word2vec and retrieve related words with it;
3.4 encode articles via simhash over keyword features to form article fingerprints, and compute article similarity from those fingerprints.

2. TF-IDF

TF:
The weight of a term that occurs in a document is simply proportional to the term frequency.
IDF:
The specificity of a term can be quantified as an inverse function of the number of documents in which it occurs.
In combination: tfidf(t, d, D) = tf(t, d) · idf(t, D), with idf(t, D) = log(N / |{d ∈ D : t ∈ d}|), where N is the total number of documents in the corpus.
For details, see [4] tf–idf: https://en.wikipedia.org/wiki/Tf%E2%80%93idf
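As a quick illustration, here is a minimal sketch on a made-up toy corpus using gensim's default weighting (not the custom weighting functions used later in this post): a term that appears in every document gets an IDF of zero and drops out, while rarer terms are weighted up.

from gensim import corpora, models

# Three tokenized toy "documents"
texts = [['cat', 'sat'], ['cat', 'fish'], ['cat', 'mat', 'sat']]
dic = corpora.Dictionary(texts)
tfidf = models.TfidfModel(dictionary=dic)  # default idf: log2(N / df)

bow = dic.doc2bow(['cat', 'mat', 'sat'])
for term_id, weight in tfidf[bow]:
    print(dic[term_id], round(weight, 3))
# 'cat' occurs in all three documents, so its idf (and weight) is 0 and it is dropped;
# 'mat' (document frequency 1) outweighs 'sat' (document frequency 2).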

3. Simhash

Simhash is the algorithm Google uses to deduplicate massive volumes of text. It maps each document to a 64-bit hash code, then judges similarity by the Hamming distance D between the codes; empirically, D < 3 indicates that two documents are near-duplicates. [1]
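The fingerprint-based similarity test promised in the introduction then reduces to counting differing bits. A minimal sketch, assuming fingerprints are equal-length '0'/'1' strings like the ones get_simhash below produces:

def hamming_distance(f1, f2):
    # number of bit positions where the two fingerprints differ
    return sum(a != b for a, b in zip(f1, f2))

def is_near_duplicate(f1, f2, threshold=3):
    # empirical rule from [1]: Hamming distance below 3 means near-duplicates
    return hamming_distance(f1, f2) < threshold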

4. A decorator-based singleton

import logging

def singleton(cls):
    """Class decorator: create at most one instance of the decorated class."""
    instances = {}

    def wrapper(*args, **kwargs):
        if cls not in instances:
            instances[cls] = cls(*args, **kwargs)
            # log at creation time; the original logged at decoration time, when the cache is always empty
            logging.info('singleton created for %s (cache size %d)', cls.__name__, len(instances))
        return instances[cls]

    return wrapper
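A quick sanity check (the class name AppConfig is made up):

@singleton
class AppConfig(object):
    pass

a = AppConfig()
b = AppConfig()
assert a is b  # both names refer to the one cached instance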

5. Word segmentation

import string

import jieba
from zhon.hanzi import non_stops

jieba.load_userdict(DICT_PATH)  # DICT_PATH and STOPWORDS_PATH are configured elsewhere
with open(STOPWORDS_PATH, 'r', encoding='utf-8') as f:
    STOPS_LIST = {line.strip() for line in f}
# ASCII punctuation plus Chinese (non-stop) punctuation from zhon
C_PUNCTUATION = string.punctuation + non_stops

def cut_txt(txt, r_type=1):
    """
    Segment txt with jieba, dropping stop words and punctuation.
    :param txt: raw text
    :param r_type: 2 returns a token list; anything else returns a space-joined string
    :return:
    """
    segs = jieba.cut(txt, cut_all=False)
    segs = [word for word in segs
            if word.strip()  # the original `lstrip() is not None` test was always true
            and word.strip() not in STOPS_LIST
            and word.strip() not in C_PUNCTUATION]
    return segs if r_type == 2 else ' '.join(segs)
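For example (the exact tokens depend on the user dictionary and stop-word list, so the output shown is only indicative):

print(cut_txt('夏季怎么吃才能不犯困', r_type=2))
# e.g. ['夏季', '吃', '犯困'] — stop words and punctuation have been filtered out
print(cut_txt('夏季怎么吃才能不犯困'))
# e.g. '夏季 吃 犯困'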

6. The model class

# coding=utf-8
# Dictionary, model, and data loading
import logging
import time

import numpy as np
from gensim import corpora, models
from gensim.models import Word2Vec

class ForDict(object):
    """
    Stream a segmented corpus file line by line for dictionary building.
    """

    def __init__(self, file):
        self.file = file

    def __iter__(self):
        for line in open(self.file, 'r', encoding='utf-8'):
            yield line.lower().split()

# Stream sentences from a segmented corpus file for Word2Vec training
class ForWord2Vec(object):
    def __init__(self, in_file):
        self.in_file = in_file

    def __iter__(self):
        for line in open(self.in_file, encoding='utf-8'):
            yield line.split()


@singleton
class NlpModel(object):
    def __init__(self,
                 opts=None,  # optparse Values (or any object with an is_load_data attribute)
                 dic_path='%s%s.dict' % (MODEL_PATH, 'all'),
                 tfidf_path='%s%s.tfidf' % (MODEL_PATH, 'all'),
                 word2vec_path='%s%s.w2v' % (MODEL_PATH, 'all')):
        self.opts = opts
        self.has_load_data = False

        self.dic = None
        self.dic_path = dic_path

        self.tfidf = None
        self.tfidf_path = tfidf_path

        self.has_combined = False

        self.word2vec = None
        self.word2vec_path = word2vec_path

    def _load_dic(self):
        if not self.dic:
            logging.info('load dict...')
            self.dic = corpora.Dictionary.load(self.dic_path)
        else:
            logging.info('dict already loaded.')

    def _load_tfidf(self):
        if not self.tfidf:
            self.tfidf = models.TfidfModel.load(self.tfidf_path)
        else:
            logging.info('tfidf already loaded.')

    def _load_word2vec(self):
        if not self.word2vec:
            self.word2vec = Word2Vec.load(self.word2vec_path)
        else:
            logging.info('word2vec already loaded.')

    def _load_data(self):
        # load_data() and combine_cut_data() fetch and segment the raw corpus; they are defined elsewhere.
        # getattr() also covers the case where no opts object was passed in.
        if (not self.has_load_data) and getattr(self.opts, 'is_load_data', False):
            logging.info('load data...')
            load_data()
            combine_cut_data()
            self.has_load_data = True
        else:
            logging.info('data already loaded.')

    def train_corpus_dic(self):
        """
        Build the corpus dictionary.
        :return:
        """
        t0 = time.time()
        # 1. load the data
        self._load_data()

        # 2. build the dictionary from the segmented corpus
        logging.info('create dictionary.')
        dict_data = ForDict('%s%s.cut' % (CORPUS_PATH, 'all'))
        dictionary = corpora.Dictionary(dict_data)

        # 3. save the dictionary
        logging.info('save dictionary.')
        dictionary.save('%s%s.dict' % (MODEL_PATH, 'all'))

        # 4. log the result
        logging.info('number of words:%d; number of docs:%d; number of positions:%d; cost time:%f' % (
            len(dictionary.keys()), dictionary.num_docs, dictionary.num_pos, time.time() - t0))

    def train_tfidf(self, opts=None):
        """
        Train the TF-IDF model.
        :param opts:
        :return:
        """
        t0 = time.time()
        logging.info('train_tfidf begin...')

        # 1. load the dictionary
        self._load_dic()

        # 2. fit tfidf (identity_a and df2idf_a are custom local/global weighting functions defined elsewhere)
        logging.info('compute tfidf...')
        tfidf = models.TfidfModel(dictionary=self.dic, wlocal=identity_a, wglobal=df2idf_a)

        # 3. save tfidf
        logging.info('save tfidf...')
        tfidf.save('%s%s.tfidf' % (MODEL_PATH, 'all'))
        logging.info('train_tfidf end. cost:%f' % (time.time() - t0))

    def train_word2vec(self):
        """
        Train word2vec.
        :return:
        """
        t0 = time.time()
        # 1. load the data
        self._load_data()

        # 2. train word2vec (gensim 3.x API; newer versions renamed `size` to `vector_size`)
        logging.info('begin word2vec...')
        sentences = ForWord2Vec('%s%s.cut' % (CORPUS_PATH, 'all'))
        m_word2vec = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)

        # 3. save word2vec
        m_word2vec.save('%s%s.w2v' % (MODEL_PATH, 'all'))
        logging.info('end word2vec. cost:%fs' % (time.time() - t0))

    def get_keyword(self, doc_str=None, top_k=10, is_blank=False):
        """
        Keyword extraction: rank the document's terms by TF-IDF weight.
        :param doc_str: the document text
        :param top_k: number of keywords to return
        :param is_blank: True if doc_str is already space-separated tokens
        :return: a list of (word, weight) pairs
        """
        t0 = time.time()
        logging.info('get_keyword begin...')
        # 1. load the dictionary
        self._load_dic()
        # 2. load tfidf
        logging.info('load tfidf...')
        self._load_tfidf()
        # 3. score the terms and keep the top_k by weight
        rs = None
        if doc_str:
            logging.info('length of doc_str:%s' % (len(doc_str)))
            vec_bow = self.dic.doc2bow(doc_str.split() if is_blank else cut_txt(doc_str, r_type=2), allow_update=False)
            vec_tfidf = self.tfidf[vec_bow]
            vec_tfidf.sort(key=lambda x: x[1], reverse=True)
            rs = [(self.dic[term_id], weight) for term_id, weight in vec_tfidf[:top_k]]
        logging.info('get_keyword end. cost time:%f' % (time.time() - t0))
        return rs

    def _string_hash(self, source):
        """
        Hash a word to a 64-character '0'/'1' string (a variant of the classic
        Python 2 string hash, masked to the low 64 bits).
        :param source: the word to hash
        :return: a 64-character bit string
        """
        if source == "":
            return '0' * 64  # keep the return type consistent for empty input
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** 128 - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        # x is non-negative after masking, so CPython's "-1 becomes -2" rule never applies;
        # keep the lowest 64 bits as a zero-padded binary string
        return bin(x).replace('0b', '').zfill(64)[-64:]

    def get_simhash(self, content, top_k=200):
        """
        Compute a 64-bit simhash fingerprint of an article from its top TF-IDF keywords.
        :param content: the article text
        :param top_k: number of keywords used as features
        :return: a 64-character '0'/'1' string ('00' if no features were found)
        """
        t0 = time.time()
        logging.info('get simhash.')
        simhash = []
        try:
            keywords = self.get_keyword(content, top_k=top_k, is_blank=False)
            key_list = []
            for feature, weight in keywords:
                weight = round(weight, 6)
                bits = self._string_hash(feature)
                # signed weight vector: +weight where the hash bit is 1, -weight where it is 0
                key_list.append([weight if b == '1' else -weight for b in bits])
            if not key_list:
                return '00'
            # sum the weighted bit vectors and binarize each position by its sign
            sums = np.sum(np.array(key_list), axis=0)
            simhash = ['1' if s > 0 else '0' for s in sums]
        except Exception as e:
            logging.error('Simhash.simhash err:%s' % e)
        finally:
            logging.info('get hash end. cost time:%f' % (time.time() - t0))
        return ''.join(simhash)

    def get_word_similar(self, word='婴儿', topn=10):
        """
        Retrieve the topn words most similar to `word` from the word2vec model.
        :param word:
        :param topn:
        :return: a list of (word, similarity) pairs
        """
        t0 = time.time()
        self._load_word2vec()
        rs = self.word2vec.wv.similar_by_word(word, topn=topn)
        logging.info('word similar end. cost time %s' % (time.time() - t0))
        return rs

7. Command-line options

from optparse import OptionParser

usage = "usage: %prog [options] arg1 arg2"
MODEL_OP = OptionParser(usage=usage)
MODEL_OP.add_option("-t", "--type",
                    # action="store_true",
                    dest='type',
                    default='keyword',
                    help="types[keyword,dict,tfidf,word2vec]")
MODEL_OP.add_option("-o", "--output",
                    dest="model",
                    default="test.model",
                    help="output model file name")
MODEL_OP.add_option("-l", "--is_load_data",
                    action="store_true",
                    dest="is_load_data",
                    default=False,
                    help="whether will be loaded data or not")

8. Logging setup

import datetime
import logging

# Write logs to a dated file (LOGPATH and LOGMODE are configured elsewhere)
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)-8s: %(threadName)s %(module)s::%(filename)s::%(funcName)s[line:%(lineno)d] %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    filemode=LOGMODE,
                    filename='%s/model_%s.log' % (LOGPATH, datetime.datetime.now().strftime('%Y-%m-%d'))
                    )

console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter(
    '%(asctime)s %(levelname)-8s: %(threadName)s %(module)s::%(filename)s::%(funcName)s[line:%(lineno)d] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

logging.info("""
   train...train...train...
   *   *   *   *   *   *
     -       -       -
   train...train...train...
""")

9. The main entry point

import sys

if __name__ == '__main__':
    logging.info('start args:%s', str(sys.argv))
    (opts, args) = MODEL_OP.parse_args(sys.argv[1:])
    if len(args) > 0:
        MODEL_OP.error("this script takes no arguments.")  # error() prints and exits on its own

    model = NlpModel(opts)
    if 'dict' == opts.type:  # corpus dictionary
        logging.info('dict')
        model.train_corpus_dic()
    elif 'tfidf' == opts.type:  # tfidf model
        logging.info('tfidf')
        model.train_tfidf()
    elif 'word2vec' == opts.type:  # word2vec model
        logging.info('word2vec')
        model.train_word2vec()
    else:
        pass

10. Testing

model = NlpModel()
print(model.get_simhash(content="""
夏季怎么吃才能不犯困 ;钾是人体内不可缺少的元素,一般成年人体内的含钾元素150g左右,其作用主要是维持神经、肌肉的正常功能因此,人体一旦缺钾,正常的运动就会受到影响夏季缺钾不仅精力和体力下降,而且耐热能力也会降低,使人感到倦怠无力严重缺钾时,可导致人体内酸碱平衡失调、代谢紊乱、心律失常,全身肌肉无力、懒动此时,有些人为了使自己少出汗而过量地饮用盐开水殊不知,这样做又容易加重心脏负担,使体内钾、钠平衡失调而适当补充钾元素则有利于改善体内钾、钠平衡,既可以防止血压上升,又可防止血压过低下面介绍一些含钾元素较高的食物困了怎么办?告诉你几种防犯困食物一、粮食中,以荞麦、玉米、红薯、大豆等含钾元素较高二、水果中,以香蕉含钾元素最丰富三、蔬菜中,以菠菜、苋菜、香菜、油菜、甘蓝、芹菜、大葱、青蒜、莴笋、土豆、山药、鲜豌豆、毛豆等含钾元素较高四、海藻类,含钾元素相当丰富,如紫菜每百克含钾1640毫克,是含钠的175倍;海带含钾是含钠的22倍;羊栖菜含钾是钠的3.1倍因此,紫菜汤、紫菜蒸鱼、紫菜肉丸、凉拌海带丝、海带炖肉等都是夏季补钾菜肴的上品特别提醒司机:在生活中,服用有些药物后,可能会出现不同程度的疲倦、嗜睡、困乏和精神不振等,因此在服药后宜稍事休息或小睡,不宜马上驾车,尤其是夏季,驾车族本来就容易犯晕,更要当心“犯困药”可引起驾车族嗜睡或犯困的药有:抗感冒药、抗过敏药、镇静催眠药、抗偏头痛药和治胃反酸药等对驾车族而言,生病时既要吃药,又要保证行车安全,因此合理用药显得格外重要特别需要提醒的是,在上车前4小时尽量不要服药,或是服药后休息6小时再开车;对易产生嗜睡或昏迷的药,服用最佳时间为睡前半小时,既减少对日常生活所带来的不便,又能促进睡眠有些抗感冒药分为日片或夜片,日片不含抗过敏药,极少引起嗜睡,白天宜尽量选用白片对已知有不良反应但离不开的药,上车前可减半量服用,等休息时再补足全量"大师"王林因病死亡,王林大师是怎么死的?2017年泰国10大女网红比中国女网红好看一百倍14岁女酒吧坐台‘事业线’外露一点不害臊身体暴露
"""))

print(model.get_keyword(doc_str="""
夏季怎么吃才能不犯困 ;钾是人体内不可缺少的元素,一般成年人体内的含钾元素150g左右,其作用主要是维持神经、肌肉的正常功能因此,人体一旦缺钾,正常的运动就会受到影响夏季缺钾不仅精力和体力下降,而且耐热能力也会降低,使人感到倦怠无力严重缺钾时,可导致人体内酸碱平衡失调、代谢紊乱、心律失常,全身肌肉无力、懒动此时,有些人为了使自己少出汗而过量地饮用盐开水殊不知,这样做又容易加重心脏负担,使体内钾、钠平衡失调而适当补充钾元素则有利于改善体内钾、钠平衡,既可以防止血压上升,又可防止血压过低下面介绍一些含钾元素较高的食物困了怎么办?告诉你几种防犯困食物一、粮食中,以荞麦、玉米、红薯、大豆等含钾元素较高二、水果中,以香蕉含钾元素最丰富三、蔬菜中,以菠菜、苋菜、香菜、油菜、甘蓝、芹菜、大葱、青蒜、莴笋、土豆、山药、鲜豌豆、毛豆等含钾元素较高四、海藻类,含钾元素相当丰富,如紫菜每百克含钾1640毫克,是含钠的175倍;海带含钾是含钠的22倍;羊栖菜含钾是钠的3.1倍因此,紫菜汤、紫菜蒸鱼、紫菜肉丸、凉拌海带丝、海带炖肉等都是夏季补钾菜肴的上品特别提醒司机:在生活中,服用有些药物后,可能会出现不同程度的疲倦、嗜睡、困乏和精神不振等,因此在服药后宜稍事休息或小睡,不宜马上驾车,尤其是夏季,驾车族本来就容易犯晕,更要当心“犯困药”可引起驾车族嗜睡或犯困的药有:抗感冒药、抗过敏药、镇静催眠药、抗偏头痛药和治胃反酸药等对驾车族而言,生病时既要吃药,又要保证行车安全,因此合理用药显得格外重要特别需要提醒的是,在上车前4小时尽量不要服药,或是服药后休息6小时再开车;对易产生嗜睡或昏迷的药,服用最佳时间为睡前半小时,既减少对日常生活所带来的不便,又能促进睡眠有些抗感冒药分为日片或夜片,日片不含抗过敏药,极少引起嗜睡,白天宜尽量选用白片对已知有不良反应但离不开的药,上车前可减半量服用,等休息时再补足全量"大师"王林因病死亡,王林大师是怎么死的?2017年泰国10大女网红比中国女网红好看一百倍14岁女酒吧坐台‘事业线’外露一点不害臊身体暴露
""", is_blank=False))
ws = [
    # '感冒',
    # '高血压'
    # '维生素',
    # '乙肝',
    '婴儿',
    # '小猴子',
    # '营养品',
    # '盆腔炎',
    # '咽喉炎',
    # '高尿酸',
    # '高胆固醇血症'
]
for w in ws:
    print(w)
    for pair in model.get_word_similar(w, topn=30):
        print(pair)
Results:
0010010001000010000000101100110011100011010010101000100010111111
[('含钾', 0.3054520633861392), ('钾', 0.28989261631141955), ('元素', 0.28500796381274923), ('驾车', 0.23984964041886256), ('犯困', 0.22359016443278812), ('缺钾', 0.20313506276231016), ('日片', 0.19394250188386772), ('药', 0.18726227727860778), ('嗜睡', 0.18180214856953114), ('夏季', 0.14477385758300293)]
For word2vec:
婴儿
('新生儿', 0.8387792110443115)
('早产儿', 0.78364098072052)
('宝宝', 0.7606385946273804)
('小宝宝', 0.7359695434570312)
('婴幼儿', 0.7072071433067322)
('幼儿', 0.6674544811248779)
('婴儿期', 0.657639741897583)
('宝贝', 0.6358252763748169)
('体重儿', 0.6267213821411133)
('足月儿', 0.6226769685745239)
('男婴', 0.6199154853820801)
('小孩', 0.6098051071166992)
('孩子', 0.6038563847541809)
('乳母', 0.5991641283035278)
('胎儿', 0.5983285903930664)
('小儿', 0.5968100428581238)
('刚出生', 0.5956158638000488)
('出生', 0.594602108001709)
('儿童', 0.589705765247345)
('喂养', 0.5812875032424927)
('母乳', 0.5803599953651428)
('母乳喂养', 0.5779118537902832)
('崽', 0.5771138668060303)
('月龄', 0.5713974237442017)
('婴', 0.5667303204536438)
('患儿', 0.5656975507736206)
('配方奶粉', 0.5621417760848999)
('新生儿期', 0.5526482462882996)
('母亲', 0.5463709831237793)
('孩童', 0.5433803796768188)

11. References

[1] simhash算法原理及实现 (simhash: principles and implementation). https://yanyiwu.com/work/2014/01/30/simhash-shi-xian-xiang-jie.html
[2] Part 3: The Simhash Algorithm. http://matpalm.com/resemblance/simhash/
[3] Charikar, M. (2002). Similarity Estimation Techniques from Rounding Algorithms. http://www.cs.princeton.edu/courses/archive/spring04/cos598B/bib/CharikarEstim.pdf
[4] tf–idf. https://en.wikipedia.org/wiki/Tf%E2%80%93idf
[5] Luhn, Hans Peter (1957). "A Statistical Approach to Mechanized Encoding and Searching of Literary Information". IBM Journal of Research and Development. 1 (4): 315. doi:10.1147/rd.14.0309.
[6] Spärck Jones, K. (1972). "A Statistical Interpretation of Term Specificity and Its Application in Retrieval". Journal of Documentation. 28: 11-21. doi:10.1108/eb026526.
[7] Hans Peter Luhn. https://en.wikipedia.org/wiki/Hans_Peter_Luhn
[8] Karen Spärck Jones. https://en.wikipedia.org/wiki/Karen_Sp%C3%A4rck_Jones

[happyprince, http://blog.csdn.net/ld326/article/details/79117241]
