A class-based implementation of deduplication

# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import jieba.posseg as pseg
import codecs
from gensim import corpora, models, similarities
from database import Database
from demo import Demo
import datetime

class CacuSimil(object):
    insert = 'INSERT INTO finance_new_news(content, date, id, tags, time,' \
             ' update_time, url, website) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'

    #def __init__(self):
        # self.db = Database()
        # self.db.connect('crawl_data')

    # Compute the comparison date (yesterday)
    def date(self):
        today = datetime.date.today()
        print(today)
        preone_time = today + datetime.timedelta(days=-1)
        self.preone_time_nyr = preone_time.strftime('%Y-%m-%d')  # format as YYYY-MM-DD
        return self.preone_time_nyr


    # Connect to the database; fetch the reference texts into saved_files and the freshly crawled rows into craw_file (both tuples of rows)
    def conn_db(self, day):
        db = Database()
        db.connect('crawl_data')
        sql_saved = "select content from finance_new_news where date >= '%s'" % (day)
        saved_files = db.query(sql_saved)
        print(saved_files)
        sql_craw = "select * from finance_old_news where date >= '%s'" % (day)
        #craw_file = list(db.query(sql_craw))
        craw_file = db.query(sql_craw)
        print(len(craw_file))
        db.close()
        return saved_files, craw_file
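
The database module imported above is not shown in the post. A minimal sketch of what the Database wrapper is assumed to look like, backed here by pymysql (class internals, host, and credentials are assumptions for illustration):

# database.py -- a sketch, not the author's actual implementation
import pymysql

class Database(object):
    def connect(self, db_name):
        # connection parameters are placeholders
        self.conn = pymysql.connect(host='localhost', user='root', password='',
                                    database=db_name, charset='utf8mb4')

    def query(self, sql):
        # returns a tuple of row tuples, matching how the caller indexes rows
        with self.conn.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchall()

    def execute(self, sql, params):
        with self.conn.cursor() as cursor:
            cursor.execute(sql, params)
        self.conn.commit()

    def close(self):
        self.conn.close()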


    # Tokenize one document and drop stop words
    def tokenization(self, content):
        # build the stop-word list (the original checked membership against the
        # un-stripped lines, so stop words were never actually filtered)
        stop_words_path = 'stop_words_ch.txt'
        with codecs.open(stop_words_path, 'r', encoding='utf-8') as f:
            stop_words = [w.strip() for w in f.readlines()]
        # POS tags dropped after jieba segmentation: punctuation, conjunctions,
        # particles, adverbs, prepositions, time morphemes, 'uj' (的), numerals,
        # localizers, pronouns
        stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r']

        result = []
        words = pseg.cut(content)
        for word, flag in words:
            if flag not in stop_flag and word not in stop_words:
                result.append(word)
        return result
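
As a quick sanity check, pseg.cut yields (word, flag) pairs, so the POS filter above works like this (the sample sentence and the exact segmentation are illustrative):

# illustrative check of the POS filter; the exact cut depends on jieba's dictionary
import jieba.posseg as pseg

for word, flag in pseg.cut('今天股市大幅上涨'):
    print(word, flag)  # e.g. 今天/t  股市/n  大幅/d  上涨/v
# with stop_flag above, the 't' and 'd' tokens are dropped,
# leaving content words such as 股市 and 上涨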

    # Build the corpus: one token list per saved document
    def gen_corpus(self, saved_files):
        corpus = []
        for content_tuple in saved_files:
            content = content_tuple[0]  # each row is a 1-tuple (content,); str() would tokenize the tuple repr
            corpus.append(self.tokenization(content))
        #print(corpus[0],corpus[1])
        print(len(corpus))
        return corpus

    def gen_Model(self, corpus):
        # Build the dictionary over the whole corpus (token -> integer id)
        dictionary = corpora.Dictionary(corpus)
        print(dictionary)

        # Step 1: bag-of-words vectors via doc2bow
        doc_vectors = [dictionary.doc2bow(text) for text in corpus]
        print(len(doc_vectors))
        print(doc_vectors)

        # Step 2: TF-IDF model on top of the BOW vectors
        tfidf = models.TfidfModel(doc_vectors)
        tfidf_vectors = tfidf[doc_vectors]
        print(len(tfidf_vectors))
        #print(len(tfidf_vectors[0]))
        return dictionary, tfidf_vectors
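
On a toy corpus the dictionary / BOW / TF-IDF chain behaves like this (a minimal sketch with two made-up documents):

# toy corpus: two already-tokenized documents
from gensim import corpora, models

corpus = [['股市', '上涨'], ['股市', '下跌']]
dictionary = corpora.Dictionary(corpus)              # token -> integer id
bow = [dictionary.doc2bow(doc) for doc in corpus]    # sparse (id, count) pairs
tfidf = models.TfidfModel(bow)                       # idf from document frequencies
print(tfidf[bow[0]])
# 股市 appears in both documents, so its idf (and weight) is 0 and it is
# dropped; only 上涨 remains, with weight 1.0 after normalization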


    # Map the candidate document (a row tuple) into the vector space via the
    # dictionary, then compute its cosine similarity against every saved document
    def get_similar(self, file, dictionary, tfidf_vectors):
        content = file[0]
        token = self.tokenization(content)
        token_bow = dictionary.doc2bow(token)
        print(token_bow)

        # note: the index is TF-IDF weighted but the query is raw BOW;
        # see the variant after this method for consistent weighting
        index = similarities.MatrixSimilarity(tfidf_vectors)
        sims = index[token_bow]
        result = list(enumerate(sims))
        print(result)
        #print(len(result))
        similar = []
        for i in range(len(result)):
            similar.append(result[i][1])
        return max(similar)
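
One caveat: the index above is built from TF-IDF vectors while the query token_bow stays raw bag-of-words, so the two sides use different weighting. A variant that also weights the query would look like this (a sketch; it assumes gen_Model is extended to return the tfidf model as well):

    # sketch: same lookup, but the TfidfModel is passed in and applied to the query
    def get_similar_weighted(self, file, dictionary, tfidf, tfidf_vectors):
        token_bow = dictionary.doc2bow(self.tokenization(file[0]))
        index = similarities.MatrixSimilarity(tfidf_vectors)
        sims = index[tfidf[token_bow]]  # weight the query before the lookup
        return max(sims)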

    def insert_or_not(self, score, craw_file):
        db = Database()
        db.connect('crawl_data')
        if score < 0.9:
            db.execute(self.insert, craw_file)
        db.close()
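
The first version has no driver code; wiring its methods together would look roughly like this (a sketch, assuming content is the first column of each crawled row and that the row's column order matches the INSERT statement above):

# sketch: driver for the first version, mirroring run() in the revised code below
if __name__ == '__main__':
    CS = CacuSimil()
    day = CS.date()
    saved_files, craw_files = CS.conn_db(day)
    corpus = CS.gen_corpus(saved_files)
    dictionary, tfidf_vectors = CS.gen_Model(corpus)
    for row in craw_files:
        # row[0] is assumed to be the content column
        score = CS.get_similar(row, dictionary, tfidf_vectors)
        CS.insert_or_not(score, row)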

Revised version:

# -*- coding: utf-8 -*-
import json
import pandas as pd
import numpy as np
import jieba.posseg as pseg
import codecs
from gensim import corpora, models, similarities
from database import Database
import grpc
import data_pb2
import data_pb2_grpc
import datetime

class CacuSimil(object):

    def __init__(self):
        self.db = Database()
        self.db.connect('crawl_data')
        # note: conn_db and insert_or_not below open their own connections,
        # so this shared connection is currently unused

    # Fetch the item to be matched (JSON) from the crawler over gRPC
    def get_craw(self):
        conn = grpc.insecure_channel('192.168.1.100:2333')
        client = data_pb2_grpc.FormatDataStub(channel=conn)
        response = client.DoFormat(data_pb2.Data(text='getData'))
        return json.loads(response.text)
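
The generated data_pb2 / data_pb2_grpc modules are not shown; the server they imply would look roughly like this (a sketch: the FormatData service, DoFormat method, and Data message with a text field follow from the client code above, everything else is an assumption):

# server-side sketch matching the client above; payload fields are placeholders
import json
from concurrent import futures
import grpc
import data_pb2
import data_pb2_grpc

class FormatDataServicer(data_pb2_grpc.FormatDataServicer):
    def DoFormat(self, request, context):
        if request.text == 'getData':
            # in the real system this would hand over a freshly crawled item
            item = {'content': '...', 'date': '...', 'id': '...', 'tags': '...',
                    'time': '...', 'update_time': '...', 'url': '...', 'website': '...'}
            return data_pb2.Data(text=json.dumps(item))
        return data_pb2.Data(text='{}')

def serve():
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
    data_pb2_grpc.add_FormatDataServicer_to_server(FormatDataServicer(), server)
    server.add_insecure_port('0.0.0.0:2333')
    server.start()
    server.wait_for_termination()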


    # Connect to the database and fetch the reference texts from the last day;
    # saved_files is a tuple of (content,) rows
    def conn_db(self):
        db = Database()
        db.connect('crawl_data')

        today = datetime.date.today()
        preone_time = today + datetime.timedelta(days=-1)
        day = preone_time.strftime('%Y-%m-%d')  # format as YYYY-MM-DD

        sql_saved = "select content from finance_news where date >= '%s'" % (day)
        saved_files = db.query(sql_saved)

        db.close()
        return saved_files


    # Tokenize one document and drop stop words
    def tokenization(self, content):
        # build the stop-word list (same fix as above: strip before membership tests)
        stop_words_path = 'stop_words_ch.txt'
        with codecs.open(stop_words_path, 'r', encoding='utf-8') as f:
            stop_words = [w.strip() for w in f.readlines()]
        # POS tags dropped after jieba segmentation: punctuation, conjunctions,
        # particles, adverbs, prepositions, time morphemes, 'uj' (的), numerals,
        # localizers, pronouns
        stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r']
        result = []
        words = pseg.cut(content)
        for word, flag in words:
            if flag not in stop_flag and word not in stop_words:
                result.append(word)
        return result

    # Build the corpus and derive the models from it
    def get_Model(self, saved_files):
        corpus = []
        for content_tuple in saved_files:
            content = content_tuple[0]  # each row is a 1-tuple (content,)
            corpus.append(self.tokenization(content))
        # Build the dictionary over the whole corpus
        dictionary = corpora.Dictionary(corpus)

        # Step 1: bag-of-words vectors via doc2bow
        doc_vectors = [dictionary.doc2bow(text) for text in corpus]
        # Step 2: TF-IDF model on top of the BOW vectors
        tfidf = models.TfidfModel(doc_vectors)
        tfidf_vectors = tfidf[doc_vectors]

        return dictionary, tfidf_vectors


    # Compute the candidate's similarity against every saved document
    def get_similar(self, craw_file, dictionary, tfidf_vectors):
        print(craw_file)
        content = craw_file['content']
        token = self.tokenization(content)
        token_bow = dictionary.doc2bow(token)

        index = similarities.MatrixSimilarity(tfidf_vectors)
        sims = index[token_bow]
        result = list(enumerate(sims))

        similar = []
        for i in range(len(result)):
            similar.append(result[i][1])
        return max(similar)

    # Insert into the database only if the best similarity is below the threshold
    def insert_or_not(self, score, craw_file):
        db = Database()
        db.connect('crawl_data')

        insert = 'INSERT INTO finance_news(content, date, id, tags, time,' \
                 ' update_time, url, website) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'

        if score < 0.9:
            db.execute(insert, [craw_file['content'], craw_file['date'], craw_file['id'],
                                craw_file['tags'], craw_file['time'], craw_file['update_time'],
                                craw_file['url'], craw_file['website']])
            print(craw_file)
        db.close()



def run():
    CS = CacuSimil()
    #while True:
    craw_file = CS.get_craw()
    print(craw_file)
    if craw_file:
        saved_files = CS.conn_db()
        dictionary, tfidf_vectors = CS.get_Model(saved_files)
        similar = CS.get_similar(craw_file, dictionary, tfidf_vectors)
        CS.insert_or_not(similar, craw_file)


if __name__ == '__main__':
    run()
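
The commented-out while True: in run() suggests the matcher was meant to run continuously; a polling variant would look like this (a sketch; the 60-second interval is an arbitrary choice):

import time

def run_forever():
    CS = CacuSimil()
    while True:
        craw_file = CS.get_craw()
        if craw_file:
            saved_files = CS.conn_db()
            dictionary, tfidf_vectors = CS.get_Model(saved_files)
            similar = CS.get_similar(craw_file, dictionary, tfidf_vectors)
            CS.insert_or_not(similar, craw_file)
        time.sleep(60)  # polling interval is an assumption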

