匹配关键字,给新闻贴上个股标签

贴上个股标签

import csv
import pandas as pd
from database import  Database
#from connect_keywords.database import Database

def _read_csv_rows(path):
    """Return every row of the CSV file at *path* as a list of string lists.

    The file is read completely and closed before returning, so no file
    handle stays open for the rest of the script.
    """
    # newline='' is what the csv module documentation asks for, so that
    # line breaks inside quoted fields are parsed correctly.
    with open(path, 'r', newline='') as handle:
        return list(csv.reader(handle))


# Each name below is an iterable of rows, consumed by the loops that follow.
csv_file = _read_csv_rows('finace_news_content.csv')
csv_keyword = _read_csv_rows('Keyword.csv')
csv_select_one = _read_csv_rows('select_one.csv')
csv_select_double = _read_csv_rows('select_double.csv')

# Build contents: one dict per news row, pairing the original database ID
# (column 0) with the text (column 1, leading/trailing tabs and newlines removed).
contents = [
    {'id': record[0], 'content': record[1].strip('\t\n')}
    for record in csv_file
]
print(contents)

# Map column 1 of every keyword row to the list of its remaining columns
# (index 2 onward). A later row with the same column 1 replaces an earlier one.
keyword_dict = {record[1]: record[2:] for record in csv_keyword}
print(keyword_dict)

# From column 0 of each row, keep the segment that follows the first '%'
# (up to the next '%', if there is one).
select_one = [str(record[0]).split('%')[1] for record in csv_select_one]
print(select_one)

# From column 0 of each row, take the segment that follows the first '%'
# and split it on commas, giving one list of words per row.
select_two = [
    str(record[0]).split('%')[1].split(',')
    for record in csv_select_double
]
print(select_two)

def _is_excluded(content, single_words, word_pairs):
    """Return True when *content* hits one of the exclusion rules.

    A text is excluded when it contains any word of *single_words*, or when
    it contains both the first and the second word of any entry of
    *word_pairs*.
    """
    if any(word in content for word in single_words):
        return True
    return any(
        pair[0] in content and pair[1] in content
        for pair in word_pairs
    )


def _split_keywords(fields):
    """Return the non-blank comma-separated keywords stored in ``fields[0]``.

    An empty *fields* list yields no keywords instead of raising IndexError.
    Blank keywords are dropped because an empty string is a substring of
    every text and would tag every news item with the stock.
    """
    if not fields:
        return []
    return [keyword for keyword in str(fields[0]).split(',') if keyword.strip()]


double_id = []
texts = []
connect_dict = {}

# Split each stock's keyword string once instead of once per news item.
_stock_keywords = {
    key: _split_keywords(value) for key, value in keyword_dict.items()
}
# Maps a news id to its entry in texts, replacing a scan of texts per match.
_entry_by_id = {}

for text in contents:
    content = text['content']
    if _is_excluded(content, select_one, select_two):
        continue
    # Walk every stock together with its keywords.
    for key, keywords in _stock_keywords.items():
        # One matching keyword is enough to tag the text with this stock.
        if not any(keyword in content for keyword in keywords):
            continue
        entry = _entry_by_id.get(text['id'])
        if entry is None:
            # First stock matched for this news id.
            entry = {'id': text['id'], 'stoc_id': [key], 'content': content}
            texts.append(entry)
            _entry_by_id[text['id']] = entry
        elif key not in entry['stoc_id']:
            # Additional stock for an already tagged id: record it once, even
            # when several keywords of the same stock occur in the text.
            entry['stoc_id'].append(key)
            double_id.append(entry['id'])
print(texts)
print(double_id)



def run():
    """Insert every entry of the module-level ``texts`` list into the database.

    For each entry one ``INSERT IGNORE`` statement is executed against the
    ``news_connect_keywords`` table with the news id, the content and the
    ``str()`` form of the entry's stock id list. The connection is closed
    even when an insert raises.
    """
    insert = 'INSERT IGNORE INTO news_connect_keywords(news_id, content, stock_id) VALUES (%s, %s, %s)'

    db = Database()
    db.connect('news_connect_keyword')
    try:
        for data in texts:
            # The list of stock ids is stored as a single string column.
            db.execute(insert, [data['id'], data['content'], str(data['stoc_id'])])
    finally:
        # Close the connection on both the success and the error path.
        db.close()

# if __name__ == '__main__':
#      run()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值