做情感分析还是需要结合情景和业务,之前直接用词典库效果太差,准备自建金融词典构建
语料库,呃呃呃,所有的词汇来源dict_myself
1.计算TF-IDF,然后排序,得到的词可能会有和情感词典中重复的
#coding=UTF-8
"""
author:susuxuer
function:构建金融领域词汇
参考文献:https://www.cnblogs.com/en-heng/p/5848553.html
"""
import jieba.posseg as pseg
import numpy as np
import pandas as pd
import jieba
import time
import csv
import sys
import glob
import os
from collections import Counter
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from collections import defaultdict
from gensim import corpora,models
#调用停用词
def loadPoorEnt(path2 = 'G:/project/sentimation_analysis/data/stopwords.csv'):
    """Load the stopword list, one word per line.

    Args:
        path2: Path to a UTF-8 text file with one stopword per line.

    Returns:
        list[str]: The stripped lines of the file. Blank lines are kept
        as empty strings, matching the original behaviour.
    """
    # Use a context manager so the file handle is always closed
    # (the original opened the file and never closed it).
    with open(path2, encoding='UTF-8') as csvfile:
        return [line.strip() for line in csvfile]
stop_words=loadPoorEnt()
#读取所有文件路径
def get_all_content():
    """Collect the paths of all earnings-call transcript CSV files.

    Returns:
        list[str]: Paths matching the data-directory glob pattern;
        empty if the directory does not exist.
    """
    # NOTE(review): data directory is hard-coded — consider making it a parameter.
    return glob.glob(r'D:/GFZQ/GFZQ/xuesu2018/xuesu/*.csv')
#获取文本信息
def get_wenben(path):
    """Read a CSV file and return its rows.

    The original returned a ``csv.reader`` bound to a file handle that was
    never closed; reading eagerly lets the handle be released while callers
    can still iterate the result the same way.

    Args:
        path: Path to a UTF-8 CSV file.

    Returns:
        list[list[str]]: Every row of the file as a list of strings.
    """
    with open(path, 'r', encoding='UTF-8') as csvfile:
        return list(csv.reader(csvfile))
# 进行句子的切分,选取v、a、d
def cut(data):
    """Segment text and keep only adverbs (d), adjectives (a) and verbs (v).

    Words found in the module-level ``stop_words`` list are discarded.

    Args:
        data: Raw text to segment with jieba POS tagging.

    Returns:
        list[list[str]]: A single-element list wrapping the kept words,
        matching the shape downstream code expects.
    """
    kept_flags = ('d', 'a', 'v')
    # Set lookup is O(1) per token vs the original O(n) list scan;
    # also avoids shadowing the builtin name `list`.
    stop_set = set(stop_words)
    words = [item.word
             for item in pseg.cut(data)
             if item.word not in stop_set and item.flag in kept_flags]
    return [words]
#每篇业绩说明会选取部分词汇