#! /usr/bin/python
# -*- coding: utf8 -*-
# @Time : 2018/8/29 15:15
# @Author : yukang
import re

import jieba
import jieba.analyse
import jieba.posseg as pseg
from nltk.probability import FreqDist
sentence = """文本数据 此处放入一个文本""".replace(" ","")
class KeyWord:
    """Text analysis (test)."""
    def chinese_stopwords(self, filepath):
        """
        Load the stop-word list.
        :param filepath: absolute path of the stop-word file
        :return: stopwords
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            stopwords = [line.strip() for line in f]
        return stopwords
    def seg_sentence(self, sentence):
        """
        Segment a sentence into words, dropping stop words.
        :param sentence: the str to analyse
        :return: outstr
        """
        sentence_seged = jieba.cut(sentence.strip())
        stopwords = self.chinese_stopwords('./data_terms/stopWord.txt')  # path of the stop-word file
        outstr = []
        for word in sentence_seged:
            # keep tokens that are neither stop words nor whitespace characters
            if word not in stopwords and word != '\t' and word != '\n':
                outstr.append(word)
        return outstr
    def extract_t(self, content):
        """
        Extract keywords with their TF-IDF weights.
        :param content: text content
        :return: None (prints each keyword and its weight)
        """
        keywords = jieba.analyse.extract_tags(content, topK=20, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v'))
        # iterate over the extraction result
        for item in keywords:
            # keyword and its corresponding weight
            print(item[0], item[1])
    def word_pseg(self, word_str):
        """Extract nouns (and custom-dictionary words) from a string."""
        words = pseg.cut(word_str)
        word_list = []
        for wds in words:
            # Keep words from the custom dictionary and general nouns. Words from the
            # custom dictionary default to the POS flag 'x' when no POS is configured;
            # person names (flags starting with 'nr') are excluded.
            if (wds.flag == 'x' and wds.word != ' ' and wds.word != 'ns') or \
                    (re.match(r'^n', wds.flag) is not None and re.match(r'^nr', wds.flag) is None):
                word_list.append(wds.word)
        return word_list
    def all_pseg(self, word_str):
        """Return the word and POS flag of the first token in word_str."""
        words = pseg.cut(word_str)
        for word, flag in words:
            # return immediately: only the first token's POS is needed by the callers
            return word, flag
    def sort_item(self, item):
        """
        Sort (word, count) pairs by count in descending order.
        :param item: an iterable of (word, count) pairs
        :return: List
        """
        vocab = []
        for k, v in item:
            vocab.append((k, v))
        List = list(sorted(vocab, key=lambda v: v[1], reverse=True))
        return List
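
# Illustrative helper, not part of the original script: print every token with its
# POS flag so the filtering rule used in word_pseg can be checked on a sample text.
# The function name show_pos_flags is an assumption introduced here for illustration.
def show_pos_flags(text):
    for wds in pseg.cut(text):
        print(wds.word, wds.flag)
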
word = KeyWord()
pseg_sen = word.word_pseg(sentence)
seg = word.seg_sentence("".join(pseg_sen))
fdist = FreqDist(seg)
Sum = len(seg)
pre = 0
# # A fixed number of top keywords does not always fit the data, so the cumulative
# # percentage over all keywords is used instead (cut off at roughly 0.5);
# # Sum is the total number of keyword tokens.
# for (s, n) in word.sort_item(fdist.items()):
#     print(s + str(float(n) / Sum) + " " + str(n) + '\r\n')
#     pre = pre + float(n) / Sum
#     if pre > 0.5:
#         print(pre)
#         break
all_participle = word.seg_sentence(sentence)
fdists = FreqDist(all_participle)
Sums = len(all_participle)  # total number of tokens after stop-word removal
for (s, n) in word.sort_item(fdists.items()):
    words, flag = word.all_pseg(s)
    print(s + str(float(n) / Sums) + " " + str(n) + " " + flag)
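
# Illustrative call, not in the original flow: print the top 20 TF-IDF keywords and
# their weights for the same placeholder text using the extract_t method defined above.
word.extract_t(sentence)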