import sqlite3
import pandas as pd
import numpy as np
import jieba
from collections import Counter
from pyecharts import WordCloud
class MyWordCloud:
    '''
    Build a word cloud from one text column of a SQLite table.

    Pipeline: load the table into a DataFrame, segment the chosen column with
    jieba, drop stop words, count word frequencies and render them with
    pyecharts' WordCloud chart.
    '''

    def __init__(self, db_path, table_name, stop_word_file_path, field_name,
                 user_dict_path='user_dic.txt'):
        '''
        Load the table and remember the filtering configuration.

        :param db_path: path of the SQLite database file
        :param table_name: table to read; it is formatted into the SQL text,
            so it must come from trusted code, never from user input
        :param stop_word_file_path: UTF-8 text file with one stop word per
            line; a falsy value means "no stop word file"
        :param field_name: column whose text will be segmented
        :param user_dict_path: jieba user dictionary file (words that must
            not be split); defaults to the previously hard-coded name
        '''
        self.sqlit3_db = sqlite3.connect(db_path)
        # NOTE(security): a table name cannot be bound as a SQL parameter,
        # which is why it is interpolated here. Keep it caller-controlled.
        sql = 'select * from {}'.format(table_name)
        self.content_from_db = pd.read_sql(sql, self.sqlit3_db)
        # Stop words: tokens to drop (e.g. particles and pronouns).
        self.stop_words = []
        self.stop_word_path = stop_word_file_path
        self.field_filter = field_name
        # The return value is stored only to keep the attribute that the
        # original class exposed.
        self.user_dic = jieba.load_userdict(user_dict_path)

    def close(self):
        '''
        Close the SQLite connection opened by the constructor.

        The table content was already copied into ``content_from_db``, so
        the other methods keep working after this call.
        '''
        self.sqlit3_db.close()

    def stop_word_file(self,):
        '''
        Load the stop word file into ``self.stop_words``.

        Each non-empty line becomes one stop word. Words already present are
        not added again, so calling this method repeatedly is harmless.
        When no file path was configured a hint is printed instead.

        :return: None
        '''
        if not self.stop_word_path:
            print('未添加词云停止词库,请添加停止词库文件或者开启filter_more为True')
            return
        with open(self.stop_word_path, encoding='utf-8') as f:
            lines = f.read().split('\n')
        # Add the words one by one: appending the whole list as a single
        # element would make every later "word in stop_words" test fail.
        known = set(self.stop_words)
        for word in lines:
            if word and word not in known:
                known.add(word)
                self.stop_words.append(word)

    def mk_word_list(self, record_count=None, filter_more_bol=True):
        '''
        Segment the configured column and return the remaining words.

        :param record_count: number of leading records to analyse;
            ``None`` means all records
        :param filter_more_bol: when true, additionally apply
            :meth:`filter_more` to every record's tokens
        :return: list of words (duplicates kept, so they can be counted)
        '''
        column = self.content_from_db[self.field_filter]
        if record_count is not None:
            column = column.head(record_count)
        # Build the lookup set once instead of scanning a list per token.
        stop_set = set(self.stop_words)
        word_list = []
        for record_one in column:
            # NULL/NaN or other non-text cells cannot be segmented.
            if not isinstance(record_one, str):
                continue
            words = jieba.cut(record_one)
            if filter_more_bol:
                words = self.filter_more(words)
            word_list.extend(word for word in words if word not in stop_set)
        return word_list

    def filter_more(self, words: list):
        '''
        Keep only alphabetic tokens that are longer than one character.

        ``str.isalpha`` accepts both CJK characters and Latin letters, so
        digits, punctuation and whitespace tokens are dropped.

        :param words: iterable of tokens produced by jieba
        :return: list of the tokens that passed the filter
        '''
        return [word for word in words if word.isalpha() and len(word) > 1]

    def words_cloud(self, words_list: list, output_path='wordcloud.html'):
        '''
        Count the words and render the word cloud to an HTML file.

        :param words_list: words to visualise; repeated words raise the
            weight of that word
        :param output_path: file the chart is rendered to
        :return: None
        '''
        # most_common() sorts by descending frequency and yields plain
        # str/int values, which avoids handing pandas/numpy containers to
        # the chart library.
        ranked = Counter(words_list).most_common()
        words_show = [word for word, _ in ranked]
        words_count = [count for _, count in ranked]
        wd = WordCloud(width=1300, height=620)
        wd.add('', words_show, words_count, word_size_range=(20, 100))
        wd.render(output_path)

    def start(self, record_count=10):
        '''
        Run the whole pipeline: load stop words, segment, render.

        The individual steps can also be called one by one.

        :param record_count: number of database records to analyse,
            10 by default
        :return: None
        '''
        self.stop_word_file()
        word_list = self.mk_word_list(record_count=record_count)
        self.words_cloud(word_list)
if __name__ == '__main__':
    # Script entry point: analyse the `job_detail` column of the `recruit`
    # table and render the word cloud with the default record count.
    cloud_builder = MyWordCloud(
        db_path='recruit.db',
        table_name='recruit',
        stop_word_file_path='stopword.txt',
        field_name='job_detail',
    )
    cloud_builder.start()
# 所需要的三个文件:
# recruit.db:sqlite3 数据库文件,提供待分析的数据
# stopword.txt:停止词文件,把需要过滤掉的字符或字符串写入该文件,注意每个字符或字符串单独占一行
# user_dic.txt:jieba 自定义词典,写入不希望被切分的字符或字符串