Jieba分词斗罗大陆
1.相关包导入
import jieba
import numpy as np
import pandas as pd
2.斗罗大陆词库导入
# Load the Douluo Dalu word list (a GBK-encoded text file) into a DataFrame.
# NOTE(review): the file name contains a space ('词 库'); confirm it matches the
# file on disk.
dldlck = pd.read_csv('F:\\cqie3.2\\NLP\\斗罗大陆词 库.txt',encoding='gbk')
# Show the loaded table. Python names are case-sensitive, so this must be
# `dldlck`; the previous `Dldlck` was an undefined name and raised NameError.
dldlck
3.斗罗大陆文本导入
# Read the whole novel into memory as a single string. The `with` statement
# closes the file when the block exits, so no explicit close() call is needed.
with open('F:\\cqie3.2\\NLP\\斗罗大陆.txt',encoding='utf-8') as f:
    dldl = f.read()
(若代码报错,请用文本编辑器打开该文本文件,另存为 UTF-8 编码后再重新运行即可)
4.查看前五十个字符
# Preview the first 50 characters of the novel text.
dldl[0:50]
5.分词
# Tokenise the novel once and reuse the token list for both outputs below;
# running jieba over the full text separately for each step repeats the same
# work, and a bare `jieba.lcut(dldl)` whose result is not kept does nothing
# useful in a script.
test_word = jieba.lcut(dldl)
# Print every token, separated by '/'.
print("/".join(test_word))
# Print the tokens again with the two listed punctuation marks removed.
newlist = [w for w in test_word if w not in (',', '。')]
print(newlist)
6.停用词库导入
# Load the stop-word list into a one-column DataFrame (column 'w').
# The separator 'aaa' is presumably chosen because it never occurs in the
# file, so each line lands whole in the single column -- confirm against the
# data. A multi-character separator is only supported by the Python parsing
# engine, so request it explicitly instead of relying on pandas' implicit
# fallback, which emits a ParserWarning.
tmpdf = pd.read_csv(
    'F:\\cqie3.2\\NLP\\停用词.txt',
    names=['w'],
    sep='aaa',
    encoding='utf-8',
    engine='python',
)
tmpdf.head()
7.去除停用词后的分词
# Tokens to drop: every entry of the stop-word column plus space and newline.
# Build the collection once as a set so each membership test is O(1); building
# `list(tmpdf.w) + [...]` inside the loop re-creates and linearly scans the
# whole stop-word list for every token of the novel.
stop_tokens = set(tmpdf.w) | {" ", "\n"}
# Keep the remaining tokens in their original order.
finish_word = [tok for tok in jieba.cut(dldl) if tok not in stop_tokens]
finish_word
8.行数统计
# Load the novel into a DataFrame with a single column named 'txt', then
# report how many rows were read.
raw = pd.read_table(
    'F:\\cqie3.2\\NLP\\斗罗大陆.txt',
    names=['txt'],
    encoding='utf-8',
)
print(raw.shape[0])
40552
9.按章节划分
def m_head(tmpstr: str) -> str:
    """Return the first character of *tmpstr* ('' when it is empty)."""
    return tmpstr[:1]
def m_mid(tmpstr: str) -> int:
    """Return the index of the first '章' in *tmpstr*, or -1 if it is absent."""
    return tmpstr.find('章')
# Derive three helper columns from each line of text: its first character
# ('head'), the position of '章' ('mid'), and its length ('len').
for col_name, extractor in (('head', m_head), ('mid', m_mid), ('len', len)):
    raw[col_name] = raw['txt'].apply(extractor)
raw.head(50)
10.章节处理
# Label every row with a running chapter number, stored in the 'chap' column.
chapnum = 0
chapter_of_row = []
for head, mid, length, text in zip(raw['head'], raw['mid'], raw['len'], raw['txt']):
    # A row counts as a chapter heading when it starts with '第', contains '章'
    # after the first character, and is shorter than 20 characters.
    if head == '第' and mid > 0 and length < 20:
        chapnum += 1
    # Once at least 20 chapters have been counted, a row equal to this exact
    # line resets the counter to 0.
    # NOTE(review): presumably this line marks the start of material that
    # should not be numbered as a chapter -- confirm against the text.
    if chapnum >= 20 and text == '引子 穿越的唐家三少':
        chapnum = 0
    chapter_of_row.append(chapnum)
# Assign the whole column in one step instead of one `.loc` write per row;
# the chapter numbers are stored as integers.
raw['chap'] = chapter_of_row
# Remove the three helper columns in place, then preview the first 50 rows.
raw.drop(columns=['head', 'mid', 'len'], inplace=True)
raw.head(50)
11.章节验证
# Spot-check the labelling: show the first rows whose 'chap' value is 50.
raw.loc[raw['chap'] == 50].head()
12.画词云
from wordcloud import WordCloud
from PIL import Image
import numpy as np
# Configure the word cloud: white background, 800x600 canvas, font sizes
# limited to the range 0-200.
wc = WordCloud(
    background_color='white',
    width=800,
    height=600,
    min_font_size=0,
    max_font_size=200,
    # Raw string: in a normal string literal `\W`, `\F` and `\S` are invalid
    # escape sequences, which newer Python versions warn about. The raw
    # literal has the same runtime value.
    # NOTE(review): presumably a font with Chinese glyphs so the tokens
    # render -- confirm the file exists on the target machine.
    font_path=r'C:\Windows\Fonts\STXINGKA.TTF',
)
# generate() is given one text string, so the filtered tokens are joined with
# spaces.
wc.generate(' '.join(finish_word))
wc.to_image()