# 文章分析见:拂羽:Python之三国演义(上)zhuanlan.zhihu.com 拂羽:Python之三国演义(下)zhuanlan.zhihu.com
#!/usr/bin/env python
# coding: utf-8
# In[47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import jieba #需要安装:pip install jieba
from pandas import read_csv
from scipy.cluster.hierarchy import dendrogram,ward
from scipy.spatial.distance import pdist,squareform
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
import nltk
from nltk.cluster.kmeans import KMeansClusterer
# In[2]:
## Font setup and pandas display options.
## NOTE(review): the fname path must exist on this machine (FontProperties
## stores the path; failure only surfaces when the font is used).
# Use a raw string so the Windows path's backslashes are not parsed as
# (invalid) escape sequences such as "\S", "\F", "\T".
font = FontProperties(fname=r"C:\Windows\Fonts\STFANGSO.TTF", size=14)
# Show at most 8 rows when printing a DataFrame.
pd.set_option("display.max_rows", 8)
# Silence SettingWithCopyWarning from chained assignment (default='warn').
pd.options.mode.chained_assignment = None
# In[3]:
# In[3]:
## Load the stop-word list, the custom dictionary, and the novel text
## (one paragraph per row).
_stop_path = r"E:\bigdata\sanguoTest2\my_stop_words.txt"
_dict_path = r"E:\bigdata\sanguoTest2\red_dictionary.txt"
_text_path = r"E:\bigdata\sanguoTest2\sanguo.txt"

stopword = pd.read_csv(_stop_path, header=None, names=["Stopwords"])
mydict = pd.read_csv(_dict_path, header=None, names=["Dictionary"])
print(stopword)
print("---------------------------------")
print(mydict)
RedDream = pd.read_csv(_text_path, header=None, names=["Reddream"])
RedDream
# In[4]-In[8]:
## Drop blank rows and unwanted section markers, re-indexing afterwards.
# Count missing values per column (notebook display; no effect as a script).
np.sum(pd.isnull(RedDream))

# Remove volume headers ("正文…") and pagination rows ("分节阅读…") with the
# same regex-filter step, resetting the index after each removal.
for _pattern in ("^正文+", "^分节阅读+"):
    indexjuan = RedDream.Reddream.str.contains(_pattern)
    RedDream = RedDream[~indexjuan].reset_index(drop=True)
RedDream
# In[9]-In[10]:
## Locate every chapter-title row ("第…回") and keep the titles.
indexhui = RedDream.Reddream.str.match("^第+.+回")
chapnames = RedDream.Reddream.loc[indexhui].reset_index(drop=True)
print(chapnames)
print("--------------------------------------")
# Titles look like "第一回 <left half> <right half>"; split them on spaces.
chapnamesplit = chapnames.str.split(" ").reset_index(drop=True)
chapnamesplit
# In[15]:
## Build a per-chapter summary table from the split titles.
# Each split title yields [chapter label, left half, right half, trailing ""].
Red_df = pd.DataFrame(list(chapnamesplit), columns=["Chapter", "Leftname", "Rightname", "null"])
Red_df
# In[16]:
# Sequential chapter number, derived from the table length rather than a
# hard-coded 121 so the code still works if the chapter count changes.
Red_df["Chapter2"] = np.arange(1, len(Red_df) + 1)
Red_df["ChapName"] = Red_df.Leftname + "," + Red_df.Rightname
## Row index in RedDream where each chapter's title line appears.
Red_df["StartCid"] = indexhui[indexhui].index
## A chapter ends one row before the next chapter's title; the last chapter
## ends at the final row of the text. shift(-1) replaces the original
## slice/reset_index shuffle, and .loc avoids chained assignment.
Red_df["endCid"] = Red_df["StartCid"].shift(-1) - 1
Red_df.loc[Red_df.index[-1], "endCid"] = RedDream.index[-1]
## Paragraph count per chapter (the title row itself is excluded).
Red_df["Lengthchaps"] = Red_df.endCid - Red_df.StartCid
# Placeholder column; filled with each chapter's full text in the next step.
Red_df["Artical"] = "Artical"
# In[17]:
## Assemble each chapter's full text and count its characters.
for ii in Red_df.index:
    # Rows of chapter ii: everything after the title row up to AND INCLUDING
    # endCid. The original exclusive arange stopped at int(endCid), which
    # silently dropped each chapter's final paragraph (off-by-one).
    chapid = np.arange(Red_df.StartCid[ii] + 1, int(Red_df.endCid[ii]) + 1)
    # Join the paragraphs and strip ideographic spaces (U+3000);
    # .loc avoids chained assignment.
    Red_df.loc[ii, "Artical"] = "".join(list(RedDream.Reddream[chapid])).replace("\u3000", "")
## Character count of each chapter.
Red_df["lenzi"] = Red_df.Artical.apply(len)
Red_df
# In[20]:
#### Scatter plot 1: paragraphs vs. characters per chapter, each point
#### labelled with its chapter number.
# Explicit import instead of `from pylab import *` — the wildcard is not
# guaranteed to export the `mpl` name used below.
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable default font
mpl.rcParams['axes.unicode_minus'] = False    # render '-' correctly with that font

plt.figure(figsize=(10, 6))
plt.scatter(Red_df.Lengthchaps, Red_df.lenzi)
# Label every point with its chapter number.
for ii in Red_df.index:
    plt.text(Red_df.Lengthchaps[ii] + 1, Red_df.lenzi[ii], Red_df.Chapter2[ii])
plt.xlabel("章节段数")
plt.ylabel("章节字数")
plt.title("《三国演义》120回")
plt.show()  # was `plt.show` — without the call it is a no-op
# In[21]:
#### Scatter plot 2: same data, each point labelled with the chapter title.
plt.figure(figsize=(10, 6))
plt.scatter(Red_df.Lengthchaps, Red_df.lenzi)
for ii in Red_df.index:
    plt.text(Red_df.Lengthchaps[ii] - 2, Red_df.lenzi[ii] + 100, Red_df.Chapter[ii], size=7)
plt.xlabel("章节段数")
plt.ylabel("章节字数")
plt.title("《三国演义》120回")
plt.show()  # was `plt.show` — without the call it is a no-op
# In[26]:
plt.figure(figsize=(16,12))
plt.subplot(2,1,1)
plt.plot(Red_df.Chapter2,Red_df.Lengthchaps,"ro-",label="段落")
plt.ylabel("章节段数",Fontproperties=font)
plt.title("《三国演义》120回",Fontproperties=font)
##添加平均值
plt.hlines(np.mean(Red_df.Lengthchaps),-5,125,"b