
import numpy as np
import pandas as pd
import os
## 模块准备
# 加载需要模块
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import jieba
from wordcloud import WordCloud,ImageColorGenerator
from imageio import imread
import nltk
from nltk.cluster import cosine_distance,KMeansClusterer
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram,ward
from scipy.spatial.distance import pdist,squareform
from sklearn.manifold import MDS
from gensim import corpora, models
# 加载绘制社交网络图的包
import network as nx

  1. 设置基本属性,设置字体,pandas显示方式,显示图像的方式,参考代码如下:
# 设置字体
font =FontProperties(fname ='simkai.ttf', size =14)
# 设置pandas显示方式
pd.options.mode.chained_assignment =None# default='warn'
# 设置图像显示方式
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%config InlineBackend.figure_format = 'retina'


1mary sue ,同人中那种完美的女主角。

8995 rows × 1 columns



def stopwordslist(filepath):
    """Load a stop-word list from a UTF-8 text file, one word per line.

    Args:
        filepath: Path of the stop-word file.

    Returns:
        A list with one entry per line of the file, in file order, each
        stripped of leading/trailing whitespace (blank lines become '').

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle; the original left it open.
    with open(filepath, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]
# Load the stop words once at module level from 'stopping.txt'.
stopwords = stopwordslist('stopping.txt')
# Test whether the Book1 column of `book` holds any missing values.
# NOTE(review): the body of this `if` is missing from this export (the next
# line is cell output), and `book` is not defined in the visible source --
# restore both from the original notebook before running.
if np.sum(pd.isnull(book.Book1))!=0:
['阿', '哎', '哎呀', '哎哟', '唉', '俺', '俺们', '按', '按照']

2. 找出章节的头部索引和尾部索引


def chapterclean(df):
    """Split the book into chapters and collect per-chapter statistics.

    A row is a chapter title when it matches ``^ˇ+.+ˇ`` (text wrapped in
    'ˇ' markers).  Every row between one title and the next belongs to the
    former chapter.

    Args:
        df: Series of text rows, one paragraph per row.  Assumes an integer
            index increasing by one per row (row arithmetic ``start + 1`` /
            ``next_start - 1`` is done on the labels) -- TODO confirm.

    Returns:
        DataFrame with one row per chapter and the columns
        ``ChapterName`` (title without the 'ˇ' markers), ``StartCid`` (label
        of the title row), ``EndCid`` (label of the chapter's last row),
        ``Lengthchaps`` (``EndCid - StartCid``), ``Artical`` (chapter text
        with '\\u3000' removed) and ``WordNum`` (length of ``Artical``).
    """
    # na=False: a missing row can never be a title.
    is_title = df.str.match('^ˇ+.+ˇ', na=False)
    titles = df[is_title]
    starts = list(titles.index)

    # A chapter ends on the row before the next title; the last chapter ends
    # on the last row of the input.  The original read the global `book`
    # here instead of the `df` argument.
    ends = [next_start - 1 for next_start in starts[1:]]
    if starts:
        ends.append(df.index[-1])

    newchap = pd.DataFrame({
        'ChapterName': [title.replace('ˇ', '') for title in titles],
        'StartCid': starts,
        'EndCid': ends,
    })
    newchap['Lengthchaps'] = newchap.EndCid - newchap.StartCid
    # .loc slicing is end-inclusive, so the row at EndCid is part of the
    # text.  The original used np.arange(start + 1, end), which silently
    # dropped the last paragraph of every chapter although Lengthchaps
    # counts it.
    newchap['Artical'] = [
        ''.join(df.loc[start + 1:end]).replace('\u3000', '')
        for start, end in zip(starts, ends)
    ]
    newchap['WordNum'] = newchap.Artical.apply(len)
    return newchap

# Build the per-chapter table from the Book1 column of `book`
# (`book` is not defined in the visible source).
newchap = chapterclean(book.Book1)
      ChapterName  StartCid  EndCid  Lengthchaps  \
0           所谓玛丽苏         0     8.0          8.0   
1    余周周小朋友的个人秀之一         9    78.0         69.0   
2    余周周小朋友的个人秀之二        79   161.0         82.0   
3             小飞虫       162   221.0         59.0   
..            ...       ...     ...          ...   
104     你的资格,我的考试      8511  8605.0         94.0   
105      泯然众人间的幸福      8606  8773.0        167.0   
106        再见,旧时光      8774  8940.0        166.0   
107  尾声:年年有余,周周复始      8941  8994.0         53.0   

                                               Artical  WordNum  
0    mary sue ,同人中那种完美的女主角。其含义就是制造一个原作中不存在的女孩,与故事里的...      660  
1    “你……你怎么样?你流了好多血!”“西米克,这个瓶子,你先拿走!”“不要,我不要丢下你,我不...     2209  
2    “无论怎样,我都不会把圣蛋交给你的!”雅典娜坚贞不屈,高昂着头,任长在背后飘啊飘。余周周版雅...     3627  
3    余周周常说,奔奔这个名字很好。那时候电视上正在播放一部动画片,里面的主角是一辆长得像碰碰车的...     3309  
..                                                 ...      ...  
104  “你没必要一周来一次的。”米乔靠在病床上啃苹果,她终于稳定下来了,不再吃什么吐什么。十月份的...     3042  
105  考试结束铃打响的时候,余周周“腾”地站起身。辛锐有那么一秒钟觉得余周周要冲上来撕了她——她从...     6575  
106  余周周很久之后才知道,其实在奔奔不再是奔奔,却也还不是慕容沉樟的时候,他的大名叫做冀希杰,应...     5186  
107  “乖,来,不理爸爸,来找小姑姑玩!”余周周拍拍手,余思窈就白了她爸爸余乔一眼,扭着屁股投入到...     1660  

[108 rows x 6 columns]


def chapterplot(newchap):
    """Plot the character count of every chapter as a line chart.

    Draws on a new 12x10 inch figure and uses the module-level ``font`` for
    the Chinese title and axis labels.  Nothing is returned.

    Args:
        newchap: Chapter table; its index (plus one) is used as the x value
            and its ``WordNum`` column as the y value.
    """
    plt.figure(figsize=(12, 10))
    plt.plot(newchap.index + 1, newchap.WordNum, "bo-", label="段落")
    plt.title('《你好,旧时光》', fontproperties=font)
    plt.xlabel("章节", fontproperties=font)
    # The original first set the y label to '章节段数' and overwrote it two
    # lines later; only the label that actually ends up on the plot is kept.
    plt.ylabel("章节字数", fontproperties=font)



def delete_punctuation(df):
    """Strip everything except characters in U+4E00..U+9FA5 from each text.

    Args:
        df: Iterable of strings (the callers pass a pandas Series).

    Returns:
        DataFrame with a single column ``value`` and a fresh 0..n-1 index;
        row i holds the concatenated matching runs of the i-th input text
        ('' when the text has none).
    """
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    # Iterate the values directly: the original looked rows up by the labels
    # 0..n-1 (failing on any other index) and grew the frame one .loc row at
    # a time, which is quadratic.
    cleaned = [''.join(pattern.findall(text)) for text in df]
    return pd.DataFrame({'value': cleaned}, columns=['value'])

def seg_sentence(sentence):
    """Tokenize every document with jieba and drop the stop words.

    Args:
        sentence: Iterable of strings, one document per element.

    Returns:
        A list with one token list per input document, tokens in the order
        jieba produced them, without any token found in the module-level
        ``stopwords``.
    """
    # NOTE(review): the export lost the lines that appended to `words` and
    # `outstr` (the `if` had no body).  This restores the behaviour implied
    # by the surviving lines and by the stored cell output, which shows one
    # token list per chapter -- confirm against the original notebook.
    # A set makes each membership test O(1); the original rebuilt
    # list(stopwords) for every single token.
    stop_set = set(stopwords)
    return [
        [word for word in jieba.cut(document) if word not in stop_set]
        for document in sentence
    ]
# Tokenize each chapter's text (after delete_punctuation has cleaned it) and
# store the resulting token list per chapter.
newchap['Cutword'] = seg_sentence(delete_punctuation(newchap.Artical).value)
# 1-based chapter number.
newchap['Chapter_num'] = range(1,len(newchap)+1)
# print(newchap['Chapter_num'])
0      [同人, 那种, 完美, 女主角, 含义, 制造, 原作, 女孩, 故事, 美少年, 恋爱,...
1      [流, 好多, 血, 西米克, 瓶子, 先, 拿走, 丢下, 快快, 时间, 余周周, 卧倒...
2      [无论怎样, 圣蛋, 交给, 雅典娜, 坚贞不屈, 高昂, 头, 任长, 背后, 飘, 飘,...
3      [余周周, 常说, 奔奔, 名字, 电视, 播放, 一部, 动画片, 主角, 一辆, 长得,...
104    [一周, 米乔, 病床, 啃, 苹果, 终于, 稳定下来, 吃, 吐, 十月份, 天空, 明...
105    [考试, 结束, 铃, 打响, 余周周, 腾地, 起身, 辛锐, 一秒钟, 余周周, 要冲,...
106    [余周周, 久, 奔奔, 奔奔, 慕容, 沉樟, 大名, 冀希杰, 酒鬼, 养父, 冠名, ...
107    [乖来, 不理, 爸爸, 找, 小姑姑, 玩, 余周周, 拍拍手, 余思窈, 白, 爸爸, ...
Name: Cutword, Length: 108, dtype: object


def wordstotal(newchap):
    """Count how often every word occurs across all chapters.

    Args:
        newchap: Chapter table with a ``Cutword`` column holding one list of
            tokens per chapter.

    Returns:
        DataFrame with the columns ``Word``, ``number`` (occurrences over
        all chapters) and ``Wordlen`` (characters in the word), restricted
        to words shorter than 5 characters and sorted by ``number`` in
        descending order.
    """
    # Flatten the per-chapter token lists (also fine when there are none,
    # where np.concatenate would raise).
    textwords = [word for chapter in newchap.Cutword for word in chapter]
    words_df = pd.DataFrame({'Word': textwords})
    # groupby(...).size() replaces agg([('number', np.size)]); passing numpy
    # callables to agg is deprecated in recent pandas.
    words_stat = words_df.groupby('Word').size().reset_index(name='number')
    words_stat['Wordlen'] = words_stat.Word.str.len()
    # Keep words of fewer than 5 characters (i.e. drop length >= 5).
    words_stat = words_stat.loc[words_stat.Wordlen < 5, :]
    # One stable sort is enough; the original sorted twice.
    return words_stat.sort_values(by='number', ascending=False, kind='mergesort')
# Word-frequency table over all chapters.
words_stat =wordstotal(newchap)

18658 rows × 3 columns


 def wordsplot(words_stat):
    # 筛选数据
    newdata = words_stat.loc[words_stat.number >250]
    # 绘制直方图
    newdata.plot(kind ='bar', x ='Word', y ='number', figsize =(10, 7))
    plt.xticks(fontproperties =font, size =9)
    plt.xlabel('关键词', fontproperties =font)
    plt.ylabel('频数', fontproperties=font)
    plt.title("《你好,旧时光》", fontproperties =font)



def wcplot(words_stat):
    """Render a word cloud of the word frequencies, shaped by '7.jpg'.

    The picture '7.jpg' serves both as the mask of the cloud and as the
    source of the colours the words are recoloured with.  The result is
    shown with ``plt.imshow`` on a new 15x10 inch figure; nothing is
    returned.

    Args:
        words_stat: Word-frequency table with the columns ``Word`` and
            ``number``.
    """
    # word -> frequency mapping for generate_from_frequencies
    freq_by_word = dict(zip(words_stat.Word, words_stat.number))
    # background picture
    mask_image = imread('7.jpg')

    cloud_builder = WordCloud(
        font_path='msyhl.ttc',
        margin=5,
        width=6000,
        height=4000,
        background_color='white',
        max_words=500,       # upper bound on the number of words drawn
        mask=mask_image,     # background picture used as mask
        random_state=42,
    )
    cloud = cloud_builder.generate_from_frequencies(frequencies=freq_by_word)
    palette = ImageColorGenerator(mask_image)

    # draw the recoloured cloud
    plt.figure(figsize=(15, 10))
    plt.imshow(cloud.recolor(color_func=palette))


# Character (role) table read from 'role.txt' without a header row, so the
# first column is labelled 0 (presumably one character name per line --
# verify against the file).
role =pd.read_table('role.txt', sep ='\n\n', header =None, engine='python')
# Tokenize every row of book.Book1 (one token list per row).
newcharacter =seg_sentence(delete_punctuation(book.Book1).value)

def weightcaculate(newcharacter, role):
    """Count how often two characters are mentioned in the same paragraph.

    Args:
        newcharacter: Iterable of token lists, one list per paragraph.
        role: Table whose column ``0`` (``role[0]``) lists the canonical
            character names; tokens outside that list are ignored.

    Returns:
        Dict of dicts: ``relationships[a][b]`` is the co-occurrence count of
        characters ``a`` and ``b``.  Counts are symmetric; a character that
        is mentioned several times in one paragraph contributes once per
        mention.  Characters that never share a paragraph with anyone map
        to an empty dict.
    """
    # Alternative names -> canonical name.  One lookup is equivalent to the
    # original chain of list(map(lambda ...)) passes because no canonical
    # name is itself an alias.  The original's second '周周' rule (-> 奔奔)
    # could never fire, '周周' having already become 余周周.
    aliases = {
        '周周': '余周周', '小姑姑': '余周周', '姑姑': '余周周',
        '乔帮主': '米乔', '帮主': '米乔',
        '慕容沉樟': '奔奔', '冀希杰': '奔奔',
        '茜茜': '凌翔茜',
        '辛美香': '辛锐',
        '彦一': '郑彦一',
        '武老师': '武文陆', '老武': '武文陆',
        '然然': '周沈然',
        '潘主任': '潘元胜', '小潘': '潘元胜',
        '瑶瑶': '何瑶瑶',
    }
    known_roles = set(role[0])  # built once instead of list(role[0]) per token

    relationships = {}  # character -> {other character: count}
    lineNames = []      # characters mentioned in each paragraph
    for paragraph in newcharacter:
        mentioned = []
        for token in paragraph:
            word = aliases.get(token, token)
            if word in known_roles:
                # The export never filled lineNames, so the pair loop below
                # had nothing to iterate over.
                mentioned.append(word)
                relationships.setdefault(word, {})
        lineNames.append(mentioned)

    for line in lineNames:          # every paragraph
        for name1 in line:
            for name2 in line:      # every ordered pair in the paragraph
                if name1 == name2:
                    continue
                # First co-occurrence starts at 1, later ones add 1 (both
                # branch bodies were missing in the export).
                relationships[name1][name2] = relationships[name1].get(name2, 0) + 1
    return relationships
# Co-occurrence counts for every pair of characters.
relationships = weightcaculate(newcharacter, role)
# Wide table: one column per outer key of `relationships`, one row per inner
# key; pairs that never co-occur are NaN.
test = pd.DataFrame(relationships)

# Long table of every (First, Second, Weight) entry that is not NaN, in
# row-major order.  Collected in a list and turned into a frame once:
# DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
df = pd.DataFrame(
    [
        (first, second, test.at[first, second])
        for first in test.index
        for second in test.columns
        if pd.notna(test.at[first, second])
    ],
    columns=["First", "Second", "Weight"],
)

# Keep one direction per pair, pairing every name only with the names that
# come after it.  The original hard-coded 17 names and exactly 60 resulting
# rows; both now follow from the data.
names = list(df["First"].unique())
pairs = []
for position, First in enumerate(names):
    for Second in names[position + 1:]:
        if Second not in test.columns:
            continue
        Weight = test.at[First, Second]
        if pd.notna(Weight):
            pairs.append((First, Second, Weight))
rel_stat = pd.DataFrame(pairs, columns=["First", "Second", "Weight"])

# Scale the raw counts by the constant 600 used by the original notebook.
rel_stat['DWeight'] = rel_stat['Weight'].astype('float') / 600
rel_stat['DWeight'].plot(kind = 'hist')
## 绘制人物关系图
def characterplot(rel_stat):
    """Draw the character relationship network.

    Every row of ``rel_stat`` becomes an edge.  Edges are drawn in three
    styles according to their weight (> 0.2, (0.07, 0.2], <= 0.07) and node
    size grows with the node's degree.  The figure is shown with
    ``plt.show()``; nothing is returned.

    Args:
        rel_stat: Table with the columns ``First`` and ``Second`` (the two
            characters of an edge) and ``DWeight`` (edge weight).
    """
    # The file-level `import network as nx` binds a different module, which
    # has no Graph class (see the AttributeError traceback); the calls below
    # are networkx API, so import it locally.
    import networkx as nx

    plt.figure(figsize=(12, 12))
    G = nx.Graph()
    for first, second, weight in zip(rel_stat.First, rel_stat.Second, rel_stat.DWeight):
        G.add_edge(first, second, weight=weight)

    # Split the edges into strong / medium / weak by weight.
    elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > 0.2]
    emidle = [(u, v) for (u, v, d) in G.edges(data=True) if 0.07 < d['weight'] <= 0.2]
    esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] <= 0.07]

    # Layout: positions for all nodes.
    pos = nx.fruchterman_reingold_layout(G)
    # Node size from the node degree, in the graph's node order.
    node_sizes = [50 + degree * 70 for _, degree in G.degree()]
    nx.draw_networkx_nodes(G, pos, alpha=0.6, node_size=node_sizes)
    # edges
    nx.draw_networkx_edges(G, pos, edgelist=elarge, width=3, alpha=0.9, edge_color='greenyellow')
    nx.draw_networkx_edges(G, pos, edgelist=emidle, width=2, alpha=0.6, edge_color='yellow')
    nx.draw_networkx_edges(G, pos, edgelist=esmall, width=1, alpha=0.3, edge_color='blue', style='dashed')
    # labels
    nx.draw_networkx_labels(G, pos, font_size=10, font_family='KaiTi')
    # The keyword is `fontproperties`; the original's `FontProperties=` is
    # not a valid text property.
    plt.title("《你好,旧时光》人物关系", fontproperties=font)
    plt.show()

AttributeError                            Traceback (most recent call last)

Input In [25], in <cell line: 29>()
     27     plt.title("《你好,旧时光》人物关系", FontProperties = font)
     28     plt.show() # display
---> 29 characterplot(rel_stat)

Input In [25], in characterplot(rel_stat)
      3 plt.figure(figsize = (12, 12))
      4 #生成社交网络图
----> 5 G = nx.Graph()
      6 G.clear() #将图上元素清空
      7 for ii in rel_stat.index:

AttributeError: module 'network' has no attribute 'Graph'

<Figure size 864x864 with 0 Axes>


# One space-separated string per chapter, the input format CountVectorizer
# expects.  The original appended to `articals` without ever creating it
# (NameError).
articals = [" ".join(cutwords) for cutwords in newchap.Cutword]

# Term counts -> TF-IDF weights.
# NOTE(review): CountVectorizer's default token pattern keeps only tokens of
# two or more characters, so single-character words are dropped here --
# confirm that this is intended.
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(articals))
print(tfidf)

# Dense document-term matrix: one row per chapter.
dtm = tfidf.toarray()
  (0, 17320)	0.08137094931580552
  (0, 16881)	0.058802010025752284
  (0, 16346)	0.025465471435741963
  (0, 15510)	0.07476995773293878
  (0, 15471)	0.08137094931580552
  (0, 15216)	0.07476995773293878
  (0, 14958)	0.07476995773293878
  (0, 14898)	0.4268313529203993
  (0, 14855)	0.038999035277152046
  (0, 14685)	0.08137094931580552
  (0, 14635)	0.07476995773293878
  (0, 14607)	0.08137094931580552
  (0, 14380)	0.08137094931580552
  (0, 14044)	0.03142289905051851
  (0, 14014)	0.042333098226247645
  (0, 13787)	0.07476995773293878
  (0, 13778)	0.04653056860356843
  (0, 13714)	0.08137094931580552
  (0, 13618)	0.043090446332163676
  (0, 13392)	0.055169222211114595
  (0, 13038)	0.058802010025752284
  (0, 12630)	0.08137094931580552
  (0, 12589)	0.06348548808791216
  (0, 12396)	0.08137094931580552
  (0, 12390)	0.08137094931580552
  :	:
  (107, 1651)	0.044905434303681
  (107, 1550)	0.041592651764436474
  (107, 1505)	0.0575564263390265
  (107, 1433)	0.014515556453089301
  (107, 1412)	0.05288732159097601
  (107, 1354)	0.03361076447714147
  (107, 1312)	0.0575564263390265
  (107, 1244)	0.0575564263390265
  (107, 1106)	0.0575564263390265
  (107, 980)	0.09195089336895704
  (107, 864)	0.044905434303681
  (107, 759)	0.0575564263390265
  (107, 710)	0.04957453905173149
  (107, 530)	0.0575564263390265
  (107, 481)	0.02095977244179595
  (107, 444)	0.0575564263390265
  (107, 365)	0.01898084297406993
  (107, 359)	0.023649947722120427
  (107, 352)	0.04700494528372696
  (107, 326)	0.018780569592096952
  (107, 297)	0.036780357347582816
  (107, 251)	0.02676245687939197
  (107, 232)	0.028019933595394574
  (107, 121)	0.03902305799643194
  (107, 106)	0.0147930501023435

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.02688032, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])



['一丁点儿', '一万', '一万个', '一上午', '一下一下', '一下下', '一下半', '一下头', '一下子']

F:\python_anaconda\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
  warnings.warn(msg, category=FutureWarning)
# K-means with 3 clusters on the TF-IDF rows, using cosine distance.
kmeans = KMeansClusterer(num_means=3, distance=nltk.cluster.util.cosine_distance)
# Fit the cluster means first: classify() needs them, and the export lost
# this step (the clusterer was created and then queried directly).
kmeans.cluster(dtm)

# Cluster id of every chapter.
labpre = [kmeans.classify(i) for i in dtm]
# .copy() so that adding a column does not write into a slice of newchap.
kmeanlab = newchap.loc[:, ["ChapterName"]].copy()
kmeanlab["cosd_pre"] = labpre

# Number of chapters per cluster.
count = kmeanlab.groupby("cosd_pre").count()
count = count.reset_index()


# Project the TF-IDF rows to 2-D with multidimensional scaling.
mds = MDS(n_components=2, random_state=123)
coord = mds.fit_transform(dtm)

plt.figure(figsize=(8, 8))
# Colour every chapter by its k-means cluster id.
plt.scatter(coord[:, 0], coord[:, 1], c=kmeanlab.cosd_pre)
# Label every point with its chapter number; iterate the data itself
# instead of the hard-coded 108 chapters.
for (x, y), chapter_no in zip(coord, newchap.Chapter_num):
    plt.text(x + 0.02, y, s=str(chapter_no))
plt.title("K-means MDS")
(108, 2)



# Project the TF-IDF rows onto the first two principal components.
pca = PCA(n_components=2)
coord = pca.fit_transform(dtm)

plt.figure(figsize=(8, 8))
# One scatter call per k-means cluster so that each gets a legend entry.
for cluster_id, cluster_label in enumerate(('first', 'second', 'third')):
    in_cluster = (kmeanlab.cosd_pre == cluster_id).to_numpy()
    plt.scatter(coord[in_cluster, 0], coord[in_cluster, 1], label=cluster_label)
# `loc=` is required: a bare string as first argument is taken as the list
# of labels, not as the legend position.
plt.legend(loc='upper right')
# Label every point with its chapter number; iterate the data itself
# instead of the hard-coded 108 chapters.
for (x, y), chapter_no in zip(coord, newchap.Chapter_num):
    plt.text(x + 0.02, y, s=str(chapter_no))
plt.xlabel("主成分1", fontproperties=font)
plt.ylabel("主成分2", fontproperties=font)
plt.title("K-means PCA")
[0.04479856 0.03542421]
(108, 2)



# Chapter names label the leaves of the dendrogram.
labels = newchap.ChapterName.values

# Pairwise cosine distances between chapters, in condensed form.
cosin_dist = pdist(dtm, 'cosine')
# Square form, kept under its original name for inspection.
cosin_matrix = squareform(cosin_dist)

# ward() expects the condensed distances.  The original passed the square
# matrix, which SciPy interprets as observation vectors (one 108-dimensional
# point per chapter) rather than as distances.
ling = ward(cosin_dist)
fig, ax = plt.subplots(figsize=(10, 15))
# NOTE: as in the original, `ax` is rebound to the dict dendrogram() returns.
ax = dendrogram(ling, orientation='right', labels=labels)
plt.yticks(fontproperties=font, size=8)
plt.title("《你好旧时光》各章节层次聚类", fontproperties=font)


