Introduction to Data Science Lab: Network Structure and Social Group Evolution Based on Twitter

Analysis and preprocessing

Inspecting the JSON structure

Drag any of the JSON files into the browser and inspect its structure with Chrome's developer tools.
The name field isn't actually needed; nick is unique and only allows word characters (\w), so it serves as a user's unique identifier.
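
For reference, each file looks roughly like the sketch below; the field names are the ones actually read by the extraction code that follows, while the values are made up for illustration.

batch_example = {
    "response": {
        "list": [
            {
                "trackback_author_nick": "example_user",   # unique id, matches \w+
                "trackback_author_name": "Example User",    # display name, not needed
                "trackback_date": 1299822360.0,             # Unix timestamp of the tweet
                "content": "@some_friend are you ok?"       # tweet text, may contain @mentions
            }
        ]
    }
}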

Reading the data iteratively

Reading everything first and processing later wastes memory, so we build a generator with yield instead.

import json
import os

def extract_info(batch):
    plist = batch["response"]["list"]
    for post in plist:
        nick = post['trackback_author_nick']
        name = post['trackback_author_name']
        date = post['trackback_date']
        content = post['content']
        yield nick, date, name, content

def get_infos(fplist):
    f_i = 0
    for fp in fplist:
        fs = os.listdir(fp)
        for filename in fs:
            with open(fp+filename,encoding="utf-8") as f:
                try:
                    batch = json.load(f)
                except Exception:
                    print("{} is a broken json file".format(fp + filename))
                    continue  # skip files that fail to parse instead of reusing a stale batch
            for info in extract_info(batch):
                yield list(info)
            f_i+= 1
    print("read all {} files".format(f_i))

Extracting relations

The network is built from retweet and reply relations: when user A @-mentions user B in the content
field, we consider there to be a link between user A and user B.

So we extract the mentioned nicks with a regular expression.

import re

pattern = re.compile(r'@(\w+)')
def get_at_list(content):
    return pattern.findall(content)
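
For example (the content string here is made up):

get_at_list("RT @Tokyo_Friend: stay safe! cc @jsmith_01")
# -> ['Tokyo_Friend', 'jsmith_01']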

Splitting by time

The tweet dates and the earthquake time (13:46 on March 11, 2011) need a unified format; here everything is converted to Unix timestamps.

import datetime
equake_time = datetime.datetime(2011,3,11,13,46).timestamp()

Write the results to CSV, and save the relation files as well so they can be reloaded later.

Note the data cleaning: tweets are deduplicated using the uniqueness of (nick, date).

Self-@s must also be removed, so that no self-loops appear in the graph later.

One more detail: the nick in info is the lowercase form of the nick inside the @ mention, so the mention has to be lowercased before the self-@ check.

import csv

for c in ['EN', 'JP']:
    key_dic = {}
    at_dic = {'Pre': {}, 'Post': {}}
    with open('data/' + c + 'All' + '.csv', 'w', newline='', encoding='utf-8') as f0:
        writer = csv.writer(f0)
        writer.writerow(['nick', 'date'])
        for info in get_infos(['data/' + c + 'alljson/']):
            ater = info[0]
            date = info[1]
            key = (ater, date)
            if key not in key_dic:  # deduplicate tweets by (nick, date)
                key_dic[key] = 1
                writer.writerow([ater, date])
                for atee in get_at_list(info[3]):
                    atee = atee.lower()  # author nicks are stored lowercase, so normalize the mention too
                    if atee == ater:
                        continue  # skip self-@s to avoid self-loops in the graph
                    i = 'Pre' if date <= equake_time else 'Post'  # split by the earthquake timestamp
                    if (ater, atee) not in at_dic[i]:
                        at_dic[i][(ater, atee)] = 0
                    at_dic[i][(ater, atee)] += 1
                
    
    for type in ['Pre', 'Post']:
        with open('data/' + c + type + '.csv', 'w', newline='', encoding='utf-8') as f1:
            writer = csv.writer(f1)
            writer.writerow(['ater', 'atee', 'count'])
            for item in at_dic[type].items():
                writer.writerow(list(item[0]) + [item[1]])

del key_dic
del at_dic

Building the network

Building the graph

import pandas as pd
import networkx as nx
from tqdm import tqdm_notebook as tqdm

def build_graph(df: pd.DataFrame):
    # each row is (ater, atee, count); the count becomes the edge weight
    g = nx.Graph()
    g.add_weighted_edges_from(df.values.tolist())
    return g

Removing nodes not shared by both graphs

def drop_diff_point(G1: nx.Graph, G2: nx.Graph):
    # keep only the nodes that appear in both graphs, so Pre and Post are comparable
    nodes = list(G1.nodes())
    for node in nodes:
        if not(node in G2):
            G1.remove_node(node)
    nodes = list(G2.nodes())
    for node in nodes:
        if not(node in G1):
            G2.remove_node(node)
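
The same thing can also be written with a set intersection; this is only an equivalent sketch, not the function used below:

def drop_diff_point_alt(G1: nx.Graph, G2: nx.Graph):
    # keep only the nodes present in both graphs
    common = set(G1.nodes()) & set(G2.nodes())
    G1.remove_nodes_from(set(G1.nodes()) - common)
    G2.remove_nodes_from(set(G2.nodes()) - common)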

Loading into memory

dfs = {'EN': {}, 'JP': {}}
nets = {'EN': {}, 'JP': {}}
for c in ['EN', 'JP']:
    for type in ['All', 'Pre', 'Post']:
        dfs[c][type] = pd.read_csv('data/' + c + type + '.csv')
        if type != 'All':
            nets[c][type] = build_graph(dfs[c][type])
    drop_diff_point(nets[c]['Pre'], nets[c]['Post'])

Saving the network models

for c in ['EN', 'JP']:
    for type in ['Pre', 'Post']:
        nx.write_gml(nets[c][type], 'data/' + c + type + '.gml')

Computing the network metrics

Average network degree

import numpy as np

def average_deg(G):
    return np.array([i[1] for i in nx.degree(G)]).mean()
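
As a sanity check, the average degree of an undirected graph also equals 2 * #edges / #nodes, since every edge contributes to two nodes:

def average_deg_check(G):
    # should agree with average_deg(G) on an undirected graph
    return 2 * G.number_of_edges() / G.number_of_nodes()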

Largest connected component

def largest_com(G):
    largest_components = max(nx.connected_components(nx.Graph(G)), key=len)
    return len(largest_components)

Average clustering coefficient

def average_clu(G):
    return nx.average_clustering(G)

Graph diameter

The length of the longest shortest path: nx.diameter(G)

The average shortest path length over all pairs of nodes: nx.average_shortest_path_length(G)

def diameter(G):
    return nx.diameter(G)

def avg_shortest_path_len(G):
    return nx.average_shortest_path_length(G)

However, the graphs are not fully connected, so both calls raise an error (NetworkX requires a connected graph for these metrics).
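
A common workaround is to restrict both computations to the largest connected component (a sketch; this reports the values of that component only, not of the whole graph):

def diameter_of_largest_component(G):
    # induced subgraph on the largest connected component, which is connected by construction
    largest = max(nx.connected_components(nx.Graph(G)), key=len)
    sub = G.subgraph(largest)
    return nx.diameter(sub), nx.average_shortest_path_length(sub)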

Individual-level degree analysis

import matplotlib.pyplot as plt

def add_identity(axes, *line_args, **line_kwargs):
    identity, = axes.plot([], [], *line_args, **line_kwargs)
    def callback(axes):
        low_x, high_x = axes.get_xlim()
        low_y, high_y = axes.get_ylim()
        low = max(low_x, low_y)
        high = min(high_x, high_y)
        identity.set_data([low, high], [low, high])
    callback(axes)
    axes.callbacks.connect('xlim_changed', callback)
    axes.callbacks.connect('ylim_changed', callback)
    return axes

def individualdegree(G1, G2, name, ax):
    nodes1 = G1.nodes()
    nodes2 = G2.nodes()
    degree1 = []
    degree2 = []
    for node in nodes1:
        if node in nodes2:
            degree1.append(G1.degree(node))
            degree2.append(G2.degree(node))
    # plt.scatter(degree1,degree2)#在双对坐标轴上绘制度分布曲线
    # plt.subplot(121)
    plt.title(name)
    plt.xlabel('before')
    plt.ylabel('after')
    plt.loglog(degree1, degree2, 'o', label=name)  # each user's (before, after) degree on log-log axes
    add_identity(ax, ls='--')
    

Cumulative degree analysis

def cumulative_degree_distribution(G):
    # returns {d: number of nodes whose degree is at least d}
    degree = []
    k = G.degree()
    for each_node in k:
        degree.append(each_node[1])
    xs = degree
    distKeys = range(min(xs), max(xs) + 1)
    pdf = dict([(k, 0) for k in distKeys])
    for x in xs:
        pdf[x] += 1
    pdf_temp = pdf.copy()  # keep the raw counts while pdf is turned into cumulative counts
    scope = range(min(pdf), max(pdf) + 1)
    for degree in scope:
        k = degree + 1
        while k <= max(pdf):
            pdf[degree] += pdf_temp[k]
            k += 1
    return pdf
# given (graph, name, degree distribution dict), draw a degree-distribution plot
def draw_degree_chart(G, name, distribution, ax=None):
    # degree=nx.degree_histogram(G)#返回图中所有节点的度分布序列
    degree = distribution
    # print(degree)
    y = np.array(list(degree.values()))
    # print(y)
    # y = [z / float(sum(degree)) for z in degree]
    y = y / y[0]  # normalize by the first entry (the total node count) to turn counts into fractions
    x = list(degree.keys())  # the actual degree values for the x axis
    if ax is None:
        ax = plt
    # y = degree
    if 'e' in name:  # 'Pre' contains an 'e', 'Post' does not
        color = 'lightsteelblue'
        marker = 'o'
    else:
        color = 'lightsalmon'
        marker = '^'
    line = ax.loglog(x, y, color=color, marker=marker, linestyle='', label=name)  # cumulative degree distribution on log-log axes
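
As a quick check of what cumulative_degree_distribution returns, here is a toy example (nx.star_graph is used purely for illustration): the dict maps each degree d to the number of nodes whose degree is at least d.

toy = nx.star_graph(3)  # one hub of degree 3, three leaves of degree 1
cumulative_degree_distribution(toy)
# -> {1: 4, 2: 1, 3: 1}: all 4 nodes have degree >= 1, only the hub has degree >= 2 and >= 3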
    

Analysis results

Comparison of basic network statistics

from prettytable import PrettyTable
table = PrettyTable([' ', '  ', '#users', '#tweets', '#links', 
                     '#nodes', 'avg_deg', 'largest_com', 'avg_clustering'])
for c in ['JP', 'EN']:
    for type in ['Pre', 'Post']:
        dff = dfs[c]['All']
        if type == 'Pre':
            df = dff[dff['date'] <= equake_time]
        else:
            df = dff[dff['date'] > equake_time]
        n_usrs = len(df['nick'].unique())
        n_tws = len(df)
        n_links = dfs[c][type]['count'].sum()
        n_nodes = nets[c][type].number_of_nodes()
        d = average_deg(nets[c][type])
        s = largest_com(nets[c][type])
        clu = average_clu(nets[c][type])
        table.add_row([c, type, n_usrs, n_tws, n_links, n_nodes, round(d/2, 3), s, round(clu/2, 3)])
print(table)

Because the @ relation is directed, I don't think degree and clustering coefficient can be taken directly from the undirected-graph computation, so I divide them by 2.
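
If one wanted to keep the @ relation genuinely directed instead of halving, a sketch of an alternative (not what is done above) is to build an nx.DiGraph from the same edge list and read the out-degrees directly:

def directed_average_deg(df):
    # df holds (ater, atee, count) rows, as in the Pre/Post csv files
    dg = nx.DiGraph()
    dg.add_weighted_edges_from(df.values.tolist())
    return sum(d for _, d in dg.out_degree()) / dg.number_of_nodes()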

Another slightly odd point: neither the paper nor the teacher's template deduplicates users, so I deduplicated them here.

+----+------+--------+---------+--------+--------+---------+-------------+--------------+
|    |      | #users | #tweets | #links | #nodes | avg_deg | largest_com | avg_cluster  |
+----+------+--------+---------+--------+--------+---------+-------------+--------------+
| JP | Pre  |  4000  |  39347  | 25738  |  5467  |  1.602  |     4392    |     0.035    |
| JP | Post |  5383  |  102669 | 90825  |  5467  |  3.390  |     4949    |     0.045    |
| EN | Pre  |  3887  |  44124  | 29215  |  4922  |  1.338  |     4024    |     0.044    |
| EN | Post |  4436  |  57462  | 38099  |  4922  |  1.500  |     4204    |     0.048    |
+----+------+--------+---------+--------+--------+---------+-------------+--------------+

Individual-level degree analysis

for c in ['JP', 'EN']:
    f, ax = plt.subplots(figsize=(6,6))
    individualdegree(nets[c]['Pre'], nets[c]['Post'], c, ax)

[Figure: before/after degree scatter plots for JP and EN (log-log), with the identity line]

To make the before/after change easier to see, I added the identity line.

At the individual level, more of Japan's points lie above the line (y > x) after the earthquake, indicating that individuals' degrees increased, i.e. people gained more connections; the EN network is comparatively stable, with little overall change.

Cumulative degree analysis

for c in ['JP', 'EN']:
    for type in ['Pre', 'Post']:
        draw_degree_chart(nets[c][type], type, cumulative_degree_distribution(nets[c][type]))
    plt.legend()
    plt.title(c)
    plt.show()
    

[Figure: cumulative degree distributions before and after the earthquake, for JP and EN]

The earthquake evidently had little effect on the US (EN) network, while for Japan the cumulative degrees increased across the board.

Gephi analysis

Network statistics

Community detection and rendering

Community analysis

from prettytable import PrettyTable
from networkx.algorithms.community import k_clique_communities

table = PrettyTable(['Region', 'Period', 'community_num', 'max_com_size'])
cliques = {'JP': {'Pre': None, 'Post': None}, 'EN': {'Pre': None, 'Post': None}}
for c in ['JP', 'EN']:
    for type in ['Pre', 'Post']:
        clique = k_clique_communities(nets[c][type],4)
        clique = list(clique)
        clique_size = [len(cl) for cl in clique]
        cliques[c][type] = clique
        table.add_row([c, type, len(clique), max(clique_size)])
print(table)

+--------+--------+---------------+--------------+
| Region | Period | community_num | max_com_size |
+--------+--------+---------------+--------------+
|   JP   |  Pre   |       20      |     218      |
|   JP   |  Post  |       35      |     711      |
|   EN   |  Pre   |       27      |      97      |
|   EN   |  Post  |       34      |      73      |
+--------+--------+---------------+--------------+

From the table, the EN community count grows while its largest community even shrinks slightly, which looks like an ordinary evolution; for JP, the largest community more than triples in size (218 → 711), which indirectly reflects that Japanese social ties became much more tightly clustered after the earthquake. The rendered graphs show this even more intuitively.

Sankey diagram

Also called an alluvial diagram. Looking around for libraries, almost all of them are written in R; the one nice exception is floweaver, so that is what I used:
https://github.com/ricklupton/floweaver

from floweaver import *

from ipysankeywidget import SankeyWidget

c = 'JP'
# c = 'EN'
clique_dic = {'Pre':{}, 'Post': {}}
for type in ['Pre', 'Post']:
    cliques[c][type].sort(reverse=True, key=len)
    for i in range(20):
        for name in cliques[c][type][i]:
            clique_dic[type][name] = i + 1
df1 = pd.DataFrame.from_dict(clique_dic['Pre'], orient='index', columns=['source'])
df2 = pd.DataFrame.from_dict(clique_dic['Post'], orient='index', columns=['target'])
df3 = df1.join(df2, how='inner').astype('Int64')
df3 = df3.reset_index()
df3.columns = ['source', 'pre', 'post']  # 'source' holds the nick; 'pre'/'post' are its community indices
df3['type'] = 1
df3['value'] = 1
df3['target'] = df3['source']  # each flow is user -> same user; the partitions below group users by community

size = dict(width=888, height=666)
nodes = {
    'PreTop10': ProcessGroup(df3['source'].tolist()),
    'PostTop10': ProcessGroup(df3['source'].tolist()),
}
ordering = [['PreTop10'], ['PostTop10']]
bundles = [Bundle('PreTop10', 'PostTop10'),]
sdd = SankeyDefinition(nodes, bundles, ordering)

pre_partition = Partition.Simple('process',
                            [(i, df3[df3.pre==i].source.tolist()) for i in list(range(100))[1:]])
post_partition = Partition.Simple('process', 
                            [(i, df3[df3.post==i].source.tolist()) for i in list(range(100))[1:]])
nodes['PreTop10'].partition = pre_partition
nodes['PostTop10'].partition = post_partition

weave(sdd, df3[['source', 'target', 'type', 'value']]).to_widget(**size)

In Japan's Sankey diagram, the users of nearly every community flow together, and very few communities split apart; it again shows the increase in community cohesion.

In the EN Sankey diagram, the merging of community users is weaker than in Japan, which can be seen from the growth and the source composition of the largest community; in addition, the post-earthquake EN communities that do not appear in the diagram are mostly made up of previously isolated users (they are absent from the sources on the left because they did not form communities before the earthquake), and the community changes are simpler overall.
