# 7.用图挖掘找到感兴趣的人_用图挖掘实现推荐关注-CSDN博客
#
# 本文链接：https://blog.csdn.net/qq_39124646/article/details/83147282
# -*- coding: utf-8 -*-
"""
Created on Wed Oct  3 09:40:15 2018

@author: asus
"""
#7 用图挖掘找到感兴趣的人

#7.1 加载数据集
#根据社交网络用户的好友信息，向他们推荐好友。逻辑为：如果两个用户有共同好友，那么这两
#个人相似度很高，值得向彼此推荐。

#国外的网站国内禁了

#利用上一章介绍的Twitter API来获取数据，创建一个小的社交网络图。寻找喜欢同一个话题的
#用户，从中选择一部分，再获取这些人的好友列表（他们关注的人）。
import twitter #设置授权令牌
# Twitter API credentials. The "XXXX" values are placeholders that have to be
# replaced with the keys issued for your own Twitter application.
access_token = "XXXX"         # user access token
access_token_secret = "XXXX"  # secret that belongs to the access token
consumer_key = "XXXX"         # application (consumer) key
consumer_secret = "XXXX"      # application (consumer) secret

# Build the OAuth handler and, from it, the API client `t` that the rest of
# the script uses for every request.
authorization = twitter.OAuth(
    token=access_token,
    token_secret=access_token_secret,
    consumer_key=consumer_key,
    consumer_secret=consumer_secret,
)
t = twitter.Twitter(auth=authorization, retry=True)
#指定输出文件名
import os
import json
# Destination file for the collected tweets. The directory is written as a raw
# string so that the backslashes of the Windows path are taken literally
# instead of being parsed as (invalid) escape sequences such as "\P".
output_filename = os.path.join(
    r"E:\books\Python数据挖掘入门与实践\用图挖掘找到感兴趣的人",
    "python_tweets.json")

#获取用户数据
# Containers filled from the search results:
original_users = []  # screen name of the author of tweets[i]
tweets = []          # text of every collected tweet
user_ids = {}        # screen name -> numeric Twitter user id
# Search for tweets that mention "python" and walk through the results.
search_results = t.search.tweets(q="python", count=100)['statuses']
for tweet in search_results:
    if 'text' in tweet:
        author = tweet['user']
        original_users.append(author['screen_name'])
        # The text has to be stored as well: the classifier further down
        # predicts on `tweets`, and its output is matched to `original_users`
        # by position. Without this line `tweets` stays empty.
        tweets.append(tweet['text'])
        user_ids[author['screen_name']] = author['id']

#7.1.1 用现有模型进行分类
#运行上一章的模型,用joblib库保存并加载模型
from sklearn.externals import joblib
# Persist the classifier from the previous chapter with joblib.
# NOTE(review): `model` is not defined in the code shown here; it is expected
# to exist already (the fitted model built in the previous chapter).
# The directory is a raw string so the backslashes of the Windows path are
# not parsed as (invalid) escape sequences.
output_filename2 = os.path.join(
    r"E:\books\Python数据挖掘入门与实践\用图挖掘找到感兴趣的人",
    "python_context.pkl")
joblib.dump(model, output_filename2)  # `model` is the name of the classifier

# The model is read back from the very file it was just written to, so the
# path is reused instead of being spelled out a second time.
model_file = output_filename2
#重建NLTKBOW类
from sklearn.base import TransformerMixin
from nltk import word_tokenize #对句子进行分词
import nltk
nltk.download('punkt')

class NLTKBOW(TransformerMixin):
    """Bag-of-words transformer built on NLTK's word tokenizer.

    The class has to be defined again here so that the saved model, which
    refers to it, can be loaded.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Turn every document in X into a dict of its tokens.

        Returns a list with one dict per document; each dict maps every
        distinct token produced by word_tokenize to True.
        """
        bags = []
        for document in X:
            bags.append(dict.fromkeys(word_tokenize(document), True))
        return bags

#加载模型
from sklearn.externals import joblib       
# Restore the persisted classifier from disk.
context_classifiter = joblib.load(model_file)

# Predict whether each tweet is about the programming language:
# y_pred[i] is 1 when tweets[i] is relevant and 0 otherwise.
y_pred = context_classifiter.predict(tweets)

# Positions of the relevant tweets; the same positions select both the tweet
# texts and their authors, so the two result lists stay aligned.
relevant_positions = [i for i, _ in enumerate(tweets) if y_pred[i] == 1]
relevant_tweets = [tweets[i] for i in relevant_positions]
relevant_users = [original_users[i] for i in relevant_positions]
        
#7.1.2 获取Twitter好友消息
import time
import sys
def get_friends(t, user_id, max_friends=10000, wait_seconds=60,
                rate_limit_wait=5 * 60):
    """Return the ids of the accounts that `user_id` follows.

    Pages through ``t.friends.ids`` (5000 ids per request), starting at
    cursor -1, until the response reports a next cursor of 0, at least
    `max_friends` ids have been collected, or a request fails with
    ``twitter.TwitterHTTPError``.

    Args:
        t: API client exposing
            ``friends.ids(user_id=..., cursor=..., count=...)``.
        user_id: id of the user whose friends are fetched.
        max_friends: stop paging once this many ids have been gathered. The
            last page is kept whole, so the result can be longer than this.
        wait_seconds: pause, in seconds, after every request (successful or
            not).
        rate_limit_wait: additional pause, in seconds, when a request yields
            ``None`` before the same page is requested again.

    Returns:
        A list with the friend ids collected so far (possibly empty).
    """
    friends = []
    cursor = -1
    while cursor != 0:
        try:
            # Request parameters are passed by keyword; a positional user_id
            # is not a request parameter.
            results = t.friends.ids(user_id=user_id, cursor=cursor, count=5000)
            if results is None:
                # No response object: the original code reads this as the
                # API limit being reached, waits and retries the same cursor.
                print("You probably reached your API limit, "
                      "waiting for 5 minutes")
                sys.stdout.flush()
                time.sleep(rate_limit_wait)
                continue
            friends.extend(results['ids'])
            cursor = results['next_cursor']
            if len(friends) >= max_friends:
                break
        except twitter.TwitterHTTPError:
            # The request was rejected; give up and return what was collected.
            break
        finally:
            # Runs after every request, including the last one.
            time.sleep(wait_seconds)
    return friends
        
#7.1.3 构建网络
# Fetch the friend list of every relevant user, keyed by numeric user id.
friends = {}
for screen_name in relevant_users:
    user_id = user_ids[screen_name]
    # A user with several relevant tweets appears several times in
    # relevant_users; one slow, rate-limited lookup per user is enough.
    if user_id in friends:
        continue
    friends[user_id] = get_friends(t, user_id)
# Drop the users for whom no friends were returned. This runs once, after all
# lookups, instead of rebuilding the dictionary on every loop iteration.
friends = {user_id: friend_list for user_id, friend_list in friends.items()
           if friend_list}
#计算每个好友的出现次数
from collections import defaultdict
def count_friends(friends):
    """Count how often each id occurs across all friend lists.

    `friends` maps a user to an iterable of the ids that user follows. The
    result is a defaultdict(int) that maps every id found in any of the lists
    to its number of occurrences.
    """
    tally = defaultdict(int)
    every_followed = (followed for followed_ids in friends.values()
                      for followed in followed_ids)
    for followed in every_followed:
        tally[followed] += 1
    return tally

from operator import itemgetter
# Rank every known friend by how many of the collected users follow them.
friend_count = count_friends(friends)
best_friends = sorted(friend_count.items(), key=itemgetter(1), reverse=True)
# Grow the network until it holds the friend lists of 150 users: each round
# fetches the friends of the highest-ranked user that is not in it yet.
while len(friends) < 150:
    user_id = next((candidate for candidate, _ in best_friends
                    if candidate not in friends), None)
    if user_id is None:
        # Every known friend is already part of the network, so nobody is
        # left to add; stop instead of looping forever.
        break
    # The lookup happens once, for the newly chosen user only.
    friends[user_id] = get_friends(t, user_id)
    # Account for the new user's friends and refresh the ranking.
    for friend in friends[user_id]:
        friend_count[friend] += 1
    best_friends = sorted(friend_count.items(),
                          key=itemgetter(1), reverse=True)
#保存中间结果
import os
import json       
# JSON file holding the intermediate result (user id -> list of friend ids).
# The directory is a raw string so the backslashes of the Windows path are
# not parsed as (invalid) escape sequences.
friends_filename = os.path.join(
    r"E:\books\Python数据挖掘入门与实践\用图挖掘找到感兴趣的人",
    "python_friends.json")
# Uncomment to write a freshly collected network to the file:
#with open(friends_filename, 'w') as outf:
#    json.dump(friends, outf)
# Load the network back from the file. JSON object keys are always strings,
# while the ids inside the friend lists stay integers; the keys are converted
# back to int so that "is this friend one of the users?" checks can match.
with open(friends_filename) as inf:
    friends = {int(user_id): friend_ids
               for user_id, friend_ids in json.load(inf).items()}

#7.1.4 创建图
#顶点表示对象——在这里，顶点表示用户。边表示用户A是用户B的好友。边上两个顶点的顺序是有含义的，
#因此该图为有向图。
import networkx as nx
G = nx.DiGraph()
# Only the friendships among the core users (the keys of `friends`) are put
# into the graph; all other friendships are too numerous to visualise.
# The core users become the nodes.
main_users = friends.keys()
G.add_nodes_from(main_users)
# Then add a directed edge user -> friend whenever the friend is a core user
# too. `friends` is a dict and is iterated directly, not called.
for user_id, friend_list in friends.items():
    for friend in friend_list:
        if friend in main_users:
            G.add_edge(user_id, friend)

%matplotlib inline
# Draw the directed friendship graph with networkx's default settings.
nx.draw(G)
# pyplot, which handles figure creation, can be used to set the figure size.
from matplotlib import pyplot as plt
plt.figure(3,figsize=(20,20))
# Draw the graph again, now with alpha=0.1 and blue edges.
nx.draw(G, alpha=0.1, edge_color='b')

#7.1.5 创建用户相似性图
#向拥有共同好友的用户推荐彼此。
#杰卡德相似系数，规范化方法，用共同好友的数量除以拥有的不同好友的数量
#即两个集合交集除以两个集合并集的元素数量。
import networkx as nx
from matplotlib import pyplot as plt
# Store every user's friends as a set, so that the intersections and unions
# needed by the similarity function can be taken directly.
friends = {user: set(friend_ids) for user, friend_ids in friends.items()}
#相似度函数
def compute_similarity(friends1, friends2):
    """Return the Jaccard similarity of two sets of friends.

    The similarity is the number of shared friends divided by the number of
    distinct friends the two users have together:
    ``len(A & B) / len(A | B)``.

    Args:
        friends1: set of friend ids of the first user.
        friends2: set of friend ids of the second user.

    Returns:
        A float between 0 and 1. Two empty sets have nothing in common and
        yield 0.0 instead of a division by zero.
    """
    union_size = len(friends1 | friends2)
    if union_size == 0:
        return 0.0
    return len(friends1 & friends2) / union_size
#创建加权图
def create_graph(followers, threshold=0):
    """Build the weighted, undirected user-similarity graph.

    Two users are connected when compute_similarity of their friend sets is
    at least `threshold`; the similarity is stored as the edge's ``weight``.

    Args:
        followers: dict mapping each user to the set of that user's friends.
            The graph is built from this argument, not from a global.
        threshold: minimum similarity required for an edge. With the default
            of 0 every pair of users is connected.

    Returns:
        An nx.Graph. Users without any qualifying edge are not nodes of it.
    """
    from itertools import combinations

    G = nx.Graph()
    # The similarity is symmetric, so each unordered pair is looked at once.
    for user1, user2 in combinations(followers, 2):
        weight = compute_similarity(followers[user1], followers[user2])
        if weight >= threshold:
            # add_edge also adds the two end points as nodes when needed.
            G.add_edge(user1, user2, weight=weight)
    return G
%matplotlib inline
# Similarity graph with the default threshold.
G = create_graph(friends)
# Use a somewhat larger figure.
plt.figure(figsize=(10,10))
# Compute node positions with the spring layout.
pos = nx.spring_layout(G)
# Draw the nodes at the positions in `pos`.
nx.draw_networkx_nodes(G, pos)
# Next the edges: collect the weight of every edge ...
edgewidth = [d['weight'] for (u,v,d) in G.edges(data=True)]
# ... and draw each edge with its weight as the line width.
nx.draw_networkx_edges(G, pos, width=edgewidth)
# NOTE(review): this call does not receive `pos`, so it does not reuse the
# layout computed above -- confirm that this additional draw is intended.
nx.draw(G)

#7.2 寻找子图
#7.2.1 连通分支
#一个连通分支是图中由边连接在一起的一组顶点，不要求顶点之间必须两两连接。但是，连通分
#支的任意两个顶点之间，至少存在一条路径。
# Report the connected components for two thresholds; a different threshold
# yields different components. After the loop `G` is the graph of the last
# threshold (0.25).
for threshold in (0.1, 0.25):
    # Keep only the edges whose weight is at least `threshold`.
    G = create_graph(friends, threshold)
    # One subgraph per connected component. Built from connected_components
    # because connected_component_subgraphs no longer exists in current
    # NetworkX releases.
    sub_graphs = (G.subgraph(component)
                  for component in nx.connected_components(G))
    # Print basic information about every component found.
    for i, sub_graph in enumerate(sub_graphs):
        n_nodes = len(sub_graph.nodes())
        print("Subgraph {0} has {1} nodes".format(i, n_nodes))
#获得连通分支及它们的总数量
#本数据适合权重0.1画图。
# The connected components of G (one subgraph each) and how many there are.
# Built from connected_components because connected_component_subgraphs no
# longer exists in current NetworkX releases.
sub_graphs = [G.subgraph(component)
              for component in nx.connected_components(G)]
n_subgraphs = nx.number_connected_components(G)
# Let the figure grow with the number of components.
fig = plt.figure(figsize=(20, (n_subgraphs * 3)))
# Three plots per row. The row count is rounded up so that every component
# gets a cell even when the number of components is not a multiple of 3.
n_rows = (n_subgraphs + 2) // 3
# One layout of the whole graph, computed once and shared by all subplots.
pos = nx.spring_layout(G)
# Draw every component in its own subplot. The add_subplot arguments are the
# number of rows, the number of columns and the position of this plot.
for i, sub_graph in enumerate(sub_graphs):
    ax = fig.add_subplot(n_rows, 3, i + 1)
    # Hide the axes.
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    # Draw only the nodes and edges that belong to this component.
    nx.draw_networkx_nodes(G, pos, nodelist=list(sub_graph.nodes()), ax=ax,
                           node_size=500)
    nx.draw_networkx_edges(G, pos, edgelist=list(sub_graph.edges()), ax=ax)
    
#7.2.2 优化参数选取标准
#确定合适的阈值
import numpy as np
from sklearn.metrics import silhouette_score 
from scipy.optimize import minimize
def compute_silhouette(threshold, friends):
    """Score a similarity threshold by the silhouette coefficient it yields.

    The similarity graph is built for `threshold`, its connected components
    are used as cluster labels, and the labels are scored against the
    distances ``1 - similarity`` between the users.

    Args:
        threshold: minimum edge weight passed on to create_graph.
        friends: dict mapping each user to the set of that user's friends.

    Returns:
        The silhouette coefficient, or -99 when it cannot be computed: the
        graph has fewer than 2 nodes, or the number of components is not in
        the accepted range.
    """
    G = create_graph(friends, threshold=threshold)
    n_nodes = G.number_of_nodes()
    if n_nodes < 2:
        return -99
    components = list(nx.connected_components(G))
    if not (2 <= len(components) < n_nodes - 1):
        return -99
    # Label every node with the index of the component it belongs to.
    label_dict = {node: label
                  for label, component in enumerate(components)
                  for node in component}
    labels = np.array([label_dict[node] for node in G.nodes()])
    # silhouette_score takes a distance matrix here: turn the edge weights
    # (similarities) into distances. Rows/columns follow the order of
    # G.nodes(), the same order `labels` was built in.
    X = 1 - nx.to_numpy_array(G)
    # A node is at distance 0 from itself; a precomputed distance matrix has
    # to carry zeros on its diagonal.
    np.fill_diagonal(X, 0)
    return silhouette_score(X, labels, metric='precomputed')
#轮廓系数取反，打分函数变为损失函数
def invert(func):
    """Wrap `func` so that the wrapper returns the negated result.

    Turns a scoring function (higher is better) into a loss function (lower
    is better). All positional and keyword arguments are forwarded unchanged.
    """
    def negated(*positional, **named):
        score = func(*positional, **named)
        return -score
    return negated
#调用minimize()优化操作
# Run the optimisation with minimize(). The arguments are:
#   fun     - compute_silhouette wrapped by invert(), i.e. the score negated
#             so that it becomes the loss function to be minimised.
#   x0      - 0.1, the initial guess for the threshold.
#   args    - (friends,): the friends dictionary handed to the function that
#             is being optimised.
#   method  - 'nelder-mead', the downhill simplex optimisation method.
#   options - {'maxiter': 10}: run 10 iterations only (more iterations give
#             a better result but also take longer).
result = minimize(
    fun=invert(compute_silhouette),
    x0=0.1,
    args=(friends,),
    method='nelder-mead',
    options={'maxiter': 10},
)
print(result)