Python 聚类分析LinkedIn用户人脉网络

CODE:

#!/usr/bin/python 
# -*- coding: utf-8 -*-

'''
Created on 2014-8-26
@author: guaguastd
@name: linkedin_network_clusters.py
'''

import os
import sys
import json
from urllib2 import HTTPError
from cluster import KMeansClustering, centroid

# Make the directory that holds the local helper modules importable.
# NOTE(review): combining os.getcwd() with a bare drive component "e:" looks
# fragile -- the resulting path depends on how os.path.join treats the drive
# letter; confirm it resolves to the intended dFile directory.
sys.path.append(os.path.join(os.getcwd(), "e:", "eclipse", "LinkedIn", "dFile"))
# A helper function to munge data and build up an XML tree; it is called at
# the end of the script with the list of placemark dicts.
from mykml import createKML

# Number of clusters requested from the k-means step
K = 3

# get geo code
# g is the geocoder object whose geocode() method is called for each location
# (presumably a Bing-backed geocoder, judging by the helper's name -- verify
# against the local geo module).
from geo import geo_from_bing
g = geo_from_bing()

# Load the data
# Raw strings keep the Windows backslashes literal instead of relying on
# "\e", "\L", "\d" and "\l" happening not to be escape sequences.
CONNECTIONS_DATA = r'E:\eclipse\LinkedIn\dfile\linkedin_connections.json'
OUT_FILE = r"E:\eclipse\LinkedIn\dfile\linkedin_clusters_kmeans.kml"

# Open up your saved connections with extended profile information
# or fetch them again from LinkedIn if you prefer.
# The with-block closes the file as soon as the JSON has been parsed.
with open(CONNECTIONS_DATA) as connections_file:
    connections = json.load(connections_file)['values']

# One location name per connection that carries a 'location' entry
locations = [c['location']['name'] for c in connections if 'location' in c]

# Some basic transforms: (substring, replacement) pairs applied in order to
# each location name before it is sent to the geocoder
transforms = [('Greater ', ''), (' Area', '')]

# Give up once geocoding a single location has failed this many times
MAX_GEOCODE_ERRORS = 3

# Step 1 - Tally the frequency of each location
# coords_freqs maps the untransformed location name to [coordinates, count]
coords_freqs = {}
for location in locations:

    # Avoid unnecessary I/O and geo requests by building up a cache
    if location in coords_freqs:
        coords_freqs[location][1] += 1
        continue

    # Apply every transform first so that each location is geocoded once,
    # using its fully normalized name
    transformed_location = location
    for transform in transforms:
        transformed_location = transformed_location.replace(*transform)

    # Handle potential IO errors with a retry pattern...
    # The error counter is initialized outside the loop; resetting it on
    # every attempt would make the retry limit unreachable.
    num_errors = 0
    while True:
        try:
            results = g.geocode(transformed_location, exactly_one=False)
            print(results)
            break
        except HTTPError as e:
            num_errors += 1
            sys.stderr.write(str(e) + '\n')
            if num_errors >= MAX_GEOCODE_ERRORS:
                # Non-zero status so callers can tell the run failed
                sys.exit(1)
            sys.stderr.write('Encountered an urllib2 error. Trying again...\n')

    # geocode() may hand back None; such locations are left out of the tally
    if results is None:
        continue

    for result in results:
        # Each result is of the form ("Description", (X,Y))
        coords_freqs[location] = [result[1], 1]
        break # Disambiguation strategy is "pick first"

# Step 2 - Build up data structure for converting locations to KML
# expanded_coords holds (label, [(lon, lat), ...]) pairs, with one coordinate
# copy per occurrence of the location.
expanded_coords = []
for label in coords_freqs:
    # Flip lat/lon for Google Earth
    ((lat, lon), freq) = coords_freqs[label]
    expanded_coords.append((label, [(lon, lat)] * freq))

# No need to clutter the map with unnecessary placemarks...
# Built once, after the loop, so it exists even when there are no locations
# and is not recomputed for every label.
kml_items = [{'label': label, 'coords': '%s,%s' % coords[0]}
             for (label, coords) in expanded_coords]

# It would also be helpful to include names of your contacts on the map
for item in kml_items:
    item['contacts'] = '\n'.join(
        ['%s %s.' % (conn['firstName'], conn['lastName'])
         for conn in connections
         if 'location' in conn and
            conn['location']['name'] == item['label']])

# Step 3 - Cluster locations and extend the KML data structure with centroids
# Flatten to a single list of (lon, lat) points; a location appears once per
# occurrence, so frequent locations weigh more in the clustering.
c1 = KMeansClustering([point for (_label, points) in expanded_coords
                       for point in points])

# One extra placemark per cluster, positioned at that cluster's centroid
centroids = [{'label': 'CENTROID', 'coords': '%s,%s' % centroid(cluster)}
             for cluster in c1.getclusters(K)]
kml_items.extend(centroids)

# Step 4 - Create the final KML output and write it to a file
kml = createKML(kml_items)

# The with-block closes the file even if the write raises
with open(OUT_FILE, 'w') as out_file:
    out_file.write(kml)

print('Data written to ' + OUT_FILE)

RESULT:

[Location(Beijing, Beijing, China 39 54m 0.0s N, 116 23m 0.0s E)]
[Location(Beijing, Beijing, China 39 54m 0.0s N, 116 23m 0.0s E)]
None
[Location(CA, United States 37 43m 0.0s N, 122 15m 0.0s W)]
[Location(Birmingham, England, United Kingdom 52 29m 0.0s N, 1 55m 0.0s W), Location(Birmingham, England, United Kingdom 52 27m 0.0s N, 1 43m 0.0s W), Location(Birmingham Airport, England, United Kingdom 52 27m 0.0s N, 1 44m 0.0s W), Location(Birmingham Business Park, England, United Kingdom 52 28m 0.0s N, 1 43m 0.0s W)]
[Location(Birmingham, England, United Kingdom 52 29m 0.0s N, 1 55m 0.0s W), Location(Birmingham, England, United Kingdom 52 27m 0.0s N, 1 43m 0.0s W), Location(Birmingham Airport, England, United Kingdom 52 27m 0.0s N, 1 44m 0.0s W), Location(Birmingham Business Park, England, United Kingdom 52 28m 0.0s N, 1 43m 0.0s W)]
[Location(China 36 33m 0.0s N, 103 59m 0.0s E)]
[Location(China 36 33m 0.0s N, 103 59m 0.0s E)]
[Location(Chengdu, Sichuan, China 30 40m 0.0s N, 104 5m 0.0s E)]
[Location(Chengdu, Sichuan, China 30 40m 0.0s N, 104 5m 0.0s E)]
[Location(Xingtai, Hebei, China 37 4m 0.0s N, 114 29m 0.0s E)]
[Location(Xingtai, Hebei, China 37 4m 0.0s N, 114 29m 0.0s E)]
[Location(United States 39 27m 0.0s N, 98 57m 0.0s W)]
[Location(United States 39 27m 0.0s N, 98 57m 0.0s W)]
[Location(Foshan, Guangdong, China 23 2m 0.0s N, 113 6m 0.0s E)]
[Location(Foshan, Guangdong, China 23 2m 0.0s N, 113 6m 0.0s E)]
Data written to E:\eclipse\LinkedIn\dfile\linkedin_clusters_kmeans.kml


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值