Python 查询Google+相似文档


# -*- coding: utf-8 -*-

'''
Created on 2014-9-10
@author: guaguastd
'''

# Finding similar documents using cosine similarity
import json
import nltk.cluster.util

# Load in human language data from wherever you've saved it
DATA = r'E:\eclipse\Google\dFile\107033731246200681024.json'

# Parse the file exactly once and close the handle when done. Only keep
# posts whose content is longer than 1000 characters (len() of the content
# string counts characters, not words), so each document has enough text to
# compare.
with open(DATA) as data_file:
    data = [post for post in json.load(data_file)
            if len(post['object']['content']) > 1000]

# Tokenize each post: lowercase the content and split it on whitespace.
all_posts = [post['object']['content'].lower().split()
             for post in data]

# Provides tf, idf, and tf_idf abstractions for scoring
tc = nltk.TextCollection(all_posts)

# Compute a term-document matrix: td_matrix maps (title, url) to a dict of
# {term: tf-idf score} for every distinct term in that post.
td_matrix = {}
for (entry, post) in zip(data, all_posts):
    # FreqDist is used only for its keys, i.e. the distinct terms of the post.
    fdist = nltk.FreqDist(post)

    doc_title = entry['title']
    url = entry['url']

    # Iterate the FreqDist directly rather than calling .iterkeys(), which
    # only exists on Python 2 dictionaries.
    td_matrix[(doc_title, url)] = {term: tc.tf_idf(term, post)
                                   for term in fdist}

# Compare every pair of documents. Both tf-idf maps are projected onto a
# shared, sorted vocabulary so that term scores sit in the same positions,
# then the cosine distance between the two vectors is computed.
# distances maps (title1, url1) -> {(title2, url2): cosine distance}.
distances = {}
for (title1, url1) in td_matrix:
    distances[(title1, url1)] = {}
    (min_dist, most_similar) = (1.0, ('', ''))

    # Loop-invariant: the term map of the document being compared.
    terms1 = td_matrix[(title1, url1)]

    for (title2, url2) in td_matrix:
        terms2 = td_matrix[(title2, url2)]

        # A term missing from one document scores 0 there. Reading through
        # .get() fills the gaps without copying or mutating the maps stored
        # in td_matrix, which are needed again on later iterations.
        vocabulary = sorted(set(terms1) | set(terms2))
        v1 = [terms1.get(term, 0) for term in vocabulary]
        v2 = [terms2.get(term, 0) for term in vocabulary]

        # Compute similarity amongst documents
        distance = nltk.cluster.util.cosine_distance(v1, v2)
        distances[(title1, url1)][(title2, url2)] = distance

        # The distance of a document to itself is still recorded above, but
        # it must never be reported as the document's best match.
        if url1 == url2:
            continue

        if distance < min_dist:
            (min_dist, most_similar) = (distance, (title2, url2))

    # The reported score is a similarity: 1 minus the smallest distance.
    # A single parenthesized argument prints identically on Python 2 and 3.
    print('''Most similar to %s (%s)
\t%s (%s)
\tscore %f
''' % (title1, url1, most_similar[0], most_similar[1], 1 - min_dist))


Most similar to Great talk by Maciej Ceglowski.  Funny, smart, and with an important message.  Just like Maciej all ... (
	Journalism vs. Punditry: NPR's Kelly McEvers on Why Reporting Matters

There was a great segment on ... (
	score 0.056840

Most similar to Journalism vs. Punditry: NPR's Kelly McEvers on Why Reporting Matters

There was a great segment on ... (
	Great talk by Maciej Ceglowski.  Funny, smart, and with an important message.  Just like Maciej all ... (
	score 0.056840

Most similar to The Myth of the Spoiled Child

There is an interesting op-ed in the NY Times by Alfie Cohn, who has ... (
	How to Raise Moral Children

I thought this article on child-raising had a lot of good ideas in it. ... (
	score 0.064629

Most similar to Why Common Core is Like

Draw a bold line between this piece on the failure of the Common... (
	"We don't need new policies. We need better implementation."

Last night, I hosted Oakland City Councilor... (
	score 0.067829

Most similar to "We don't need new policies. We need better implementation."

Last night, I hosted Oakland City Councilor... (
	Why Common Core is Like

Draw a bold line between this piece on the failure of the Common... (
	score 0.067829

Most similar to +Maria Konnikova's NY Times article about the role of time and attention scarcity in the cycle of poverty... (
	How to Raise Moral Children

I thought this article on child-raising had a lot of good ideas in it. ... (
	score 0.046450

Most similar to How to Raise Moral Children

I thought this article on child-raising had a lot of good ideas in it. ... (
	The Myth of the Spoiled Child

There is an interesting op-ed in the NY Times by Alfie Cohn, who has ... (
	score 0.064629

