#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on 2015-1-24
@author: beyondzhou
@name: nltk_process_blog.py
'''
import json
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
# Download nltk packages used in this example
#nltk.download('stopwords')
# Read data
BLOG_DATA = r"E:\eclipse\Web\dFile\feed.json"
# Open in binary mode inside a context manager: the handle is closed as soon
# as parsing finishes, and json decodes the raw bytes itself instead of
# relying on the platform's default text encoding.
with open(BLOG_DATA, 'rb') as blog_file:
    blog_data = json.load(blog_file)

# Customize your list of stopwords as needed. Here, we add common
# punctuation and contraction artifacts.
# NOTE(review): '-' is listed twice (the second time as u'-'); the last entry
# may have been meant to be a different dash character -- confirm.
stop_words = nltk.corpus.stopwords.words('english') + [
    '.',
    ',',
    '--',
    '\'s',
    '?',
    ')',
    '(',
    ':',
    '\'',
    '\'re',
    '"',
    '-',
    '}',
    '{',
    u'-',
]
# Set copy of the list above so the per-word filter below is a hash lookup
# rather than a linear scan.
stop_word_set = frozenset(stop_words)

# Print a short lexical summary for every post in the feed.
for post in blog_data:
    sentences = sent_tokenize(post['content'])
    words = [w.lower() for sentence in sentences
             for w in word_tokenize(sentence)]
    fdist = nltk.FreqDist(words)

    # Basic stats
    # Every token is counted exactly once in fdist, so the total number of
    # words is simply the length of the token list.
    num_words = len(words)
    num_unique_words = len(fdist)
    # Hapaxes are words that appear only once
    num_hapaxes = len(fdist.hapaxes())

    # Rank explicitly instead of trusting the iteration order of
    # fdist.items(): most frequent first, ties broken alphabetically.
    ranked_items = sorted(fdist.items(),
                          key=lambda item: (-item[1], item[0]))
    top_10_words_sans_stop_words = [
        item for item in ranked_items if item[0] not in stop_word_set
    ][:10]

    # Single-argument print(...) calls behave the same under the Python 2
    # print statement and the Python 3 print function.
    print(post['title'])
    print('\tNum Sentences:'.ljust(25) + ' ' + str(len(sentences)))
    print('\tNum Words:'.ljust(25) + ' ' + str(num_words))
    print('\tNum Unique Words:'.ljust(25) + ' ' + str(num_unique_words))
    print('\tNum Hapaxes:'.ljust(25) + ' ' + str(num_hapaxes))
    # No separator follows the header: it ends in a tab, and the Python 2
    # print statement writes no space after an item ending in such whitespace.
    print('\tTop 10 Most Frequent Words (sans stop words):\n\t\t'
          + '\n\t\t'.join('%s (%s)' % (word, count)
                          for word, count in top_10_words_sans_stop_words))
    print('')
Four short links: 23 January 2015
Num Sentences: 6
Num Words: 172
Num Unique Words: 121
Num Hapaxes: 98
Top 10 Most Frequent Words (sans stop words):
— (4)
fields (3)
academic (2)
analysis (2)
believed (2)
brilliance (2)
code (2)
language (2)
mining (2)
require (2)
Designing on a system level
Num Sentences: 8
Num Words: 238
Num Unique Words: 128
Num Hapaxes: 85
Top 10 Most Frequent Words (sans stop words):
thinking (6)
design (5)
systems (5)
goodman (4)
physical (3)
computer (2)
conversation (2)
define (2)
designer (2)
designing (2)
Bitcoin is just the first app to use blockchain technology
Num Sentences: 13
Num Words: 231
Num Unique Words: 135
Num Hapaxes: 103
Top 10 Most Frequent Words (sans stop words):
bitcoin (6)
time (4)
blockchain (3)
& (2)
around (2)
computers (2)
consensus (2)
could (2)
first (2)
global (2)
Blockchain scalability
Num Sentences: 13
Num Words: 367
Num Unique Words: 196
Num Hapaxes: 144
Top 10 Most Frequent Words (sans stop words):
bitcoin (9)
blockchain (9)
issue (3)
transactions (3)
& (2)
article (2)
block (2)
blocks (2)
centralization (2)
detail (2)
Bringing an end to synthetic biology’s semantic debate
Num Sentences: 11
Num Words: 350
Num Unique Words: 179
Num Hapaxes: 125
Top 10 Most Frequent Words (sans stop words):
synthetic (12)
biology (11)
genetic (5)
working (5)
that’s (4)
gardner (3)
living (3)
materials (3)
areas (2)
defining (2)
Building and deploying large-scale machine learning pipelines
Num Sentences: 10
Num Words: 196
Num Unique Words: 124
Num Hapaxes: 93
Top 10 Most Frequent Words (sans stop words):
learning (5)
machine (5)
projects (4)
data (3)
pipelines (3)
ben (2)
berkeley (2)
many (2)
new (2)
optimization (2)
Four short links: 22 January 2015
Num Sentences: 16
Num Words: 251
Num Unique Words: 150
Num Hapaxes: 119
Top 10 Most Frequent Words (sans stop words):
— (4)
i’m (3)
language (3)
like (3)
natural (3)
testing (3)
facebook (2)
interface (2)
interfaces (2)
kinect (2)
How to make a UX designer
Num Sentences: 7
Num Words: 227
Num Unique Words: 126
Num Hapaxes: 93
Top 10 Most Frequent Words (sans stop words):
design (8)
ux (8)
designers (5)
wydeven (4)
better (3)
graphic (3)
new (3)
websites (3)
came (2)
got (2)
The 3Ps of the blockchain: platforms, programs and protocols
Num Sentences: 6
Num Words: 167
Num Unique Words: 100
Num Hapaxes: 74
Top 10 Most Frequent Words (sans stop words):
blockchain (3)
although (2)
blockchain’s (2)
landscape (2)
protocol (2)
single (2)
“blockchain (2)
activity (1)
adoption (1)
already (1)
Four short links: 21 January 2015
Num Sentences: 9
Num Words: 118
Num Unique Words: 74
Num Hapaxes: 57
Top 10 Most Frequent Words (sans stop words):
pc (5)
— (4)
= (3)
2015 (2)
data (2)
mouse (2)
new (2)
2.2b (1)
2000s (1)
2020 (1)
The Internet of Things is really about software
Num Sentences: 10
Num Words: 244
Num Unique Words: 132
Num Hapaxes: 97
Top 10 Most Frequent Words (sans stop words):
internet (6)
software (6)
things (6)
; (2)
business (2)
free (2)
hardware (2)
industries (2)
iot (2)
it’s (2)
What containers can do for you
Num Sentences: 5
Num Words: 100
Num Unique Words: 84
Num Hapaxes: 74
Top 10 Most Frequent Words (sans stop words):
containers (2)
advantages (1)
applications (1)
behind (1)
better (1)
bringing (1)
buzz (1)
check (1)
compelling (1)
container (1)
Four short links: 20 January 2015
Num Sentences: 10
Num Words: 180
Num Unique Words: 116
Num Hapaxes: 85
Top 10 Most Frequent Words (sans stop words):
— (4)
mind (3)
scalability (3)
change (2)
collective (2)
group (2)
intelligence (2)
look (2)
mit (2)
one (2)
Striking parallels between mathematics and software engineering
Num Sentences: 20
Num Words: 339
Num Unique Words: 185
Num Hapaxes: 134
Top 10 Most Frequent Words (sans stop words):
mathematics (5)
algebra (3)
linear (3)
+ (2)
defined (2)
design (2)
designed (2)
hadoop (2)
learning (2)
machine (2)
Four short links: 19 January 2015
Num Sentences: 12
Num Words: 199
Num Unique Words: 132
Num Hapaxes: 105
Top 10 Most Frequent Words (sans stop words):
— (4)
ai (3)
learning (3)
ava (2)
deep (2)
facebook (2)
google’s (2)
information (2)
open (2)
q (2)