几乎所有的博客都可以在线阅读,或者通过RSS订阅源进行阅读。RSS订阅源是一个包含博客及其所有文章条目信息的简单的XML文档。
程序中使用了feedparser第三方模块,可以轻松地从任何RSS或Atom订阅源中得到标题、链接和文章的条目。完整代码如下:
01 | ''' |
02 | Created on Jul 14, 2012 |
03 |
04 | @Author: killua |
05 | @E-mail: killua_hzl@163.com |
06 | @Homepage: http://www.yidooo.net |
07 | @Description: Counting the words in a Feed |
08 |
09 | feedparser:feedparser is a Python library that parses feeds in all known formats, including Atom, RSS, and RDF.It runs on Python 2.4 all the way up to 3.2. |
10 |
11 | dataset: http://kiwitobes.com/clusters/feedlist.txt |
12 | You can download feeds from this list. Maybe some feeds you can access in China. |
13 | ''' |
14 |
15 | import feedparser |
16 | import re |
17 |
18 | #Get word from feed |
19 | def getwords(html): |
20 | #Remove all the HTML tags |
21 | text = re. compile (r "<[^>]+>" ).sub('', html) |
22 |
23 | #Split words by all non-alpha characters |
24 | words = re. compile (r "[^A-Z^a-z]+" ).split(text) |
25 |
26 | #Convert words to lowercase |
27 | wordlist = [word.lower() for word in words if word ! = ""] |
28 |
29 | return wordlist |
30 |
#Returns title and dictionary of word counts for an RSS feed
def getFeedwordcounts(url):
    """Return (feed title, word-count dict) for an RSS/Atom feed.

    Args:
        url: URL (or local path) of the feed passed to feedparser.

    Returns:
        A tuple of the feed's title and a dict mapping each word found
        in the entry titles/summaries to its total occurrence count.
    """
    # Parse the feed.
    d = feedparser.parse(url)
    wordcounts = {}

    # Loop over all the entries; depending on the feed format the body
    # is exposed as either 'summary' or 'description'.
    for e in d.entries:
        if 'summary' in e:
            summary = e.summary
        else:
            # Fall back to an empty string when neither field exists,
            # instead of raising AttributeError on e.description.
            summary = e.get('description', '')

        # Count every word appearing in the title and the body.
        for word in getwords(e.title + ' ' + summary):
            wordcounts[word] = wordcounts.get(word, 0) + 1

    return d.feed.title, wordcounts
50 |
if __name__ == '__main__':
    # blogcount: for each word, the number of blogs using it more than once.
    blogcount = {}
    # wordcounts: per-blog {word: count} dicts keyed by the blog's title.
    wordcounts = {}

    # One feed URL per line; strip the trailing newline and skip blank
    # lines so feedparser receives clean URLs.
    with open('resource/feedlist.txt') as feedFile:
        feedlist = [line.strip() for line in feedFile if line.strip()]

    for feedUrl in feedlist:
        try:
            title, wc = getFeedwordcounts(feedUrl)
            wordcounts[title] = wc
            for word, count in wc.items():
                blogcount.setdefault(word, 0)
                if count > 1:
                    blogcount[word] += 1
        except Exception:
            # One unreachable or malformed feed should not abort the
            # whole run; report it and continue with the next feed.
            print('Failed to parse feed %s' % feedUrl)

    # Keep only mid-frequency words: very rare words (names, typos) and
    # near-universal words (stopwords) carry little information.
    wordlist = []
    for w, bc in blogcount.items():
        frac = float(bc) / len(feedlist)
        if frac > 0.1 and frac < 0.5:
            wordlist.append(w)

    # Write the result as a tab-separated matrix: one header row of
    # words, then one row per blog with that blog's count for each word.
    with open('blogdata', 'w') as datafile:
        # Write result's header row.
        datafile.write('Blog')
        for word in wordlist:
            datafile.write('\t%s' % word)
        datafile.write('\n')
        # Write one row per blog.
        for blogname, wc in wordcounts.items():
            print(blogname)
            datafile.write(blogname)
            for word in wordlist:
                if word in wc:
                    datafile.write("\t%d" % wc[word])
                else:
                    datafile.write("\t0")
            datafile.write('\n')
转载请注明: 转自阿龙の异度空间