1. word_stat.py
# encoding: utf-8
import re
import os
from sys import argv
def split_str(text):
    """Split text into a list of words.

    A word is a maximal run of regex "word" characters; everything else
    (whitespace, punctuation, symbols) acts as a separator.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the words in order of appearance. The list is empty when
        the text contains no word characters at all.
    """
    # Collecting the word runs directly never produces the empty-string
    # tokens that re.split(r"\W+", ...) emits for empty input or for text
    # that begins or ends with a separator.
    return re.findall(r"\w+", text)
def sort_dict(d):
    """Return the (key, value) pairs of d ordered by value, largest first.

    Args:
        d: A dictionary whose values can be compared with each other.

    Returns:
        A new list of (key, value) tuples sorted by descending value.
        The dictionary itself is not modified.
    """
    pairs = list(d.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs
def filter_dict(d, limit=0):
    """Remove, in place, every entry of d whose value is <= limit.

    Args:
        d: The dictionary to prune; it is modified directly.
        limit: Entries with a value less than or equal to this are removed.

    Returns:
        None. The result is the mutated dictionary d.
    """
    # Decide what to drop before deleting anything: removing entries while
    # iterating over the live items() view raises RuntimeError on Python 3.
    doomed = [key for key, value in d.items() if value <= limit]
    for key in doomed:
        del d[key]
def filter_word(word, minlen=3, maxlen=20, stopwords=None):
    """Tell whether a word should be left out of the statistics.

    A word is rejected when its length is outside [minlen, maxlen], when it
    consists only of digits, or when it is a stop word.

    Args:
        word: The candidate word.
        minlen: Shortest length that is still accepted.
        maxlen: Longest length that is still accepted.
        stopwords: Optional collection of words to reject. When omitted,
            the module-level ``filterwords`` set is used.

    Returns:
        True if the word must be skipped, False if it should be counted.
    """
    if not minlen <= len(word) <= maxlen:
        return True
    if word.isdigit():
        return True
    if stopwords is None:
        # Looked up only when needed, so the length and digit checks work
        # without the module-level set.
        stopwords = filterwords
    # Always answer with a real boolean instead of falling off the end and
    # returning None for words that are kept.
    return word in stopwords
def stat_words(words, word_dict):
    """Add the occurrences of the given words to a frequency dictionary.

    Every word is lower-cased first; words rejected by filter_word() are
    ignored.

    Args:
        words: An iterable of word strings.
        word_dict: A dictionary mapping a word to its count so far; it is
            updated in place.

    Returns:
        The same word_dict object that was passed in.
    """
    for raw in words:
        token = raw.lower()
        if not filter_word(token):
            word_dict[token] = word_dict.get(token, 0) + 1
    return word_dict
def process(text):
    """Tokenize a piece of text and count its words in the global word_dict.

    Args:
        text: The text to process; the script passes one line at a time.
    """
    # word_dict is only mutated, never rebound, so no global declaration is
    # required to reach the module-level dictionary.
    stat_words(split_str(text), word_dict)
def load_filter_words(path="wordsfilter.txt"):
    """Load the stop words that must be excluded from the statistics.

    The file is tokenized with split_str(), so any non-word character
    (newlines, the "#" group separators, ...) separates entries.

    Args:
        path: Location of the stop-word file. A relative path, including the
            default "wordsfilter.txt", is resolved by open() against the
            current working directory.

    Returns:
        A set of the words found in the file.

    Raises:
        IOError: If the file cannot be opened or read.
    """
    with open(path) as f:
        words = set(split_str(f.read()))
    # A separator at the very start or end of the file can yield an
    # empty-string token; it is not a word, so keep it out of the set.
    words.discard("")
    return words
# Stop words that are never counted; replaced by the contents of the
# stop-word file when the script runs.
filterwords = set()
# Maps every counted word to its number of occurrences.
word_dict = {}
# Command-line entry point: each argument is a text file to analyze.
# The __main__ guard keeps the run from firing when the module is imported.
if __name__ == "__main__" and len(argv) > 1:
    try:
        filterwords = load_filter_words()
        # argv[0] is the script name; slicing skips it without mutating
        # sys.argv.
        for fname in argv[1:]:
            # Arguments that are not existing regular files are skipped
            # silently.
            if os.path.isfile(fname):
                # Read the file line by line.
                with open(fname) as f:
                    for line in f:
                        process(line)
        # Drop words whose count is <= the default limit of 0. Every
        # counted word has a count of at least 1, so nothing is removed
        # unless a limit is passed.
        filter_dict(word_dict)
        # Order the words by descending number of occurrences.
        sorted_word_dict = sort_dict(word_dict)
        # Print one (word, count) tuple per line. print(x) with a single
        # argument and "except ... as" behave the same on Python 2.6+ and
        # Python 3.
        for item in sorted_word_dict:
            print(item)
    except IOError as e:
        print(str(e))
2. 要过滤的单词wordsfilter.txt:
#
a
an
the
#
be
am
is
are
been
was
were
do
does
did
done
have
has
had
will
would
can
could
shall
should
may
might
must
#
here
there
this
that
these
those
#
how
what
when
where
which
who
whom
#
i
we
our
ours
you
your
yours
he
she
they
it
his
her
hers
its
their
theirs
#
break
class
const
continue
double
except
exception
false
final
finally
float
for
else
int
integer
if
long
private
protected
public
short
static
switch
true
try
while
#
http
https
java
javabean
javascript
jquery
js
jsf
jsp
mysql
oracle
sql
url
xml
web
#
also
and
but
either
nor
or
#
about
above
across
after
back
before
between
by
from
in
into
of
off
on
out
to
under
up
with
without
#
one
two
three
four
five
six
seven
eight
nine
ten
first
second
third
#
all
better
best
each
even
good
hello
just
once
only
other
many
much
more
most
next
no
not
so
such
too
than
then
very
well
yes
#
come
came
get
give
go
gone
need
set