Python(2.7.x)实现简单的单词频数统计

1. word_stat.py

# encoding: utf-8 
import re
import os
from sys import argv

# Split a text string into a list of words
def split_str(text):
	"""Return the pieces of *text* separated by runs of non-word characters.

	Surrounding whitespace is stripped first. If the stripped text begins or
	ends with a non-word character (for example "#" or "."), the result
	contains an empty string at that end; an empty input yields [""].
	"""
	trimmed = text.strip()
	separator = re.compile(r"\W+")
	return separator.split(trimmed)

# Sort a dictionary by value
def sort_dict(d):
	"""Return the (key, value) pairs of *d* as a list, largest value first.

	The dictionary itself is not modified.
	"""
	pairs = list(d.items())
	pairs.sort(key = lambda pair: pair[1], reverse = True)
	return pairs

# Filter a dictionary by value
def filter_dict(d, limit = 0):
	"""Remove, in place, every entry of *d* whose value is <= *limit*.

	Returns None; the caller's dictionary is modified directly.
	"""
	# Collect the keys first: deleting while iterating over d.items() raises
	# RuntimeError on Python 3, where items() is a live view, not a list copy.
	doomed = [k for (k, v) in d.items() if v <= limit]
	for k in doomed:
		del d[k]

# Decide whether a word should be excluded from the statistics
def filter_word(word, minlen = 3, maxlen = 20):
	"""Return True if *word* must be skipped, False if it should be counted.

	A word is skipped when its length is outside [minlen, maxlen], when it
	consists only of digits, or when it is a member of the module-level
	`filterwords` collection.
	"""
	# `filterwords` is only read here, so no `global` declaration is needed.
	if not minlen <= len(word) <= maxlen:
		return True
	if word.isdigit():
		return True
	# Always return a real bool instead of falling off the end with None.
	return word in filterwords

# Count word occurrences
def stat_words(words, word_dict):
	"""Add the occurrences of *words* to *word_dict* and return it.

	Each word is lower-cased before counting; words rejected by
	filter_word() are ignored. *word_dict* is updated in place and the
	same object is returned.
	"""
	for raw in words:
		token = raw.lower()
		if not filter_word(token):
			word_dict[token] = word_dict.get(token, 0) + 1
	return word_dict

# Process one piece of text
def process(text):
	"""Split *text* into words and add them to the module-level word_dict."""
	# word_dict is mutated, never rebound, so it is reachable without `global`.
	stat_words(split_str(text), word_dict)

# Load the filter vocabulary
def load_filter_words(path = "wordsfilter.txt"):
	"""Read the filter word file and return its words as a set.

	path -- file to read; a relative path (including the default) is
	        resolved against the current working directory.

	The file content is tokenised with split_str(), so separator lines such
	as "#" contribute no words of their own. Entries are lower-cased because
	stat_words() lower-cases every word before it is checked against this set.

	Raises IOError if the file cannot be opened.
	"""
	with open(path) as f:
		tokens = split_str(f.read())
	return set(token.lower() for token in tokens)

# Set of words to be excluded from the statistics
filterwords = set()
# Maps each counted word to its number of occurrences
word_dict = {}

# Usage: python word_stat.py FILE [FILE ...]
if len(argv) > 1:
	try:
		# Load the filter vocabulary
		filterwords = load_filter_words()

		# Skip the script name by slicing, so sys.argv itself is left intact
		for fname in argv[1:]:
			# Arguments that are not existing regular files are skipped silently
			if os.path.isfile(fname):
				# Read the file line by line and process each line
				with open(fname) as f:
					for line in f:
						process(line)

		# Drop words whose count is <= the default limit of filter_dict()
		filter_dict(word_dict)
		# Order the words by number of occurrences, most frequent first
		sorted_word_dict = sort_dict(word_dict)

		# Print the result, one (word, count) tuple per line
		for item in sorted_word_dict:
			print(item)

	# `except ... as` and print(...) are valid on both Python 2.7 and Python 3
	except IOError as e:
		print(str(e))

2. 要过滤的单词wordsfilter.txt:

#
a
an
the

#
be
am
is
are
been
was
were
do
does
did
done
have
has
had
will
would
can
could
shall
should
may
might
must

#
here
there
this
that
these
those

#
how
what
when
where
which
who
whom

#
i
we
our
ours
you
your
yours
he
she
they
it
his
her
hers
its
their
theirs

#
break
class
const
continue
double
except
exception
false
final
finally
float
for
else
int
integer
if
long
private
protected
public
short
static
switch
true
try
while

#
http
https
java
javabean
javascript
jquery
js
jsf
jsp
mysql
oracle
sql
url
xml
web

#
also
and
but
either
nor
or

#
about
above
across
after
back
before
between
by
from
in
into
of
off
on
out
to
under
up
with
without

#
one
two
three
four
five
six
seven
eight
nine
ten
first
second
third

#
all
better
best
each
even
good
hello
just
once
only
other
many
much
more
most
next
no
not
so
such
too
than
then
very
well
yes

#
come
came
get
give
go
gone
need
set


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值