In [
1]:
import re
In [
2]:
import io
# In[3]:
# Matches any run of non-word characters and/or decimal digits. Substituting
# '' for every match leaves only word characters that are not digits
# (letters, plus the underscore, which \W does not match).
non_word = re.compile(r'[\W\d]+', flags=re.UNICODE)
# In[4]:
# High-frequency words that are excluded from the word counts; yield_words()
# skips any cleaned, lower-cased token found in this set.
# Each word is listed exactly once (the original literal repeated 'not').
common_words = {
    'the', 'of', 'and', 'in', 'to', 'a', 'is', 'it', 'that', 'which', 'as', 'on', 'by',
    'be', 'this', 'with', 'are', 'from', 'will', 'at', 'you', 'not', 'for', 'no', 'have',
    'i', 'or', 'if', 'his', 'its', 'they', 'but', 'their', 'one', 'all', 'he', 'when',
    'than', 'so', 'these', 'them', 'may', 'see', 'other', 'was', 'has', 'an', 'there',
    'more', 'we', 'footnote', 'who', 'had', 'been', 'she', 'do', 'what',
    'her', 'him', 'my', 'me', 'would', 'could', 'said', 'am', 'were', 'very',
    'your', 'did',
}
In [
5]:
def yield_words(filename, encoding='latin-1'):
    """Lazily yield the cleaned, uncommon words of a text file.

    The file is read line by line. Each line is split on whitespace, every
    token is lower-cased and stripped of everything matched by the
    module-level ``non_word`` pattern, and the result is yielded unless it
    is empty or a member of the module-level ``common_words`` set.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to ``'latin-1'``,
        the value that was previously hard-coded.

    Yields
    ------
    str
        One cleaned word at a time, in file order.
    """
    # io.open (rather than the builtin) accepts ``encoding`` on Python 2 too.
    with io.open(filename, encoding=encoding) as f:
        for line in f:
            for token in line.split():
                word = non_word.sub('', token.lower())
                # Tokens made only of punctuation/digits collapse to ''.
                if word and word not in common_words:
                    yield word
In [
6]:
def word_count(filename):
    """Count how often each word produced by ``yield_words`` occurs.

    Parameters
    ----------
    filename : str
        Path of the text file, passed straight through to ``yield_words``.

    Returns
    -------
    collections.defaultdict
        ``defaultdict(int)`` mapping each yielded word to its number of
        occurrences; empty when ``yield_words`` yields nothing.
    """
    counts = defaultdict(int)
    # A for loop consumes the generator and handles StopIteration itself.
    for word in yield_words(filename):
        counts[word] += 1
    return counts
In [
6]:
from collections
import defaultdict
In [
7]: %time counts = word_count(filename)
CPU times: user 88.5 ms, sys: 2.48 ms, total: 91 ms
Wall time: 89.3 ms
|