获取文档中单词出现的位置索引(行号、列号),由此可统计单词出现的频率
一、
# -*- coding:utf-8 -*-
import sys
import os
import re
import collections
# Approach 1: build a word -> positions index using a plain dict and dict.get.
# \w+ matches one or more word characters (letters, digits, underscore).
WORD_RE = re.compile(r'\w+')
# The input file aa.txt is looked up in the same directory as this script.
path = os.path.dirname(os.path.abspath(__file__))
path2 = os.path.join(path, 'aa.txt')
# Maps each word to the list of (line_no, column_no) tuples where it occurs.
index = {}
with open(path2, 'r') as f:
    for line_no, line in enumerate(f, 1):  # number lines starting from 1
        for match in WORD_RE.finditer(line):  # one match object per word
            word = match.group()
            # match.start() is a 0-based offset; report a 1-based column.
            column_no = match.start() + 1
            location = (line_no, column_no)
            # dict.get returns a fresh [] when the word is not in index yet;
            # that list is not stored automatically, hence the assignment
            # back into index below.
            occurrences = index.get(word, [])
            occurrences.append(location)
            index[word] = occurrences
# Iterating a dict yields its keys; sort them ignoring case.
for word in sorted(index, key=str.upper):
    print(word, index[word])
二、
a = collections.defaultdict(default_factory):创建 defaultdict 对象时需要传入一个可调用对象(如 list),在它找不到键的时候会调用这个可调用对象来生成默认值。
a[key] 其实是在调用 dict 的内置方法 __getitem__。也就是说,a[key] 在找不到键的时候会调用上面传入的可调用对象,把返回值存入字典作为该键的值,然后返回这个值。
# -*- coding:utf-8 -*-
import sys
import os
import re
import collections
# Approach 2: build the word -> positions index with collections.defaultdict.
WORD_RE = re.compile(r'\w+')
# The input file aa.txt is looked up in the same directory as this script.
path = os.path.dirname(os.path.abspath(__file__))
path2 = os.path.join(path, 'aa.txt')
# defaultdict takes a callable as its argument, e.g. the built-in list or a
# user-defined function such as:
#     def func_1(): return 'abc'
# Here list is used, so a missing key gets an empty list as its value.
index_2 = collections.defaultdict(list)
with open(path2, 'r') as f:
    for line_no, line in enumerate(f, 1):  # number lines starting from 1
        for match in WORD_RE.finditer(line):  # one match object per word
            word = match.group()
            # match.start() is a 0-based offset; report a 1-based column.
            column_no = match.start() + 1
            location = (line_no, column_no)
            # If word is not a key yet, index_2[word] calls list(), stores
            # the resulting empty list under word, and returns it.
            index_2[word].append(location)
# Iterating a dict yields its keys; sort them ignoring case.
for word in sorted(index_2, key=str.upper):
    print(word, index_2[word])
三、
index_3.setdefault(word, []):在找不到键的时候创建该键并赋值为 [],然后返回该键对应的值(键已存在时直接返回已有的值),因此可以直接在返回值上调用 append。
# -*- coding:utf-8 -*-
import sys
import os
import re
import collections
# Approach 3: build the word -> positions index with dict.setdefault.
WORD_RE = re.compile(r'\w+')
# The input file aa.txt is looked up in the same directory as this script.
path = os.path.dirname(os.path.abspath(__file__))
path2 = os.path.join(path, 'aa.txt')
# Maps each word to the list of (line_no, column_no) tuples where it occurs.
index_3 = {}
with open(path2, 'r') as f:
    for line_no, line in enumerate(f, 1):  # number lines starting from 1
        for match in WORD_RE.finditer(line):  # one match object per word
            word = match.group()
            # match.start() is a 0-based offset; report a 1-based column.
            column_no = match.start() + 1
            location = (line_no, column_no)
            # setdefault stores [] under word if the key is missing and
            # returns the list held under word, so append works directly.
            index_3.setdefault(word, []).append(location)
# Iterating a dict yields its keys; sort them ignoring case.
for word in sorted(index_3, key=str.upper):
    print(word, index_3[word])
四、
# Excerpt of the reading loop used above; path2 and WORD_RE are the names
# defined in the preceding scripts.
with open(path2, 'r') as f:
    # enumerate(f, 1) yields (line_no, line) pairs, counting lines from 1.
    for line_no, line in enumerate(f, 1):
        # finditer yields one match object for each match found in the line.
        for match in WORD_RE.finditer(line):
            word = match.group()  # the matched text
            # match.start() is a 0-based offset; report a 1-based column.
            column_no = match.start() + 1
            location = (line_no, column_no)
# enumerate(sequence, start) 的参数:1、sequence -- 一个序列、迭代器或其他支持迭代的对象。2、start -- 下标起始位置(默认为 0)。
# 返回 enumerate(枚举) 对象
>>> a = ['dd','cc','aa']
>>> b = enumerate(a,1)
>>> list(b)
[(1, 'dd'), (2, 'cc'), (3, 'aa')]
>>> a = 'aa dd bb cc'
>>> import re
>>> WORD_RE = re.compile(r'\w+')
# 和 findall 类似,在字符串中找到正则表达式所匹配的所有子串,但返回的是一个逐个产出匹配对象的迭代器
>>> b = WORD_RE.finditer(a)
>>> b
<callable-iterator object at 0x7f2f5db57c10>
>>> c = list(b)
>>> c
[<_sre.SRE_Match object at 0x7f2f5db5a308>, <_sre.SRE_Match object at 0x7f2f5db5a370>, <_sre.SRE_Match object at 0x7f2f5db5a3d8>, <_sre.SRE_Match object at 0x7f2f5db5a440>]
>>> c[0].group()
'aa'
# 正则表达式中,group()用来提取分组截获的字符串,()用来分组
>>> a = "123abc456"
# group(0)、group()返回整个匹配到的字符串
>>> re.search("([0-9]*)([a-z]*)([0-9]*)",a).group(0)
'123abc456'
>>> re.search("([0-9]*)([a-z]*)([0-9]*)",a).group(1)
'123'
>>> re.search("([0-9]*)([a-z]*)([0-9]*)",a).group(2)
'abc'
>>> re.search("([0-9]*)([a-z]*)([0-9]*)",a).group(3)
'456'