Python collections
模块实用教程
collections 是 Python 标准库中一个强大的模块,提供了许多有用的容器数据类型,可以在某些情况下替代 Python 的内置容器(如 dict、list、set、tuple)。
目录
1. 基础容器类型
namedtuple
创建具有命名字段的元组子类,使代码更易读。
from collections import namedtuple
# A minimal Point type; field names may be given as one space-separated string
Point = namedtuple('Point', 'x y')
p = Point(11, y=22)  # positional and keyword arguments can be mixed
print(p.x, p.y)  # 11 22
print(p[0], p[1])  # 11 22 (index access still works)
print(p._asdict())  # {'x': 11, 'y': 22}

# A richer Employee type; defaults apply to the right-most fields
Employee = namedtuple(
    'Employee',
    ['name', 'age', 'department', 'salary'],
    defaults=('IT', 50000),
)
emp = Employee('Alice', 30)
print(emp)  # Employee(name='Alice', age=30, department='IT', salary=50000)

# _replace returns a new instance; the original is left untouched
emp_new = emp._replace(age=31, salary=55000)
print(emp_new)  # Employee(name='Alice', age=31, department='IT', salary=55000)

# _fields lists the field names
print(Employee._fields)  # ('name', 'age', 'department', 'salary')

# _make builds an instance from any iterable
emp2 = Employee._make(('Bob', 25, 'HR', 45000))
print(emp2)  # Employee(name='Bob', age=25, department='HR', salary=45000)
deque
双端队列,支持从两端快速添加和删除元素。
from collections import deque
d = deque('ghi')  # initialise from an iterable
print(d)  # deque(['g', 'h', 'i'])
d.append('j')  # add on the right
d.appendleft('f')  # add on the left
print(d)  # deque(['f', 'g', 'h', 'i', 'j'])
print(d.pop())  # 'j' (removed from the right)
print(d.popleft())  # 'f' (removed from the left)
print(d)  # deque(['g', 'h', 'i'])
print(d[1])  # 'h' (indexing is supported)
d[1] = 'x'  # item assignment is supported
print(d)  # deque(['g', 'x', 'i'])
d.extend('jkl')  # extend on the right
d.extendleft('def')  # extend on the left (items are pushed one by one, so they end up reversed)
print(d)  # deque(['f', 'e', 'd', 'g', 'x', 'i', 'j', 'k', 'l'])
d.rotate(1)  # rotate one step to the right
print(d)  # deque(['l', 'f', 'e', 'd', 'g', 'x', 'i', 'j', 'k'])
d.rotate(-2)  # rotate two steps to the left
print(d)  # deque(['e', 'd', 'g', 'x', 'i', 'j', 'k', 'l', 'f'])
# Bounded length
d = deque(maxlen=3)
d.extend('abc')
print(d)  # deque(['a', 'b', 'c'], maxlen=3)
d.append('d')  # when full, appending on the right drops the leftmost item
print(d)  # deque(['b', 'c', 'd'], maxlen=3)
ChainMap
将多个字典或映射链接在一起,形成单个可更新的视图。
from collections import ChainMap
dict1 = {'a': 1, 'b': 2}
dict2 = {'b': 3, 'c': 4}
chain = ChainMap(dict1, dict2)
print(chain['a'])  # 1 (from dict1)
print(chain['b'])  # 2 (from dict1, because dict1 comes first)
print(chain['c'])  # 4 (from dict2)
# Writes only ever touch the first mapping
chain['c'] = 5
print(dict2)  # {'b': 3, 'c': 4} (unchanged)
print(dict1)  # {'a': 1, 'b': 2, 'c': 5} ('c' was added here)
# Add a new mapping in front
dict3 = {'d': 6}
new_chain = chain.new_child(dict3)
print(new_chain['d'])  # 6
# The parents attribute skips the first mapping
print(new_chain.parents['b'])  # 2 (from dict1)
# Flattening
merged = dict(chain)  # convert to a plain dict
print(merged)  # {'b': 2, 'c': 5, 'a': 1} (values resolve front-to-back; key order follows ChainMap iteration, which starts from the last mapping)
# Live view
dict1['e'] = 7
print(chain['e'])  # 7 (changes to the underlying dicts are visible immediately)
Counter
计数器,用于统计可哈希对象的出现次数。
from collections import Counter
# Build from an iterable
cnt = Counter(['red', 'blue', 'red', 'green', 'blue', 'blue'])
print(cnt)  # Counter({'blue': 3, 'red': 2, 'green': 1})
# Build from a dict
cnt = Counter({'red': 4, 'blue': 2})
print(cnt)  # Counter({'red': 4, 'blue': 2})
# Build from keyword arguments
cnt = Counter(cats=4, dogs=8)
print(cnt)  # Counter({'dogs': 8, 'cats': 4})
# Basic operations
print(cnt['dogs'])  # 8
print(cnt['birds'])  # 0 (missing keys read as 0)
cnt['dogs'] += 1
print(cnt['dogs'])  # 9
del cnt['cats']
print(cnt)  # Counter({'dogs': 9})
# Methods
print(list(cnt.elements()))  # ['dogs', 'dogs', ...] (9 times)
print(cnt.most_common(1))  # [('dogs', 9)]
print(cnt.most_common())  # [('dogs', 9)]
# Arithmetic
c = Counter(a=3, b=1)
d = Counter(a=1, b=2)
print(c + d)  # Counter({'a': 4, 'b': 3})
print(c - d)  # Counter({'a': 2}) (non-positive results are dropped)
print(c & d)  # Counter({'a': 1, 'b': 1}) (intersection: min)
print(c | d)  # Counter({'a': 3, 'b': 2}) (union: max)
# In-place update and subtract
c.update(d)  # similar to c += d
print(c)  # Counter({'a': 4, 'b': 3})
c.subtract(d)  # like c -= d, except that zero and negative counts are kept
print(c)  # Counter({'a': 3, 'b': 1})
OrderedDict
有序字典,记住键的插入顺序(Python 3.7+ 普通dict也保持顺序,但OrderedDict有额外功能)。
from collections import OrderedDict
d = OrderedDict()
d['a'] = 1
d['b'] = 2
d['c'] = 3
print(d)  # OrderedDict([('a', 1), ('b', 2), ('c', 3)])

# Iteration follows insertion order
for k, v in d.items():
    print(k, v)  # a 1, b 2, c 3

# Move an entry to the end
d.move_to_end('a')
print(d)  # OrderedDict([('b', 2), ('c', 3), ('a', 1)])

# Move an entry to the front ('b' is already first here, so the order is unchanged)
d.move_to_end('b', last=False)
print(d)  # OrderedDict([('b', 2), ('c', 3), ('a', 1)])

# popitem removes and returns an entry
print(d.popitem())  # ('a', 1) (the last entry by default)
print(d.popitem(last=False))  # ('b', 2) (the first entry)

# Re-ordering by key: put the two popped entries back first, otherwise only
# 'c' is left and there is nothing to sort
d.update(b=2, a=1)
d = OrderedDict(sorted(d.items(), key=lambda t: t[0]))  # sort by key
print(d)  # OrderedDict([('a', 1), ('b', 2), ('c', 3)])

# Equality between OrderedDicts is order-sensitive
d1 = OrderedDict([('a', 1), ('b', 2)])
d2 = OrderedDict([('b', 2), ('a', 1)])
print(d1 == d2)  # False (different order)
defaultdict
带有默认工厂的字典,当键不存在时自动创建默认值。
from collections import defaultdict
# A dict whose missing values default to 0
dd = defaultdict(int)
print(dd['a'])  # 0 (the key is created on first access)
dd['b'] += 1  # can be updated directly
print(dd)  # defaultdict(<class 'int'>, {'a': 0, 'b': 1})
# A dict whose missing values default to a list
dd_list = defaultdict(list)
dd_list['colors'].append('red')
dd_list['colors'].append('blue')
print(dd_list)  # defaultdict(<class 'list'>, {'colors': ['red', 'blue']})
# A dict whose missing values default to a set
dd_set = defaultdict(set)
dd_set['tags'].add('python')
dd_set['tags'].add('java')
dd_set['tags'].add('python')  # duplicates are ignored
print(dd_set)  # defaultdict(<class 'set'>, {'tags': {'python', 'java'}}) (set order may vary)
# A lambda can supply a more complex default value
dd_complex = defaultdict(lambda: {'count': 0, 'name': ''})
dd_complex['a']['count'] += 1
dd_complex['a']['name'] = 'Alice'
print(dd_complex)  # defaultdict(<function <lambda> at ...>, {'a': {'count': 1, 'name': 'Alice'}})
# The same with a plain dict (more verbose)
regular_dict = {}
regular_dict.setdefault('a', []).append(1)
regular_dict.setdefault('a', []).append(2)
print(regular_dict)  # {'a': [1, 2]}
2. 高级容器类型
UserDict
字典对象的包装器,方便创建字典子类。
from collections import UserDict
class MyDict(UserDict):
    """Dictionary whose keys are matched case-insensitively.

    Every key is converted with ``str(key).lower()`` before it is stored,
    looked up, tested for membership or deleted, so ``d['Name']``,
    ``d['NAME']`` and ``d['name']`` all refer to the same entry.
    Looking up a missing key does not raise; it returns a message string.
    """

    @staticmethod
    def _normalize(key):
        """Return the canonical (lower-cased string) form of *key*."""
        return str(key).lower()

    def __missing__(self, key):
        # Called by UserDict.__getitem__ with the already-normalised key.
        return f'Key {key} not found'

    def __contains__(self, key):
        return self._normalize(key) in self.data

    def __getitem__(self, key):
        # Normalise before delegating so lookups match what __setitem__ stored.
        return super().__getitem__(self._normalize(key))

    def __setitem__(self, key, value):
        super().__setitem__(self._normalize(key), value)

    def __delitem__(self, key):
        super().__delitem__(self._normalize(key))


d = MyDict({'Name': 'Alice', 'Age': 30})
print(d['name'])  # 'Alice' (case-insensitive)
print(d['EMAIL'])  # 'Key email not found' (__missing__)
print('AGE' in d)  # True (case-insensitive)
d['EMAIL'] = 'alice@example.com'
print(d.data)  # {'name': 'Alice', 'age': 30, 'email': 'alice@example.com'}
UserList
列表对象的包装器,方便创建列表子类。
from collections import UserList
class SortedList(UserList):
    """List that keeps its items in ascending order.

    ``append``, ``extend``, ``insert``, ``+=`` and item assignment all
    re-establish the ordering, so the contents of ``self.data`` stay sorted
    after each of those operations.
    """

    def __init__(self, iterable=None):
        # Compare against None rather than relying on truthiness, so any
        # iterable (including ones that do not define a boolean value) works.
        super().__init__(sorted(iterable) if iterable is not None else [])

    def append(self, item):
        super().append(item)
        self.data.sort()

    def extend(self, items):
        super().extend(items)
        self.data.sort()

    def insert(self, i, item):
        # The requested position is ignored: the item goes wherever the
        # ordering puts it.
        self.append(item)

    def __iadd__(self, other):
        self.extend(other)
        return self

    def __setitem__(self, i, item):
        super().__setitem__(i, item)
        self.data.sort()


lst = SortedList([3, 1, 2])
print(lst)  # [1, 2, 3]
lst.append(0)
print(lst)  # [0, 1, 2, 3]
lst.extend([5, 4])
print(lst)  # [0, 1, 2, 3, 4, 5]
UserString
字符串对象的包装器,方便创建字符串子类。
from collections import UserString
class MyString(UserString):
    """String wrapper with two extra convenience methods."""

    def reverse(self):
        """Return the wrapped text reversed, as a plain ``str``."""
        return self.data[::-1]

    def remove_punctuation(self):
        """Return the wrapped text with ASCII punctuation removed, as a plain ``str``."""
        import string

        # A deletion table lets str.translate strip every punctuation
        # character in one pass instead of testing characters one by one.
        return self.data.translate(str.maketrans('', '', string.punctuation))


s = MyString("Hello, World!")
print(s.reverse())  # "!dlroW ,olleH"
print(s.remove_punctuation())  # "Hello World"
3. 抽象基类
collections.abc 模块提供了许多抽象基类,用于测试一个类是否提供了特定的接口。
Collection
from collections.abc import Collection
class MyCollection:
    """Container that supports iteration, ``len()`` and ``in``.

    It does not inherit from ``Collection``; defining ``__iter__``,
    ``__len__`` and ``__contains__`` is enough for the ABC to recognise it.
    """

    def __init__(self, data):
        self._data = list(data)

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)

    def __contains__(self, item):
        return item in self._data


# Check whether the Collection interface is implemented
print(issubclass(MyCollection, Collection))  # True
Sequence
from collections.abc import Sequence
class MySequence(Sequence):
    """Read-only sequence backed by a list.

    Only ``__getitem__`` and ``__len__`` are written here; ``Sequence``
    supplies ``in``, iteration, ``reversed()``, ``index()`` and ``count()``.
    """

    def __init__(self, data):
        self._data = list(data)

    def __getitem__(self, index):
        return self._data[index]

    def __len__(self):
        return len(self._data)


seq = MySequence([1, 2, 3])
print(seq[1])  # 2
print(len(seq))  # 3
print(3 in seq)  # True
MutableSequence
from collections.abc import MutableSequence
class MyMutableList(MutableSequence):
    """Mutable sequence backed by a list.

    The five methods required by ``MutableSequence`` are implemented here;
    ``append``, ``extend``, ``pop``, ``remove``, ``reverse`` and ``+=`` come
    from the ABC.
    """

    def __init__(self, data=None):
        self._data = list(data) if data else []

    def __getitem__(self, index):
        return self._data[index]

    def __setitem__(self, index, value):
        self._data[index] = value

    def __delitem__(self, index):
        del self._data[index]

    def __len__(self):
        return len(self._data)

    def insert(self, index, value):
        self._data.insert(index, value)

    def __repr__(self):
        # MutableSequence provides no repr; without this, print() would show
        # the default "<... object at 0x...>" form instead of the contents.
        return repr(self._data)


lst = MyMutableList([1, 2, 3])
lst.append(4)
lst[1] = 20
del lst[0]
print(lst)  # [20, 3, 4]
Mapping
from collections.abc import Mapping
class MyMapping(Mapping):
    """Read-only mapping backed by a dict.

    ``Mapping`` derives ``in``, ``keys()``, ``items()``, ``values()``,
    ``get()`` and equality from the three methods defined here.
    """

    def __init__(self, data):
        self._data = dict(data)

    def __getitem__(self, key):
        return self._data[key]

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)


m = MyMapping({'a': 1, 'b': 2})
print(m['a'])  # 1
print(len(m))  # 2
print('b' in m)  # True
MutableMapping
from collections.abc import MutableMapping
class MyMutableDict(MutableMapping):
    """Mutable mapping backed by a dict.

    ``MutableMapping`` adds ``pop``, ``popitem``, ``clear``, ``update`` and
    ``setdefault`` on top of the five methods defined here.
    """

    def __init__(self, data=None):
        self._data = dict(data) if data else {}

    def __getitem__(self, key):
        return self._data[key]

    def __setitem__(self, key, value):
        self._data[key] = value

    def __delitem__(self, key):
        del self._data[key]

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)


d = MyMutableDict({'a': 1})
d['b'] = 2
del d['a']
print(dict(d))  # {'b': 2}
Set
from collections.abc import Set
class MySet(Set):
    """Read-only set backed by a built-in ``set``.

    ``Set`` supplies the comparison operators and ``&``, ``|``, ``-``,
    ``^`` and ``isdisjoint`` from the three methods defined here.
    """

    def __init__(self, iterable):
        self._data = set(iterable)

    def __contains__(self, item):
        return item in self._data

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)


s = MySet([1, 2, 3])
print(2 in s)  # True
print(len(s))  # 3
MutableSet
from collections.abc import MutableSet
class MyMutableSet(MutableSet):
    """Mutable set backed by a built-in ``set``.

    ``MutableSet`` adds ``remove``, ``pop``, ``clear`` and the in-place
    operators on top of ``add`` and ``discard``.
    """

    def __init__(self, iterable=None):
        self._data = set(iterable) if iterable else set()

    def __contains__(self, item):
        return item in self._data

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)

    def add(self, item):
        self._data.add(item)

    def discard(self, item):
        self._data.discard(item)


s = MyMutableSet([1, 2, 3])
s.add(4)
s.discard(2)
print(list(s))  # [1, 3, 4]
Iterable
Iterable 是最基础的迭代协议,表示可迭代对象。
from collections.abc import Iterable
class MyIterable:
    """Minimal iterable: defining ``__iter__`` is all that is needed."""

    def __iter__(self):
        return iter([1, 2, 3])


# Check whether objects are Iterable
print(isinstance(MyIterable(), Iterable))  # True
print(isinstance([1, 2, 3], Iterable))  # True
print(isinstance("abc", Iterable))  # True
print(isinstance(123, Iterable))  # False
# A custom Iterable class
class CountDown(Iterable):
    """Iterable that counts down from ``start`` to 1."""

    def __init__(self, start):
        self.start = start

    def __iter__(self):
        # Written as a generator, so each iteration starts a fresh countdown.
        n = self.start
        while n > 0:
            yield n
            n -= 1


count_down = CountDown(5)
print(list(count_down))  # [5, 4, 3, 2, 1]
# Alternatively, implement the same thing with an explicit iterator class
class CountDownAlt:
    """Iterable that hands out a new ``CountDownIterator`` for each loop."""

    def __init__(self, start):
        self.start = start

    def __iter__(self):
        return CountDownIterator(self.start)


class CountDownIterator:
    """Iterator that yields ``count``, ``count - 1``, ..., 1 and then stops."""

    def __init__(self, count):
        self.count = count

    def __iter__(self):
        return self

    def __next__(self):
        if self.count <= 0:
            raise StopIteration
        value = self.count
        self.count -= 1
        return value


count_down = CountDownAlt(3)
print(list(count_down))  # [3, 2, 1]
Iterator
Iterator 继承自 Iterable,表示迭代器对象。
from collections.abc import Iterator
# Built-in iterators
numbers = [1, 2, 3]
iter_numbers = iter(numbers)
print(isinstance(iter_numbers, Iterator))  # True
print(isinstance(numbers, Iterator))  # False
# Generators are iterators
gen = (x for x in range(3))
print(isinstance(gen, Iterator))  # True
# A custom Iterator class
class SquareIterator(Iterator):
    """Iterator that yields the square of each input number.

    ``Iterator`` already provides ``__iter__`` (returning ``self``), so only
    ``__next__`` has to be written.
    """

    def __init__(self, numbers):
        self.numbers = iter(numbers)

    def __next__(self):
        num = next(self.numbers)  # StopIteration from here ends the iteration
        return num * num


squares = SquareIterator([1, 2, 3])
print(list(squares))  # [1, 4, 9]
Generator
Generator 继承自 Iterator,表示生成器对象。
from collections.abc import Generator
def simple_gen():
    """Yield 1, 2 and 3 in turn."""
    yield 1
    yield 2
    yield 3


gen = simple_gen()
print(isinstance(gen, Generator))  # True
print(isinstance(gen, Iterator))  # True
print(isinstance(gen, Iterable))  # True
# A custom Generator class
class MyGenerator(Generator):
    """Generator-protocol wrapper around any iterable.

    ``Generator`` requires ``send`` and ``throw``; ``__next__``,
    ``__iter__`` and ``close`` are inherited from the ABC.
    """

    def __init__(self, iterable):
        self.iterable = iterable
        self.iterator = None  # created lazily on the first send()/next()

    def send(self, value):
        """Return the next item; the sent *value* is ignored.

        Raises StopIteration once the wrapped iterable is exhausted.
        """
        if self.iterator is None:
            self.iterator = iter(self.iterable)
        return next(self.iterator)

    def throw(self, typ, val=None, tb=None):
        """Raise the given exception, using the ABC's default behaviour."""
        return super().throw(typ, val, tb)


gen = MyGenerator([1, 2, 3])
print(next(gen))  # 1
print(next(gen))  # 2
print(next(gen))  # 3
4. 应用技巧
基于Counter的词频统计
from collections import Counter
import re
def word_frequency(text, n=10):
    """Return the *n* most frequent words in *text* as ``(word, count)`` pairs.

    The text is lower-cased and split into runs of ``\\w`` characters, so
    punctuation separates words ("high-level" counts as "high" and "level").
    Words with equal counts keep the order in which they first appear.
    """
    words = re.findall(r'\w+', text.lower())
    return Counter(words).most_common(n)


text = """Python is an interpreted, high-level, general-purpose programming language.
Created by Guido van Rossum and first released in 1991, Python's design philosophy
emphasizes code readability with its notable use of significant whitespace."""
print(word_frequency(text))
# [('python', 2), ('is', 1), ('an', 1), ('interpreted', 1), ('high', 1),
# ('level', 1), ('general', 1), ('purpose', 1), ('programming', 1), ('language', 1)]
使用OrderedDict实现LRU缓存
from collections import OrderedDict
class LRUCache:
    """Least-recently-used cache built on ``OrderedDict``.

    Entries are kept in order of use, oldest first; reading or writing a
    key moves it to the end, and eviction removes the entry at the front.
    """

    def __init__(self, capacity):
        self.cache = OrderedDict()
        self.capacity = capacity

    def get(self, key):
        """Return the value for *key*, or -1 if it is not cached."""
        if key not in self.cache:
            return -1
        self.cache.move_to_end(key)  # mark as most recently used
        return self.cache[key]

    def put(self, key, value):
        """Store *value* under *key*, evicting the oldest entry if over capacity."""
        if key in self.cache:
            self.cache.move_to_end(key)
        self.cache[key] = value
        if len(self.cache) > self.capacity:
            self.cache.popitem(last=False)  # drop the least recently used entry


cache = LRUCache(2)
cache.put(1, 1)
cache.put(2, 2)
print(cache.get(1))  # 1
cache.put(3, 3)  # over capacity: 2 is evicted
print(cache.get(2))  # -1 (already evicted)
使用ChainMap实现配置优先级
from collections import ChainMap
defaults = dict(theme='light', language='en', show_help=True)
user_prefs = dict(theme='dark', timezone='UTC')
# Lookups try user_prefs first and fall back to defaults
config = ChainMap(user_prefs, defaults)
# Prints, in order:
#   'dark' (user setting wins)
#   'en'   (falls back to the default)
#   'UTC'  (user setting)
for option in ('theme', 'language', 'timezone'):
    print(config[option])
# Live view: a change to the underlying dict shows up immediately
user_prefs.update(language='fr')
print(config['language'])  # 'fr'
5. 性能比较
deque vs list:
- deque 从两端添加/删除元素的时间复杂度为 O(1)
- list 在开头插入/删除元素的时间复杂度为 O(n)
defaultdict vs dict.setdefault():
- defaultdict 更简洁高效,适合大量使用默认值的场景
- dict.setdefault() 更灵活,适合偶尔使用默认值的场景
Counter
:
- 统计元素频率比手动实现更高效
- 支持快速的数学运算(并集、交集等)
OrderedDict:
- Python 3.7+ 中普通 dict 也保持插入顺序
- OrderedDict 仍然有用,因为它有 move_to_end() 等方法
6. 应用案例
使用namedtuple处理数据库记录
from collections import namedtuple
import sqlite3
# Record layout: one field per selected column, in SELECT order
EmployeeRecord = namedtuple('EmployeeRecord', 'id name title department salary')

# Connect to the database; try/finally closes the connection even if the
# query fails
conn = sqlite3.connect('company.db')
try:
    cursor = conn.cursor()
    # Query and convert each row into a namedtuple
    cursor.execute('SELECT id, name, title, department, salary FROM employees')
    # Iterating the cursor yields rows one at a time, so the whole result
    # set is never held in memory at once
    for emp in map(EmployeeRecord._make, cursor):
        # Fields are read by name, which is clearer than positional indexes
        print(emp.name, emp.title, emp.salary)
finally:
    conn.close()
使用deque实现滑动窗口平均
from collections import deque
import random
def sliding_average(iterable, window_size):
    """Yield the running mean of the last *window_size* items of *iterable*.

    One value is yielded per input item. Until the window has filled up,
    the mean is taken over the items seen so far. Passing ``None`` as
    *window_size* gives an unbounded window (a cumulative mean).

    Raises:
        ValueError: if *window_size* is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A zero-length window would otherwise fail later with an opaque
    # ZeroDivisionError on the first item.
    if window_size is not None and window_size < 1:
        raise ValueError("window_size must be a positive integer")
    window = deque(maxlen=window_size)  # old items fall off the left automatically
    for item in iterable:
        window.append(item)
        yield sum(window) / len(window)


# Simulated temperature readings
temperatures = [random.uniform(20, 25) for _ in range(100)]
averages = list(sliding_average(temperatures, 5))
print("原始数据:", temperatures[:10])
print("滑动平均:", averages[:10])
使用Counter分析日志文件
from collections import Counter
import re
def analyze_logs(logfile):
    """Print the ten most frequent IPv4-looking addresses found in *logfile*.

    Only the first address-like match on each line is counted; lines with
    no match are skipped. The pattern checks for four dot-separated groups
    of 1-3 digits and does not validate that each group is within 0-255.
    """
    ip_pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    with open(logfile) as f:
        # Run the regex once per line and reuse the match object, rather
        # than searching once to filter and again to extract.
        matches = map(ip_pattern.search, f)
        ip_counts = Counter(m.group() for m in matches if m)
    print("Top 10 IP addresses:")
    for ip, count in ip_counts.most_common(10):
        print(f"{ip}: {count}")


# Example usage:
# analyze_logs('access.log')
使用defaultdict构建图结构
from collections import defaultdict
class Graph:
    """Directed graph stored as an adjacency list in a ``defaultdict(list)``."""

    def __init__(self):
        self.graph = defaultdict(list)

    def add_edge(self, u, v):
        """Add a directed edge from *u* to *v*."""
        self.graph[u].append(v)

    def bfs(self, start):
        """Print the vertices reachable from *start* in breadth-first order.

        Vertices are printed on one line, each followed by a space.
        """
        # Local import: the surrounding snippet only imports defaultdict.
        from collections import deque

        visited = {start}
        # A deque gives O(1) removal from the front; list.pop(0) has to
        # shift every remaining element.
        queue = deque([start])
        while queue:
            vertex = queue.popleft()
            print(vertex, end=' ')
            # .get() avoids creating empty adjacency lists for vertices
            # that have no outgoing edges while traversing.
            for neighbor in self.graph.get(vertex, ()):
                if neighbor not in visited:
                    visited.add(neighbor)
                    queue.append(neighbor)


g = Graph()
g.add_edge(0, 1)
g.add_edge(0, 2)
g.add_edge(1, 2)
g.add_edge(2, 0)
g.add_edge(2, 3)
g.add_edge(3, 3)
print("BFS traversal:")
g.bfs(2)  # 2 0 3 1