Python collections模块实用教程

Python collections 模块实用教程

collections 是 Python 标准库中一个强大的模块,提供了许多有用的容器数据类型,可以在某些情况下替代 Python 的内置容器(如 dict, list, set, tuple)。

目录

  1. 基础容器类型
  2. 高级容器类型
  3. 抽象基类
  4. 应用技巧
  5. 性能比较
  6. 应用案例

1. 基础容器类型

namedtuple

创建具有命名字段的元组子类,使代码更易读。

from collections import namedtuple

# 创建一个简单的Point类
Point = namedtuple('Point', ['x', 'y'])
p = Point(11, y=22)  # 可以用位置参数或关键字参数实例化

print(p.x, p.y)      # 11 22
print(p[0], p[1])    # 11 22 (仍然支持索引访问)
print(p._asdict())   # {'x': 11, 'y': 22}

# 创建更复杂的Employee类
Employee = namedtuple('Employee', 'name age department salary', defaults=['IT', 50000])
emp = Employee('Alice', 30)
print(emp)  # Employee(name='Alice', age=30, department='IT', salary=50000)

# _replace 创建新实例
emp_new = emp._replace(age=31, salary=55000)
print(emp_new)  # Employee(name='Alice', age=31, department='IT', salary=55000)

# _fields 查看字段名
print(Employee._fields)  # ('name', 'age', 'department', 'salary')

# _make 从可迭代对象创建
emp2 = Employee._make(['Bob', 25, 'HR', 45000])
print(emp2)  # Employee(name='Bob', age=25, department='HR', salary=45000)

deque

双端队列,支持从两端快速添加和删除元素。

from collections import deque

d = deque('ghi')                 # 用可迭代对象初始化
print(d)                         # deque(['g', 'h', 'i'])

d.append('j')                    # 右侧添加
d.appendleft('f')                # 左侧添加
print(d)                         # deque(['f', 'g', 'h', 'i', 'j'])

print(d.pop())                   # 'j' (右侧移除)
print(d.popleft())               # 'f' (左侧移除)
print(d)                         # deque(['g', 'h', 'i'])

print(d[1])                      # 'h' (支持索引访问)
d[1] = 'x'                       # 支持索引赋值
print(d)                         # deque(['g', 'x', 'i'])

d.extend('jkl')                  # 右侧扩展
d.extendleft('def')              # 左侧扩展(注意顺序)
print(d)                         # deque(['f', 'e', 'd', 'g', 'x', 'i', 'j', 'k', 'l'])

d.rotate(1)                      # 向右旋转1步
print(d)                         # deque(['l', 'f', 'e', 'd', 'g', 'x', 'i', 'j', 'k'])

d.rotate(-2)                     # 向左旋转2步
print(d)                         # deque(['e', 'd', 'g', 'x', 'i', 'j', 'k', 'l', 'f'])

# 限制最大长度
d = deque(maxlen=3)
d.extend('abc')
print(d)                         # deque(['a', 'b', 'c'], maxlen=3)
d.append('d')                    # 添加新元素时,左侧元素会被自动移除
print(d)                         # deque(['b', 'c', 'd'], maxlen=3)

ChainMap

将多个字典或映射链接在一起,形成单个可更新的视图。

from collections import ChainMap

dict1 = {'a': 1, 'b': 2}
dict2 = {'b': 3, 'c': 4}
chain = ChainMap(dict1, dict2)

print(chain['a'])  # 1 (来自dict1)
print(chain['b'])  # 2 (来自dict1,因为dict1在前)
print(chain['c'])  # 4 (来自dict2)

# 更新操作只影响第一个映射
chain['c'] = 5
print(dict2)  # {'b': 3, 'c': 4} (未改变)
print(dict1)  # {'a': 1, 'b': 2, 'c': 5} (新增了'c')

# 添加新映射
dict3 = {'d': 6}
new_chain = chain.new_child(dict3)
print(new_chain['d'])  # 6

# parents属性跳过第一个映射
print(new_chain.parents['b'])  # 2 (来自dict1)

# Merge: convert the chain into a regular dict (a snapshot copy)
merged = dict(chain)  # for duplicate keys the value from the first mapping wins
print(merged)         # {'b': 2, 'c': 5, 'a': 1} (keys are iterated from the last mapping to the first)

# 动态更新
dict1['e'] = 7
print(chain['e'])     # 7 (动态反映变化)

Counter

计数器,用于统计可哈希对象的出现次数。

from collections import Counter

# 从可迭代对象创建
cnt = Counter(['red', 'blue', 'red', 'green', 'blue', 'blue'])
print(cnt)  # Counter({'blue': 3, 'red': 2, 'green': 1})

# 从字典创建
cnt = Counter({'red': 4, 'blue': 2})
print(cnt)  # Counter({'red': 4, 'blue': 2})

# 从关键字参数创建
cnt = Counter(cats=4, dogs=8)
print(cnt)  # Counter({'dogs': 8, 'cats': 4})

# 基本操作
print(cnt['dogs'])       # 8
print(cnt['birds'])      # 0 (不存在的键返回0)

cnt['dogs'] += 1
print(cnt['dogs'])       # 9

del cnt['cats']
print(cnt)               # Counter({'dogs': 9})

# 方法
print(list(cnt.elements()))  # ['dogs', 'dogs', ...] (9次)

print(cnt.most_common(1))    # [('dogs', 9)]
print(cnt.most_common())     # [('dogs', 9)]

# 数学运算
c = Counter(a=3, b=1)
d = Counter(a=1, b=2)
print(c + d)            # Counter({'a': 4, 'b': 3})
print(c - d)            # Counter({'a': 2}) (负值被忽略)
print(c & d)            # Counter({'a': 1, 'b': 1}) (交集: min)
print(c | d)            # Counter({'a': 3, 'b': 2}) (并集: max)

# 更新和减法
c.update(d)             # 类似于c += d
print(c)                # Counter({'a': 4, 'b': 3})

c.subtract(d)           # 类似于c -= d
print(c)                # Counter({'a': 3, 'b': 1})

OrderedDict

有序字典,记住键的插入顺序(Python 3.7+ 普通dict也保持顺序,但OrderedDict有额外功能)。

from collections import OrderedDict

d = OrderedDict()
d['a'] = 1
d['b'] = 2
d['c'] = 3
print(d)  # OrderedDict([('a', 1), ('b', 2), ('c', 3)])

# 保持插入顺序
for k, v in d.items():
    print(k, v)  # a 1, b 2, c 3

# 移动元素到末尾
d.move_to_end('a')
print(d)  # OrderedDict([('b', 2), ('c', 3), ('a', 1)])

# 移动元素到开头
d.move_to_end('b', last=False)
print(d)  # OrderedDict([('b', 2), ('c', 3), ('a', 1)])

# popitem移除并返回元素
print(d.popitem())        # ('a', 1) (默认移除最后一个)
print(d.popitem(last=False))  # ('b', 2) (移除第一个)

# Sort by key. The two popitem() calls above left only ('c', 3) in d,
# so put 'b' and 'a' back first to have something to sort.
d['b'] = 2
d['a'] = 1
d = OrderedDict(sorted(d.items(), key=lambda t: t[0]))
print(d)  # OrderedDict([('a', 1), ('b', 2), ('c', 3)]); Python 3.12+ prints OrderedDict({'a': 1, 'b': 2, 'c': 3})

# 相等性比较
d1 = OrderedDict([('a', 1), ('b', 2)])
d2 = OrderedDict([('b', 2), ('a', 1)])
print(d1 == d2)  # False (顺序不同)

defaultdict

带有默认工厂的字典,当键不存在时自动创建默认值。

from collections import defaultdict

# 默认值为0的字典
dd = defaultdict(int)
print(dd['a'])  # 0 (自动创建)
dd['b'] += 1    # 可以直接操作
print(dd)       # defaultdict(<class 'int'>, {'a': 0, 'b': 1})

# 默认值为列表的字典
dd_list = defaultdict(list)
dd_list['colors'].append('red')
dd_list['colors'].append('blue')
print(dd_list)  # defaultdict(<class 'list'>, {'colors': ['red', 'blue']})

# 默认值为集合的字典
dd_set = defaultdict(set)
dd_set['tags'].add('python')
dd_set['tags'].add('java')
dd_set['tags'].add('python')  # 重复值会被忽略
print(dd_set)  # defaultdict(<class 'set'>, {'tags': {'python', 'java'}})

# 使用lambda指定更复杂的默认值
dd_complex = defaultdict(lambda: {'count': 0, 'name': ''})
dd_complex['a']['count'] += 1
dd_complex['a']['name'] = 'Alice'
print(dd_complex)  # defaultdict(<function <lambda> at ...>, {'a': {'count': 1, 'name': 'Alice'}})

# 普通字典实现类似功能(更冗长)
regular_dict = {}
regular_dict.setdefault('a', []).append(1)
regular_dict.setdefault('a', []).append(2)
print(regular_dict)  # {'a': [1, 2]}

2. 高级容器类型

UserDict

字典对象的包装器,方便创建字典子类。

from collections import UserDict

class MyDict(UserDict):
    """Dictionary whose keys are stored and looked up case-insensitively.

    Every key is converted with ``str(key).lower()`` before it reaches the
    underlying ``self.data`` dict, so ``d['Name']``, ``d['NAME']`` and
    ``'name' in d`` all refer to the same entry. Looking up an absent key
    returns a "not found" message instead of raising ``KeyError``.
    """

    @staticmethod
    def _normalize(key):
        """Return the canonical (lower-cased string) form of *key*."""
        return str(key).lower()

    def __missing__(self, key):
        # Called by UserDict.__getitem__ for absent keys; *key* is already
        # normalized because __getitem__ below normalizes before delegating.
        return f'Key {key} not found'

    def __contains__(self, key):
        return self._normalize(key) in self.data

    def __getitem__(self, key):
        return super().__getitem__(self._normalize(key))

    def __setitem__(self, key, value):
        super().__setitem__(self._normalize(key), value)

    def __delitem__(self, key):
        super().__delitem__(self._normalize(key))

d = MyDict({'Name': 'Alice', 'Age': 30})
print(d['name'])      # 'Alice' (不区分大小写)
print(d['EMAIL'])     # 'Key email not found' (__missing__)
print('AGE' in d)     # True (不区分大小写)

d['EMAIL'] = 'alice@example.com'
print(d.data)         # {'name': 'Alice', 'age': 30, 'email': 'alice@example.com'}

UserList

列表对象的包装器,方便创建列表子类。

from collections import UserList

class SortedList(UserList):
    """List wrapper that keeps its elements in ascending order.

    The contents are sorted on construction and re-sorted after every
    ``append`` or ``extend``.
    """

    def __init__(self, iterable=None):
        # A falsy argument (None or an empty container) gives an empty list
        initial = sorted(iterable or ())
        super().__init__(initial)

    def append(self, item):
        """Add *item*, then restore sorted order."""
        self.data.append(item)
        self.data.sort()

    def extend(self, items):
        """Add every element of *items*, then restore sorted order."""
        super().extend(items)
        self.data.sort()

lst = SortedList([3, 1, 2])
print(lst)            # [1, 2, 3]
lst.append(0)
print(lst)            # [0, 1, 2, 3]
lst.extend([5, 4])
print(lst)            # [0, 1, 2, 3, 4, 5]

UserString

字符串对象的包装器,方便创建字符串子类。

from collections import UserString

class MyString(UserString):
    """String wrapper with two helpers that return plain ``str`` results."""

    def reverse(self):
        """Return the wrapped text with its characters in reverse order."""
        return ''.join(reversed(self.data))

    def remove_punctuation(self):
        """Return the wrapped text without ASCII punctuation characters."""
        import string
        unwanted = set(string.punctuation)
        kept = [ch for ch in self.data if ch not in unwanted]
        return ''.join(kept)

s = MyString("Hello, World!")
print(s.reverse())            # "!dlroW ,olleH"
print(s.remove_punctuation()) # "Hello World"

3. 抽象基类

collections.abc 模块提供了许多抽象基类,用于测试一个类是否提供了特定的接口。

Collection

from collections.abc import Collection

class MyCollection:
    """Container that supports sizing, iteration and membership tests.

    It does not inherit from ``Collection``; defining ``__len__``,
    ``__iter__`` and ``__contains__`` is enough for the ABC's subclass
    check to recognise it.
    """

    def __init__(self, data):
        # Copy the input so later changes to *data* do not affect this object
        self._data = [*data]

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        return iter(self._data)

    def __contains__(self, item):
        return item in self._data

# 检查是否实现了Collection接口
print(issubclass(MyCollection, Collection))  # True

Sequence

from collections.abc import Sequence

class MySequence(Sequence):
    """Read-only sequence backed by a private list.

    Only ``__getitem__`` and ``__len__`` are written here; ``Sequence``
    supplies ``__contains__``, ``__iter__``, ``__reversed__``, ``index``
    and ``count`` on top of them.
    """

    def __init__(self, data):
        self._data = [*data]

    def __len__(self):
        return len(self._data)

    def __getitem__(self, index):
        return self._data[index]

seq = MySequence([1, 2, 3])
print(seq[1])        # 2
print(len(seq))      # 3
print(3 in seq)      # True

MutableSequence

from collections.abc import MutableSequence

class MyMutableList(MutableSequence):
    """Mutable sequence backed by a private list.

    The five abstract methods are implemented here; ``MutableSequence``
    supplies ``append``, ``extend``, ``pop``, ``remove``, ``reverse`` and
    ``+=`` on top of them.
    """

    def __init__(self, data=None):
        # A falsy argument (None or an empty container) gives an empty list
        self._data = list(data) if data else []

    def __repr__(self):
        # Show the contents like a plain list so that print() is readable
        # instead of falling back to the default "<... object at 0x...>".
        return repr(self._data)

    def __getitem__(self, index):
        return self._data[index]

    def __setitem__(self, index, value):
        self._data[index] = value

    def __delitem__(self, index):
        del self._data[index]

    def __len__(self):
        return len(self._data)

    def insert(self, index, value):
        """Insert *value* before position *index*."""
        self._data.insert(index, value)

lst = MyMutableList([1, 2, 3])
lst.append(4)
lst[1] = 20
del lst[0]
print(lst)  # [20, 3, 4]

Mapping

from collections.abc import Mapping

class MyMapping(Mapping):
    """Read-only mapping backed by a private dict.

    ``Mapping`` derives ``__contains__``, ``keys``, ``items``, ``values``,
    ``get`` and equality from the three methods defined here.
    """

    def __init__(self, data):
        # Copy the input so the mapping is independent of the caller's object
        self._data = dict(data)

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        return iter(self._data)

    def __getitem__(self, key):
        return self._data[key]

m = MyMapping({'a': 1, 'b': 2})
print(m['a'])        # 1
print(len(m))        # 2
print('b' in m)      # True

MutableMapping

from collections.abc import MutableMapping

class MyMutableDict(MutableMapping):
    """Mutable mapping backed by a private dict.

    ``MutableMapping`` derives ``pop``, ``popitem``, ``clear``, ``update``
    and ``setdefault`` from the five methods defined here.
    """

    def __init__(self, data=None):
        # A falsy argument (None or an empty container) gives an empty dict
        self._data = dict(data or {})

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        return iter(self._data)

    def __getitem__(self, key):
        return self._data[key]

    def __setitem__(self, key, value):
        self._data[key] = value

    def __delitem__(self, key):
        del self._data[key]

d = MyMutableDict({'a': 1})
d['b'] = 2
del d['a']
print(dict(d))  # {'b': 2}

Set

from collections.abc import Set

class MySet(Set):
    """Read-only set backed by a private built-in set.

    ``Set`` derives the comparison and set-algebra operators from the
    three methods defined here.
    """

    def __init__(self, iterable):
        self._data = {*iterable}

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        return iter(self._data)

    def __contains__(self, item):
        return item in self._data

s = MySet([1, 2, 3])
print(2 in s)    # True
print(len(s))    # 3

MutableSet

from collections.abc import MutableSet

class MyMutableSet(MutableSet):
    """Mutable set backed by a private built-in set.

    ``MutableSet`` derives ``remove``, ``pop``, ``clear`` and the in-place
    operators from the five methods defined here.
    """

    def __init__(self, iterable=None):
        # A falsy argument (None or an empty container) gives an empty set
        self._data = set(iterable or ())

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        return iter(self._data)

    def __contains__(self, item):
        return item in self._data

    def add(self, item):
        """Insert *item*; no effect if it is already present."""
        self._data.add(item)

    def discard(self, item):
        """Remove *item* if present; no error if it is absent."""
        self._data.discard(item)

s = MyMutableSet([1, 2, 3])
s.add(4)
s.discard(2)
print(list(s))  # [1, 3, 4]

Iterable

Iterable 是最基础的迭代协议,表示可迭代对象。

from collections.abc import Iterable

class MyIterable:
    """Smallest possible iterable: each iteration yields 1, 2, 3."""

    def __iter__(self):
        # Build a fresh list iterator on every call so the object can be
        # iterated more than once
        numbers = [1, 2, 3]
        return iter(numbers)

# 检查是否是Iterable
print(isinstance(MyIterable(), Iterable))  # True
print(isinstance([1, 2, 3], Iterable))    # True
print(isinstance("abc", Iterable))         # True
print(isinstance(123, Iterable))           # False


# 自定义 Iterable 类
class CountDown(Iterable):
    """Iterable that counts down from ``start`` to 1."""

    def __init__(self, start):
        self.start = start

    def __iter__(self):
        # Generator method: every call starts a fresh countdown
        current = self.start
        while True:
            if not current > 0:
                return
            yield current
            current -= 1

count_down = CountDown(5)
print(list(count_down))  # [5, 4, 3, 2, 1]

# 或者使用迭代器方式实现
class CountDownAlt:
    """Countdown iterable that hands iteration to a separate iterator class."""

    def __init__(self, start):
        self.start = start

    def __iter__(self):
        # Each call builds a new iterator seeded with the starting value
        iterator = CountDownIterator(self.start)
        return iterator

class CountDownIterator:
    """Iterator that yields ``count``, ``count - 1``, ... down to 1."""

    def __init__(self, count):
        self.count = count

    def __iter__(self):
        # An iterator is its own iterable
        return self

    def __next__(self):
        current = self.count
        if current <= 0:
            raise StopIteration
        self.count = current - 1
        return current

count_down = CountDownAlt(3)
print(list(count_down))  # [3, 2, 1]

Iterator

Iterator 继承自 Iterable,表示迭代器对象。

from collections.abc import Iterator

# 内置迭代器
numbers = [1, 2, 3]
iter_numbers = iter(numbers)
print(isinstance(iter_numbers, Iterator))  # True
print(isinstance(numbers, Iterator))       # False

# 生成器是迭代器
gen = (x for x in range(3))
print(isinstance(gen, Iterator))           # True


# 自定义 Iterator 类
class SquareIterator(Iterator):
    """Iterator that yields the square of each element of ``numbers``.

    ``__iter__`` is inherited from ``Iterator`` and returns ``self``, so
    only ``__next__`` is defined here.
    """

    def __init__(self, numbers):
        self.numbers = iter(numbers)

    def __next__(self):
        # StopIteration from the underlying iterator propagates and ends
        # this iterator as well
        value = next(self.numbers)
        return value * value

squares = SquareIterator([1, 2, 3])
print(list(squares))  # [1, 4, 9]

Generator

Generator 继承自 Iterator,表示生成器对象。

from collections.abc import Generator

def simple_gen():
    """Yield the integers 1, 2 and 3 in order."""
    for value in (1, 2, 3):
        yield value

gen = simple_gen()
print(isinstance(gen, Generator))  # True
print(isinstance(gen, Iterator))   # True
print(isinstance(gen, Iterable))   # True

# 自定义Generator类
class MyGenerator(Generator):
    """Generator-protocol wrapper around an arbitrary iterable.

    ``send`` returns the next element of the wrapped iterable and ignores
    the value sent in. ``throw`` and ``close`` defer to the ``Generator``
    base-class implementations.
    """

    def __init__(self, iterable):
        self.iterable = iterable
        self.iterator = None  # created lazily by the first send()/next()

    def send(self, value):
        """Return the next element; raise StopIteration when exhausted."""
        starting = self.iterator is None
        if starting:
            self.iterator = iter(self.iterable)
        try:
            return next(self.iterator)
        except StopIteration:
            if starting:
                # First call: let the original exception propagate unchanged
                raise
            raise StopIteration from None

    def throw(self, typ, val=None, tb=None):
        """Raise the given exception via the base-class implementation."""
        super().throw(typ, val, tb)

    def close(self):
        """Close via the base-class implementation."""
        super().close()

gen = MyGenerator([1, 2, 3])
print(next(gen))  # 1
print(next(gen))  # 2
print(next(gen))  # 3

4. 应用技巧

基于Counter的词频统计

from collections import Counter
import re

def word_frequency(text, n=10):
    """Return the *n* most common words in *text* as ``(word, count)`` pairs.

    The text is lower-cased first and a word is any run of ``\\w``
    characters, so punctuation and hyphens act as separators.
    """
    lowered = text.lower()
    tokens = re.findall(r'\w+', lowered)
    counts = Counter(tokens)
    return counts.most_common(n)

text = """Python is an interpreted, high-level, general-purpose programming language. 
Created by Guido van Rossum and first released in 1991, Python's design philosophy 
emphasizes code readability with its notable use of significant whitespace."""

print(word_frequency(text))
# [('python', 2), ('is', 1), ('an', 1), ('interpreted', 1), ('high', 1), 
#  ('level', 1), ('general', 1), ('purpose', 1), ('programming', 1), ('language', 1)]

使用OrderedDict实现LRU缓存

from collections import OrderedDict

class LRUCache:
    """Least-recently-used cache with a fixed capacity.

    Entries live in an ``OrderedDict`` whose order runs from least to most
    recently used: the front is the next entry to evict.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.cache = OrderedDict()

    def get(self, key):
        """Return the value for *key* and mark it most recent; -1 if absent."""
        try:
            self.cache.move_to_end(key)
        except KeyError:
            return -1
        return self.cache[key]

    def put(self, key, value):
        """Store *value* under *key*, evicting the oldest entry if over capacity."""
        self.cache[key] = value
        # No-op for a new key (already last); refreshes an existing one
        self.cache.move_to_end(key)
        if len(self.cache) > self.capacity:
            self.cache.popitem(last=False)

cache = LRUCache(2)
cache.put(1, 1)
cache.put(2, 2)
print(cache.get(1))    # 1
cache.put(3, 3)        # 超出容量,移除2
print(cache.get(2))    # -1 (已被移除)

使用ChainMap实现配置优先级

from collections import ChainMap

defaults = {'theme': 'light', 'language': 'en', 'show_help': True}
user_prefs = {'theme': 'dark', 'timezone': 'UTC'}

config = ChainMap(user_prefs, defaults)

print(config['theme'])       # 'dark' (用户设置优先)
print(config['language'])    # 'en' (使用默认值)
print(config['timezone'])    # 'UTC' (用户设置)

# 动态更新
user_prefs['language'] = 'fr'
print(config['language'])    # 'fr' (动态反映变化)

5. 性能比较

deque vs list:

  • deque 从两端添加/删除元素的时间复杂度为 O(1)
  • list 在开头插入/删除元素的时间复杂度为 O(n)

defaultdict vs dict.setdefault():

  • defaultdict 更简洁高效,适合大量使用默认值的场景
  • dict.setdefault() 更灵活,适合偶尔使用默认值的场景

Counter:

  • 统计元素频率比手动实现更高效
  • 支持快速的数学运算(并集、交集等)

OrderedDict:

  • Python 3.7+ 中普通 dict 也保持插入顺序
  • OrderedDict 仍然有用,因为它有 move_to_end() 等方法

6. 应用案例

使用namedtuple处理数据库记录

from collections import namedtuple
import sqlite3

# Define the record layout
EmployeeRecord = namedtuple('EmployeeRecord', 'id name title department salary')

# Connect to the database
conn = sqlite3.connect('company.db')
try:
    cursor = conn.cursor()

    # Query and convert each row to a namedtuple
    cursor.execute('SELECT id, name, title, department, salary FROM employees')
    # Iterate the cursor directly so rows are streamed instead of being
    # loaded into one list by fetchall()
    for emp in map(EmployeeRecord._make, cursor):
        print(emp.name, emp.title, emp.salary)
        # Fields are read like attributes, which is clearer than dict lookups
finally:
    # Always release the connection, even if the query fails
    conn.close()

使用deque实现滑动窗口平均

from collections import deque
import random

def sliding_average(iterable, window_size):
    """Yield the running mean of the last *window_size* items of *iterable*.

    One average is produced per input item. Until the window has filled,
    the mean is taken over the items seen so far.
    """
    recent = deque(maxlen=window_size)  # old items drop off the left automatically
    for value in iterable:
        recent.append(value)
        total = sum(recent)
        yield total / len(recent)

# 模拟温度数据
temperatures = [random.uniform(20, 25) for _ in range(100)]
averages = list(sliding_average(temperatures, 5))

print("原始数据:", temperatures[:10])
print("滑动平均:", averages[:10])

使用Counter分析日志文件

from collections import Counter
import re

def analyze_logs(logfile):
    """Print the ten most frequent IPv4-looking addresses in *logfile*.

    Only the first address-like token on each line is counted; lines
    without one are skipped. The pattern matches four dot-separated groups
    of one to three digits and does not check that each group is <= 255.

    Args:
        logfile: Path of the text file to scan.
    """
    ip_pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    with open(logfile) as f:
        # Search each line exactly once. map() is lazy, so the file is
        # still read line by line while the Counter consumes the matches.
        matches = map(ip_pattern.search, f)
        ip_counts = Counter(match.group() for match in matches if match)

    print("Top 10 IP addresses:")
    for ip, count in ip_counts.most_common(10):
        print(f"{ip}: {count}")

# analyze_logs('access.log')

使用defaultdict构建图结构

from collections import defaultdict, deque

class Graph:
    """Directed graph stored as an adjacency list (vertex -> list of successors)."""

    def __init__(self):
        # defaultdict lets add_edge append without first checking for the key
        self.graph = defaultdict(list)

    def add_edge(self, u, v):
        """Add a directed edge from *u* to *v*."""
        self.graph[u].append(v)

    def bfs(self, start):
        """Print the vertices reachable from *start* in breadth-first order.

        Vertices are printed on one line, each followed by a space.
        """
        visited = {start}
        # deque.popleft() is O(1); list.pop(0) shifts every remaining element
        queue = deque([start])

        while queue:
            vertex = queue.popleft()
            print(vertex, end=' ')

            # .get() keeps the traversal read-only: indexing the defaultdict
            # would insert an empty list for every vertex without out-edges
            for neighbor in self.graph.get(vertex, ()):
                if neighbor not in visited:
                    visited.add(neighbor)
                    queue.append(neighbor)

g = Graph()
g.add_edge(0, 1)
g.add_edge(0, 2)
g.add_edge(1, 2)
g.add_edge(2, 0)
g.add_edge(2, 3)
g.add_edge(3, 3)

print("BFS traversal:")
g.bfs(2)  # 2 0 3 1

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值