1 min read209 words

collections 与高效数据操作

标准库 collections 提供了一组高性能容器——用对数据结构，性能翻倍。

collections 家族

graph TD COLL[collections 模块] --> COUNTER[Counter] COLL --> DDICT[defaultdict] COLL --> ODICT[OrderedDict] COLL --> DEQUE[deque] COLL --> NAMED[namedtuple] COLL --> CHAIN[ChainMap] COUNTER --> C1[计数统计] DDICT --> D1[自动初始化] DEQUE --> DQ1[双端队列 O1] NAMED --> N1[不可变记录] CHAIN --> CH1[多字典合并] style COLL fill:#e3f2fd,stroke:#1565c0,stroke-width:2px style COUNTER fill:#c8e6c9,stroke:#388e3c,stroke-width:2px style DEQUE fill:#fff3e0,stroke:#f57c00,stroke-width:2px

Counter 计数器

"""
Counter：高效计数就靠它
"""
from collections import Counter
# 基本计数
words = "the quick brown fox jumps over the lazy dog the fox".split()
count = Counter(words)
print(count.most_common(3))
# [('the', 3), ('fox', 2), ('quick', 1)]
# 数学运算
c1 = Counter(a=3, b=1)
c2 = Counter(a=1, b=2)
print(c1 + c2)  # Counter({'a': 4, 'b': 3})
print(c1 - c2)  # Counter({'a': 2})  自动去掉 <=0
# 实战：日志分析
log_levels = ["INFO", "ERROR", "INFO", "WARN", "ERROR", "INFO", "DEBUG"]
stats = Counter(log_levels)
print(f"错误数: {stats['ERROR']}")  # 2
print(f"总日志: {stats.total()}")   # 7 (Python 3.10+)

defaultdict 自动初始化

"""
defaultdict：再也不用 if key not in dict
"""
from collections import defaultdict
# ❌ 传统写法
groups = {}
for name, dept in [("Alice", "eng"), ("Bob", "eng"), ("Carol", "sales")]:
if dept not in groups:
groups[dept] = []
groups[dept].append(name)
# ✅ defaultdict
groups = defaultdict(list)
for name, dept in [("Alice", "eng"), ("Bob", "eng"), ("Carol", "sales")]:
groups[dept].append(name)
print(dict(groups))
# {'eng': ['Alice', 'Bob'], 'sales': ['Carol']}
# 嵌套 defaultdict（多层自动初始化）
tree = lambda: defaultdict(tree)
taxonomy = tree()
taxonomy["动物"]["哺乳类"]["猫科"] = ["猫", "虎", "豹"]
taxonomy["动物"]["鸟类"]["鹰科"] = ["鹰", "鹫"]
# defaultdict(int) 做计数器
char_count = defaultdict(int)
for ch in "hello world":
char_count[ch] += 1
print(dict(char_count))

deque 双端队列

"""
deque：两端操作都是 O(1)
"""
from collections import deque
# 列表 vs deque 性能对比
# list.insert(0, x)  → O(n) 慢!
# deque.appendleft(x) → O(1) 快!
# 基本用法
dq = deque([1, 2, 3])
dq.appendleft(0)     # [0, 1, 2, 3]
dq.append(4)          # [0, 1, 2, 3, 4]
dq.popleft()          # 0, deque = [1, 2, 3, 4]
dq.rotate(1)          # [4, 1, 2, 3] 右旋
# 固定长度滑动窗口
window = deque(maxlen=3)
for i in range(5):
window.append(i)
print(list(window))
# [0]
# [0, 1]
# [0, 1, 2]
# [1, 2, 3]  ← 自动弹出最老的
# [2, 3, 4]
# 实战：最近 N 条日志
recent_logs = deque(maxlen=100)
def log(message: str):
from datetime import datetime
recent_logs.append(f"[{datetime.now():%H:%M:%S}] {message}")
log("服务启动")
log("收到请求")
print(list(recent_logs))

性能对比

操作	list	deque	dict	set
头部插入	O(n)	O(1)	-	-
尾部插入	O(1)	O(1)	O(1)	O(1)
随机访问	O(1)	O(n)	O(1)	-
查找	O(n)	O(n)	O(1)	O(1)
删除	O(n)	O(n)	O(1)	O(1)

"""
选择合适的数据结构
"""
# namedtuple：轻量不可变记录
from collections import namedtuple
Point = namedtuple("Point", ["x", "y"])
p = Point(3, 4)
print(p.x, p.y)          # 3 4
print(p._asdict())        # {'x': 3, 'y': 4}
# 比 dict 省内存，比 tuple 更可读
from sys import getsizeof
d = {"x": 3, "y": 4}
t = (3, 4)
n = Point(3, 4)
print(f"dict: {getsizeof(d)} bytes")       # ~232 bytes
print(f"tuple: {getsizeof(t)} bytes")      # ~56 bytes
print(f"namedtuple: {getsizeof(n)} bytes") # ~56 bytes (同 tuple)
# ChainMap：多字典优先级合并
from collections import ChainMap
defaults = {"color": "blue", "size": "medium", "debug": False}
user_prefs = {"color": "red"}
env_config = {"debug": True}
config = ChainMap(env_config, user_prefs, defaults)
print(config["color"])   # red (user_prefs 优先)
print(config["debug"])   # True (env_config 优先)
print(config["size"])    # medium (defaults 兜底)

本章小结

知识点	要点
Counter	计数统计、最常见 N 个
defaultdict	自动初始化、嵌套结构
deque	双端 O(1)、滑动窗口
namedtuple	轻量不可变记录、省内存
ChainMap	多字典优先级合并

下一章：面向对象编程——类是 Python 的核心，掌握 OOP 才能写出可维护的大型项目。