第 7 讲：Python 生成器与协程 | yield、yield from、生成器表达式与协程原理

大家好，我是正在实战各种 AI 项目的程序员晚枫。

处理 100 万条数据，内存只占用几 KB？构建数据处理管道，代码像流水线一样优雅？生成器让这一切成为可能！

🔄 生成器函数：yield 的本质

普通函数 vs 生成器函数

import sys

# 普通函数：返回列表，一次性占用全部内存
def squares_list(n: int) -> list[int]:
    """Return the squares of 0..n-1 as a fully materialized list.

    Every element is computed before the function returns, so the whole
    result is held in memory at once.
    """
    return [base ** 2 for base in range(n)]

# 生成器函数：惰性求值，按需计算
def squares_gen(n: int):
    """Lazily yield the squares of 0..n-1, computing each one only on demand."""
    # Execution pauses after every produced value until the consumer asks for the next one.
    yield from (base ** 2 for base in range(n))

N = 1_000_000  # one million elements, large enough to make the size gap obvious

lst = squares_list(N)
gen = squares_gen(N)

# sys.getsizeof is shallow: it measures the container object itself, not the ints it refers to.
print(f"列表内存：{sys.getsizeof(lst):,} bytes")  # roughly 8 MB; the exact figure varies with CPython version and list over-allocation
print(f"生成器内存：{sys.getsizeof(gen)} bytes")   # on the order of 100-200 bytes, depending on the Python version

# Both are consumed the same way: with a plain for loop
for x in squares_gen(10):
    print(x, end=" ")  # 0 1 4 9 16 25 36 49 64 81

yield 的执行流程

def step_by_step():
    """Print a step label before each yielded value to make the pause/resume flow visible."""
    for label, produced in (("Step 1", 1), ("Step 2", 2)):
        print(label)
        yield produced  # suspend here and hand the value to the caller
    print("Step 3")
    # Falling off the end of the body makes the next() call raise StopIteration.

gen = step_by_step()  # calling the function only creates the generator; no body code runs yet

value = next(gen)
print(f"得到：{value}")   # prints "Step 1", then "得到：1"

value = next(gen)
print(f"得到：{value}")   # prints "Step 2", then "得到：2"

try:
    next(gen)             # prints "Step 3", then the body ends and StopIteration is raised
except StopIteration:
    print("生成器耗尽")

生成器表达式

import sys

# List comprehension: every value is computed immediately
lst = [x ** 2 for x in range(1_000_000)]

# Generator expression: same syntax, but with parentheses instead of brackets
gen = (x ** 2 for x in range(1_000_000))

print(sys.getsizeof(lst))  # ~8 MB (shallow size of the list object only)
print(sys.getsizeof(gen))  # ~100-200 bytes, depending on the Python version

# A generator expression can be passed straight to sum, max, etc. (no extra parentheses needed)
total = sum(x ** 2 for x in range(1000))
max_val = max(x for x in [3, 1, 4, 1, 5, 9])

📡 yield from：子生成器委托

基础用法

def sub_generator():
    """Yield 1, 2 and 3 in order."""
    for number in (1, 2, 3):
        yield number

def main_generator():
    """Yield a start marker, every delegated value, then an end marker."""
    yield "start"
    yield from sub_generator()   # delegate to a sub-generator
    yield from range(10, 13)     # any iterable can be delegated to as well
    yield "end"

print(list(main_generator()))
# ['start', 1, 2, 3, 10, 11, 12, 'end']

嵌套结构扁平化

from typing import Any, Iterator

def flatten(nested: Any) -> Iterator:
    """Recursively flatten lists and tuples nested to any depth.

    Only ``list`` and ``tuple`` instances are descended into; every other
    object (strings included) is yielded unchanged as a single leaf.
    """
    if not isinstance(nested, (list, tuple)):
        yield nested
        return
    for element in nested:
        yield from flatten(element)  # recursive delegation

data = [1, [2, 3], [4, [5, [6, 7]]], 8]  # sample input with lists nested four levels deep
print(list(flatten(data)))  # [1, 2, 3, 4, 5, 6, 7, 8]

# 对比不用 yield from 的写法（啰嗦且容易出错）
def flatten_manual(nested):
    """Flatten nested lists/tuples without ``yield from``, re-yielding each inner value by hand."""
    for element in nested:
        if not isinstance(element, (list, tuple)):
            yield element
            continue
        # Explicit inner loop: every value from the recursive call is passed along one by one.
        for inner in flatten_manual(element):
            yield inner

yield 与 send() 的值传递

def accumulator():
    """Keep a running sum of the values passed in through ``send()``.

    Each ``yield`` hands the current sum to the caller and then evaluates to
    whatever the caller sends next; receiving ``None`` ends the generator.
    """
    running = 0
    while (received := (yield running)) is not None:
        running += received

gen = accumulator()
next(gen)       # prime the generator: run it up to the first yield

gen.send(10)    # send 10 -> total = 10, returns 10
gen.send(20)    # send 20 -> total = 30, returns 30
result = gen.send(5)  # send 5 -> total = 35, returns 35
print(result)   # 35

🏭 数据处理管道：生成器的杀手级应用

生成器最强的应用场景之一，就是构建惰性数据处理管道：每个阶段都是一个生成器，数据以流的方式逐条经过各个阶段。

import re
from typing import Iterator

# 管道的每个步骤都是生成器
def read_lines(filename: str) -> Iterator[str]:
    """Stream a UTF-8 text file line by line, without loading it all into memory.

    Trailing newline characters are removed from every yielded line.
    """
    with open(filename, encoding='utf-8') as handle:
        yield from (raw.rstrip('\n') for raw in handle)

def filter_empty(lines: Iterator[str]) -> Iterator[str]:
    """Pass through only the lines that contain something other than whitespace."""
    for candidate in lines:
        if not candidate.strip():
            continue
        yield candidate

def strip_comments(lines: Iterator[str]) -> Iterator[str]:
    """Drop every line whose first character is ``#``; all other lines pass through."""
    yield from (text for text in lines if not text.startswith('#'))

def parse_fields(lines: Iterator[str]) -> Iterator[dict]:
    """Parse comma-separated lines into ``{'name', 'age', 'email'}`` records.

    Each line is split on ``,``; the first three fields are stripped and used
    as name, age and email, and any further fields are ignored.

    Rows that cannot become a record are skipped instead of aborting the
    stream: rows with fewer than three fields, and rows whose age field is
    not an integer (for example a ``name,age,email`` header row).

    Args:
        lines: Iterator of raw text lines.

    Yields:
        One dict per valid row, with ``age`` converted to ``int``.
    """
    for line in lines:
        fields = [field.strip() for field in line.split(',')]
        if len(fields) < 3:
            continue
        name, age_text, email = fields[:3]
        try:
            age = int(age_text)
        except ValueError:
            # A single malformed row must not kill a lazy pipeline midway.
            continue
        yield {
            'name': name,
            'age': age,
            'email': email,
        }

def filter_by_age(records: Iterator[dict], min_age: int) -> Iterator[dict]:
    """Yield only the records whose ``'age'`` value is at least ``min_age``."""
    yield from (person for person in records if person['age'] >= min_age)


# 构建管道：每一步都是惰性的
# 数据只在最终消费时才真正流过管道
def process_users(filename: str) -> Iterator[dict]:
    """Assemble the lazy pipeline for a user file and return its final stage.

    Stages, in order: read lines, drop blank lines, drop comment lines,
    parse fields, keep records with age >= 18. Nothing is read from the
    file until the returned iterator is consumed.
    """
    cleaned_lines = strip_comments(filter_empty(read_lines(filename)))
    return filter_by_age(parse_fields(cleaned_lines), 18)

# Even for a million-line file, only one record is in flight at a time,
# so memory use stays small regardless of the file size.
for user in process_users('users.csv'):
    print(user)

🔧 itertools：生成器工具库

itertools 是 Python 标准库中的生成器工具集，全部是惰性的：

import itertools

# 1. count: infinite counter
for i in itertools.count(10, 2):  # start at 10, step 2
    if i > 20:
        break
    print(i, end=" ")  # 10 12 14 16 18 20

# 2. cycle: repeat a sequence forever
colors = itertools.cycle(['红', '绿', '蓝'])
for _ in range(7):
    print(next(colors), end=" ")  # 红 绿 蓝 红 绿 蓝 红

# 3. chain: concatenate several iterables
result = list(itertools.chain([1, 2], [3, 4], [5, 6]))
print(result)  # [1, 2, 3, 4, 5, 6]

# 4. islice: lazy slicing
gen = (x**2 for x in range(1_000_000))
first_10 = list(itertools.islice(gen, 10))
print(first_10)  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

# 5. takewhile / dropwhile: conditional filtering
data = [1, 2, 3, 10, 4, 5]
result = list(itertools.takewhile(lambda x: x < 5, data))
print(result)  # [1, 2, 3]  (stops at the first failing item, so the later 4 is not included)

# 6. groupby: grouping
data = [
    {'name': '张三', 'dept': '技术'},
    {'name': '李四', 'dept': '产品'},
    {'name': '王五', 'dept': '技术'},
    {'name': '赵六', 'dept': '产品'},
]
# Note: groupby only merges consecutive items with equal keys, so sort by the same key first
data.sort(key=lambda x: x['dept'])
for dept, members in itertools.groupby(data, key=lambda x: x['dept']):
    print(f"{dept}: {[m['name'] for m in members]}")
# 产品: ['李四', '赵六']
# 技术: ['张三', '王五']

# 7. combinations / permutations
print(list(itertools.combinations('ABC', 2)))
# [('A', 'B'), ('A', 'C'), ('B', 'C')]

print(list(itertools.permutations('AB', 2)))
# [('A', 'B'), ('B', 'A')]

💡 生成器协程：send 和 throw

生成器不仅能产出数据，还能通过 send() 接收数据，实现双向通信：

from typing import Generator, Optional


def running_average() -> Generator[Optional[float], Optional[float], None]:
    """Coroutine that keeps a running average of the numbers sent to it.

    Prime it with ``next()`` first; that initial step yields ``None`` because
    no value has been received yet. Each ``send(value)`` then returns the
    average of everything sent so far. Sending ``None`` ends the generator,
    which the caller observes as ``StopIteration``.

    Yields:
        The current average, or ``None`` before the first value arrives.
    """
    total = 0.0
    count = 0
    average = None

    while True:
        value = yield average  # hand back the current average, then wait for the next input
        if value is None:
            break
        total += value
        count += 1
        average = total / count


# Using the coroutine
coro = running_average()
next(coro)  # prime the coroutine (run it to the first yield)

print(coro.send(10))   # 10.0
print(coro.send(20))   # 15.0
print(coro.send(30))   # 20.0
print(coro.send(40))   # 25.0

# Shut the coroutine down
coro.close()

⚠️ 常见陷阱

1. 生成器只能遍历一次

gen = (x**2 for x in range(5))
print(list(gen))  # [0, 1, 4, 9, 16]
print(list(gen))  # []  <- already exhausted, so the second pass sees nothing

# For repeated traversal, build the list from a FRESH iterable (or call the
# generator function again). Calling list(gen) here would only return []
# once more, because gen has already been consumed above.
data = [x**2 for x in range(5)]

2. 忘记启动协程（执行到第一个 yield）

def my_coroutine():
    """Print every value sent in; runs until the generator is closed."""
    value = yield  # first suspension point: reached by priming with next()
    while True:
        print(f"Got: {value}")
        value = yield

coro = my_coroutine()
# Wrong: sending a value before the generator has started raises
# TypeError: can't send non-None value to a just-started generator
# coro.send(42)

# Right: prime it with next() first
next(coro)    # or coro.send(None)
coro.send(42)  # Got: 42

3. return 值在 StopIteration 中

def gen_with_return():
    """Yield 1 and 2, then finish with a return value."""
    for number in (1, 2):
        yield number
    return "final value"  # delivered to the caller as StopIteration.value

gen = gen_with_return()
next(gen)  # 1
next(gen)  # 2
try:
    next(gen)
except StopIteration as e:
    print(e.value)  # "final value" <- the generator's return value is carried on StopIteration.value

# yield from 会自动捕获这个值
def delegating():
    """Delegate to gen_with_return() and print the value it returns."""
    result = yield from gen_with_return()  # the yield from expression evaluates to the sub-generator's return value
    print(f"子生成器返回：{result}")  # "final value"