第 19 讲：Python 性能优化 | profiling、NumPy 向量化、缓存策略与 C 扩展加速

大家好，我是正在实战各种 AI 项目的程序员晚枫。

性能优化的黄金法则：先测量，再优化。没有测量的优化，是在猜谜。

《流畅的Python（第2版）》专门讲了一章性能，本讲带你从「测量」到「优化」，建立完整的 Python 性能优化工作流。

🔍 第一步：测量——找到真正的瓶颈

timeit：精准测量小段代码

import timeit

# 方法1：命令行
# python -m timeit "'-'.join(str(n) for n in range(100))"
# python -m timeit "'-'.join([str(n) for n in range(100)])"

# Method 2: measure from inside a script.
# Executed once before each measurement; builds the list every snippet reads.
setup = "data = list(range(1000))"

# Three ways of squaring every element, keyed by the label shown in the report.
snippets = {
    "for 循环": "result = []\nfor x in data:\n    result.append(x**2)",
    "列表推导式": "result = [x**2 for x in data]",
    "map 函数": "result = list(map(lambda x: x**2, data))",
}

for name, code in snippets.items():
    # One Timer per snippet; each statement is executed 10000 times.
    timer = timeit.Timer(stmt=code, setup=setup)
    t = timer.timeit(number=10000)
    print(f"{name}: {t:.4f}s")

# 典型结果：
# for 循环: 1.2345s
# 列表推导式: 0.8901s  ← 最快
# map 函数: 1.1234s

cProfile：函数级性能分析

import cProfile
import pstats
import io

def read_file(filename: str) -> list[str]:
    """Return every line of ``filename`` as a list, line endings included."""
    with open(filename) as handle:
        # Iterating a text file yields the same lines that readlines() returns.
        rows = list(handle)
    return rows

def count_words(lines: list[str]) -> dict[str, int]:
    """Count how often each whitespace-separated token occurs in ``lines``.

    Returns a dict mapping token -> occurrence count, in first-seen order.
    """
    tally: dict[str, int] = {}
    # Flatten all lines into a single stream of tokens.
    tokens = (token for row in lines for token in row.split())
    for token in tokens:
        tally[token] = tally.get(token, 0) + 1
    return tally

def process(filename: str) -> dict[str, int]:
    """Read ``filename`` and return the word-frequency table of its contents."""
    return count_words(read_file(filename))

# 方法1：命令行
# python -m cProfile -s cumulative your_script.py

# Method 2: profile from inside a script.
profiler = cProfile.Profile()
profiler.enable()
# ... the code being measured goes between enable() and disable() ...
result = process("large_file.txt")
profiler.disable()

# Render the report into an in-memory buffer, then print the buffer.
stream = io.StringIO()
stats = pstats.Stats(profiler, stream=stream)
# Sort by cumulative time and show only the 20 most expensive functions.
stats.sort_stats('cumulative').print_stats(20)
print(stream.getvalue())

# 输出示例：
#          5432 function calls in 0.789 seconds
#
#    Ordered by: cumulative time
#
#    ncalls  tottime  percall  cumtime  percall filename:lineno(function)
#         1    0.001    0.001    0.789    0.789 script.py:1(<module>)
#         1    0.012    0.012    0.788    0.788 script.py:20(process)
#         1    0.001    0.001    0.776    0.776 script.py:13(count_words)
#      5430    0.775    0.000    0.775    0.000 {method 'get' of 'dict' objects}

line_profiler：行级性能分析

pip install line_profiler

# 在需要分析的函数上加 @profile 装饰器
@profile  # not defined in this snippet; presumably supplied by kernprof (see the run command below)
def count_words_v1(lines: list[str]) -> dict[str, int]:  # returns word -> occurrence count
    counts: dict[str, int] = {}
    for line in lines:
        for word in line.split():  # no-argument split() breaks on runs of whitespace
            counts[word] = counts.get(word, 0) + 1  # a word not seen before starts from 0
    return counts

# 运行：kernprof -l -v script.py
# 输出：
# Line #      Hits         Time  Per Hit   % Time  Line Contents
# ==============================================================
#      1         1          2.0      2.0      0.0  @profile
#      2                                          def count_words_v1(lines):
#      3         1          3.0      3.0      0.0      counts = {}
#      4    100000     45678.0      0.5      8.1      for line in lines:
#      5    800000    123456.0      0.2     21.9          for word in line.split():
#      6    800000    394567.0      0.5     70.0              counts[word] = counts.get(word, 0) + 1
#      7         1          2.0      2.0      0.0      return counts

memory_profiler：内存使用分析

pip install memory_profiler

from memory_profiler import profile

@profile
def load_data():
    """Approach 1: load the whole file into memory at once.

    Returns:
        A list containing every line of ``large_file.txt``.
    """
    # `with` closes the handle deterministically; iterating over a bare
    # open(...) call would leave closing to the garbage collector.
    with open('large_file.txt') as f:
        data = f.readlines()
    return data

@profile
def load_data_generator():
    """Approach 2: return a generator that yields lines on demand.

    Returns:
        A generator over the lines of ``large_file.txt``. The file is closed
        once the generator is exhausted or closed.
    """
    # Open eagerly so a missing file still fails at call time, not on first use.
    f = open('large_file.txt')

    def _lines():
        # Leaving the `with` block (exhaustion or generator.close()) closes f.
        with f:
            yield from f

    return _lines()

# python -m memory_profiler script.py
# 输出显示每行的内存增量

⚡ 第二步：优化——针对瓶颈用对工具

1. 数据结构选择

import time
from collections import deque, defaultdict, Counter

# list vs deque: repeated insertion at the front.
# Timing uses time.perf_counter(): it is monotonic and high-resolution,
# whereas time.time() follows the wall clock and can be coarse or jump.
n = 100_000

# list.insert(0, x) has to shift every existing element: O(n) per insert.
start = time.perf_counter()
lst = []
for i in range(n):
    lst.insert(0, i)
print(f"list 头部插入：{time.perf_counter()-start:.3f}s")  # ~2.5s

# deque.appendleft(x) is O(1).
start = time.perf_counter()
dq = deque()
for i in range(n):
    dq.appendleft(i)
print(f"deque 头部插入：{time.perf_counter()-start:.3f}s")  # ~0.01s


# dict.get vs defaultdict vs Counter: three ways to build a frequency table.
# Timing uses time.perf_counter(), the monotonic high-resolution clock
# intended for measuring short durations.
words = ["apple", "banana", "apple", "cherry", "banana", "apple"] * 10000

# Plain dict with an explicit default on every lookup.
start = time.perf_counter()
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1
print(f"dict.get：{time.perf_counter()-start:.4f}s")

# defaultdict(int): a missing key is created as 0 automatically.
start = time.perf_counter()
counts = defaultdict(int)
for word in words:
    counts[word] += 1
print(f"defaultdict：{time.perf_counter()-start:.4f}s")  # slightly faster

# Counter: the shortest form, and the counting loop runs inside the library.
start = time.perf_counter()
counts = Counter(words)
print(f"Counter：{time.perf_counter()-start:.4f}s")  # fastest

2. 字符串操作优化

import time

n = 10000
parts = [str(i) for i in range(n)]

# ❌ Repeated +=: each step may copy the whole string built so far, which is
# O(n²) in the worst case. CPython can sometimes extend the string in place,
# but that is an implementation detail and should not be relied on.
# time.perf_counter() is used for timing: monotonic and high-resolution.
start = time.perf_counter()
result = ""
for part in parts:
    result += part
print(f"字符串拼接：{time.perf_counter()-start:.4f}s")

# ✅ join: a single O(n) pass.
start = time.perf_counter()
result = "".join(parts)
print(f"join：{time.perf_counter()-start:.4f}s")  # 10-100x faster

# ✅ f-strings are faster than % formatting and str.format.
name, age = "张三", 25
# %-formatting
result1 = "名字: %s, 年龄: %d" % (name, age)
# str.format
result2 = "名字: {}, 年龄: {}".format(name, age)
# f-string (fastest)
result3 = f"名字: {name}, 年龄: {age}"

3. __slots__ 节省内存

import sys

class PersonWithDict:
    """Regular class: instance attributes are stored in a per-instance ``__dict__``."""

    def __init__(self, name, age, email):
        # Targets are assigned left to right, so attribute order is unchanged.
        self.name, self.age, self.email = name, age, email

class PersonWithSlots:
    """Slotted class: the attribute set is fixed, so no per-instance ``__dict__`` is created."""

    __slots__ = ['name', 'age', 'email']

    def __init__(self, name, age, email):
        # Targets are assigned left to right, matching the slot order above.
        self.name, self.age, self.email = name, age, email


p1 = PersonWithDict("张三", 25, "zhang@example.com")
p2 = PersonWithSlots("张三", 25, "zhang@example.com")

# sys.getsizeof is shallow. A regular instance costs the object itself PLUS
# its separate __dict__, so both are added before comparing against the
# slotted instance, which has no __dict__.
dict_instance_size = sys.getsizeof(p1) + sys.getsizeof(p1.__dict__)
print(f"有 __dict__：{dict_instance_size} bytes")     # exact value varies by Python version
print(f"有 __slots__：{sys.getsizeof(p2)} bytes")      # exact value varies by Python version

# The gap matters most when a large number of instances is created.
import tracemalloc

tracemalloc.start()
objs_dict = [PersonWithDict("name", i, "email") for i in range(100000)]
current, peak = tracemalloc.get_traced_memory()
print(f"PersonWithDict 峰值内存：{peak / 1024 / 1024:.1f} MB")

# objs_dict is still alive, so traced memory does not restart from zero.
# Remember the current level and report only what the second batch adds;
# otherwise the slotted figure would include the first batch as well.
baseline = current
tracemalloc.reset_peak()
objs_slots = [PersonWithSlots("name", i, "email") for i in range(100000)]
current, peak = tracemalloc.get_traced_memory()
print(f"PersonWithSlots 峰值内存：{(peak - baseline) / 1024 / 1024:.1f} MB")  # expected to be clearly lower than the figure above

# Stop tracing so the bookkeeping overhead does not slow down later code.
tracemalloc.stop()

4. 生成器 vs 列表：内存敏感场景

import sys

# List: every element is computed up front and kept in memory.
lst = [x ** 2 for x in range(1_000_000)]
# NOTE: sys.getsizeof is shallow; it measures the list object itself,
# not the integer objects the list refers to.
list_bytes = sys.getsizeof(lst)
print(f"列表内存：{list_bytes / 1024 / 1024:.1f} MB")  # ~8 MB

# Generator: values are computed on demand, so the object stays tiny.
gen = (x ** 2 for x in range(1_000_000))
gen_bytes = sys.getsizeof(gen)
print(f"生成器内存：{gen_bytes} bytes")  # ~112 bytes!

# For large files a generator is the natural choice.
def process_large_file(filename: str):
    """Yield each line of ``filename`` with surrounding whitespace stripped.

    Lines are read lazily, one at a time, so the whole file is never held
    in memory.
    """
    with open(filename) as f:
        # The file object is itself a line iterator; strip each line as it is read.
        yield from map(str.strip, f)

# Aggregates such as sum and max can consume a generator directly.
total = sum(x * x for x in range(1_000_000))  # no intermediate list is built

5. 局部变量缓存：减少属性查找

import time

data = list(range(100_000))

# Timing uses time.perf_counter(): on a coarse wall clock, time.time() can
# report 0 for a loop this short, and the t1/t2 division below would then
# raise ZeroDivisionError.

# Slow: `result.append` is looked up as an attribute on every iteration.
start = time.perf_counter()
result = []
for item in data:
    result.append(item * 2)
t1 = time.perf_counter() - start

# Fast: bind the method once and call the local name inside the loop.
start = time.perf_counter()
result = []
append = result.append  # cached bound method
for item in data:
    append(item * 2)
t2 = time.perf_counter() - start

print(f"原始：{t1:.4f}s，优化后：{t2:.4f}s，提速：{t1/t2:.1f}x")

🔥 第三步：极限优化——突破 Python 速度

NumPy 向量化：数值计算加速 10-100x

import numpy as np
import time

n = 1_000_000

# Timing uses time.perf_counter(), the monotonic high-resolution clock.

# Pure Python: slow, one interpreted multiplication per element.
start = time.perf_counter()
data = list(range(n))
result = [x ** 2 for x in data]
print(f"Python 列表推导式：{time.perf_counter()-start:.3f}s")

# NumPy: fast.
start = time.perf_counter()
# Ask for int64 explicitly. Where NumPy's default integer is 32-bit
# (e.g. Windows with NumPy < 2.0), 999_999 ** 2 does not fit and the
# result would silently wrap around.
arr = np.arange(n, dtype=np.int64)
result = arr ** 2  # vectorized: the loop runs in compiled code
print(f"NumPy 向量化：{time.perf_counter()-start:.3f}s")  # 10-50x faster

# NumPy can also take advantage of SIMD instructions.
a = np.random.rand(n)
b = np.random.rand(n)

start = time.perf_counter()
# ❌ Python-level loop over array elements
c = [a[i] * b[i] for i in range(n)]
print(f"Python 逐元素乘法：{time.perf_counter()-start:.3f}s")

start = time.perf_counter()
# ✅ NumPy vectorized multiplication
c = a * b
print(f"NumPy 向量化乘法：{time.perf_counter()-start:.4f}s")  # 100x+ faster

functools.lru_cache：缓存计算结果

import functools
import time

# No cache: naive recursive Fibonacci, exponential running time.
def fib_slow(n: int) -> int:
    """Return the n-th Fibonacci number; any ``n`` below 2 is returned unchanged."""
    return n if n < 2 else fib_slow(n - 1) + fib_slow(n - 2)

# With a cache: each n is computed only once, so the running time is O(n).
@functools.lru_cache(maxsize=None)  # unbounded cache
def fib_fast(n: int) -> int:
    """Return the n-th Fibonacci number, memoising every intermediate result."""
    return n if n < 2 else fib_fast(n - 1) + fib_fast(n - 2)

# time.perf_counter() is used because the cached call finishes in
# microseconds; a coarse wall clock such as time.time() can report 0 for it.
start = time.perf_counter()
result = fib_slow(35)
print(f"无缓存 fib(35) = {result}，耗时：{time.perf_counter()-start:.3f}s")  # ~2s

start = time.perf_counter()
result = fib_fast(35)
print(f"有缓存 fib(35) = {result}，耗时：{time.perf_counter()-start:.6f}s")  # <0.001s

print(fib_fast.cache_info())  # CacheInfo(hits=33, misses=36, maxsize=None, currsize=36)

懒加载与缓存属性

import functools

class DataProcessor:
    """Lazily loads a text file and caches values derived from it."""

    def __init__(self, filename: str):
        self.filename = filename

    @functools.cached_property  # Python 3.8+: computed on first access, then stored
    def data(self) -> list:
        """Lines of the file; read from disk only on first access."""
        print(f"加载数据从 {self.filename}...")
        with open(self.filename) as handle:
            # Iterating the file yields the same lines as readlines().
            return list(handle)

    @functools.cached_property
    def word_count(self) -> int:
        """Total number of whitespace-separated words; depends on ``data`` and is computed once."""
        total = 0
        for row in self.data:
            total += len(row.split())
        return total

processor = DataProcessor("large.txt")
# Nothing has been read so far; the first attribute access loads and counts.
first_count = processor.word_count
print(first_count)
# The second access returns the cached value without recomputing.
print(processor.word_count)

📈 性能优化工作流总结

1. 明确优化目标（速度？内存？）
         ↓
2. 编写基准测试（timeit/time）
         ↓
3. 运行 cProfile 找热点函数
         ↓
4. 用 line_profiler 精确定位热点行
         ↓
5. 选择优化策略：
   • 算法/数据结构优化（最高收益）
   • NumPy 向量化（数值计算）
   • lru_cache 缓存（重复计算）
   • __slots__（内存优化）
   • C 扩展 / Cython（极限性能）
         ↓
6. 优化后再次基准测试，验证效果
         ↓
7. 检查可读性，权衡复杂度

⚠️ 常见误区

1. 过早优化

# ❌ Don't write the "optimized" version first and only then check whether it was needed.
def clever_but_unreadable(data):
    """Pair even-index items (keys) with the following odd-index items (values)."""
    # Compact but harder to read: two strided slices zipped together.
    return dict(zip(data[::2], data[1::2]))

# ✅ Write the clear version first; optimize only once profiling shows it is a bottleneck.
def clear_and_readable(data):
    """Build a dict from a flat ``[key, value, key, value, ...]`` sequence.

    Args:
        data: Indexable sequence of alternating keys and values.

    Returns:
        A dict mapping each even-index item to the item that follows it.
        A trailing key with no value is ignored, which matches the
        zip-based one-liner this function is compared against.
    """
    result = {}
    # Stop before the last index so data[i + 1] always exists, even when
    # the sequence has an odd number of items.
    for i in range(0, len(data) - 1, 2):
        result[data[i]] = data[i + 1]
    return result

2. 局部优化忽视全局

# 花了大量时间把某个函数从 0.1s 优化到 0.01s
# 但那个函数在整个程序中只占 1% 的耗时
# 整体性能提升不到 0.1%
# → 先用 cProfile 确认瓶颈在哪！

3. 忽视 I/O——它通常才是真正的瓶颈

# 99% 的"慢"程序，瓶颈在 IO，不在 CPU
# 优先考虑：异步 IO、批量读写、缓存
# 而不是：精细的算法优化

🎯 本讲总结

测量工具：

timeit：精确测量小段代码
cProfile：函数级热点分析（先用这个）
line_profiler：行级精确分析（确认热点后）
memory_profiler：内存使用分析

常用优化技巧（按收益排序）：

选对数据结构：deque / Counter / defaultdict
NumPy 向量化：数值计算快 10-100x
lru_cache / cached_property：消除重复计算
生成器：大数据集的内存优化
__slots__：大量实例时节省 30-50% 内存
字符串 join：代替 += 拼接
局部变量缓存：减少属性查找

工作流：先测量 → 找瓶颈 → 针对性优化 → 再测量验证

学习路线： 零基础 → 《从入门到实践》 → 《流畅的 Python》 → 本门课程 → 《CPython 设计与实现》

🎓 加入《流畅的 Python》直播共读营

学到这里，如果你想系统吃透这本书——欢迎加入我的直播共读课。

每周直播精讲，逐章拆解核心知识点
专属学习群，随时答疑交流
试运营特惠：~~499 元~~ → 299 元

👉 【立即报名《流畅的 Python》共读课】：https://mp.weixin.qq.com/s/ivHJwn1nNx5ug4TFrapvGg

🔗 课程导航

← 上一讲：异步编程 | 下一讲：最佳实践 →

💬 联系我

平台	账号/链接
微信	扫码加好友
B 站	Python 自动化办公社区

主营业务：AI 编程培训、企业内训、技术咨询

🎓 AI 编程实战课程

想系统学习 AI 编程？程序员晚枫的 AI 编程实战课 帮你从零上手！

👉 免费试看：B站免费试看前3讲，先看看适不适合自己
👉 课程报名：点击这里报名，现在报名还送书📖