

Hi everyone, I'm 程序员晚枫 (Programmer Wanfeng), a developer building hands-on AI projects.
Today we're learning Pandas performance optimization techniques.
Once your data reaches millions or even tens of millions of rows, ordinary Pandas operations slow to a crawl. Master these optimization techniques and you can process large datasets with less memory and more speed.
Tip 1: Use Appropriate Data Types
Numeric type optimization
```python
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'int_col': np.random.randint(0, 100, 1000000),
    'float_col': np.random.randn(1000000),
    'bool_col': np.random.choice([True, False], 1000000)
})

print("Original memory usage:")
print(df.memory_usage(deep=True).sum() / 1024**2, "MB")

# Downcast to the smallest types that still hold the data
df['int_col'] = df['int_col'].astype('int8')
df['float_col'] = df['float_col'].astype('float32')

print("\nOptimized memory usage:")
print(df.memory_usage(deep=True).sum() / 1024**2, "MB")
```
Category dtype
```python
# Low-cardinality string columns shrink dramatically as category
df['category'] = np.random.choice(['A', 'B', 'C', 'D'], 1000000)
df['category'] = df['category'].astype('category')
```
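To verify the savings, here's a minimal before/after check, assuming the df built above:

```python
# Quick before/after comparison of the category savings (assumes the df above)
as_object = df['category'].astype('object').memory_usage(deep=True)
as_category = df['category'].memory_usage(deep=True)
print(f"object: {as_object / 1024**2:.2f} MB, category: {as_category / 1024**2:.2f} MB")
```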
Tip 2: Avoid Loops, Use Vectorization
❌ Slow: Python loop
```python
# Row-by-row apply: every call pays Python-level overhead
def calc_distance(row):
    return (row['x']**2 + row['y']**2)**0.5

df['distance'] = df.apply(calc_distance, axis=1)
```
✅ Fast: vectorized computation
```python
# Whole-column arithmetic runs in optimized C code
df['distance'] = (df['x']**2 + df['y']**2)**0.5
```
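The same distance can also be computed with a NumPy ufunc in a single call; a small sketch, assuming the x and y columns from the snippet above:

```python
import numpy as np

# Equivalent one-call version using a NumPy ufunc
df['distance'] = np.hypot(df['x'], df['y'])
```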
Other vectorized alternatives
```python
# Slow: element-by-element loop with .loc lookups
for i in range(len(df)):
    if df.loc[i, 'score'] > 80:
        df.loc[i, 'grade'] = 'A'

# Fast: boolean indexing, or np.where for an if/else in one shot
df.loc[df['score'] > 80, 'grade'] = 'A'
df['grade'] = np.where(df['score'] > 80, 'A', 'B')
```
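For more than two buckets, np.select generalizes np.where; a sketch where the cutoff values (80 and 60) are illustrative:

```python
import numpy as np

# Multi-way version of np.where; conditions are checked in order
conditions = [df['score'] > 80, df['score'] > 60]
choices = ['A', 'B']
df['grade'] = np.select(conditions, choices, default='C')
```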
Tip 3: Use eval and query
```python
# pandas uses the numexpr engine for eval/query when it is installed
import numexpr

# Plain arithmetic allocates an intermediate array for every operation
df['result'] = df['a'] + df['b'] * df['c'] - df['d'] / df['e']

# pd.eval evaluates the whole expression in a single pass
df['result'] = pd.eval('df.a + df.b * df.c - df.d / df.e')

# Boolean-mask filtering
df[(df['a'] > 0) & (df['b'] < 100)]

# query: same result, often faster and easier to read
df.query('a > 0 and b < 100')
```
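query can also reference Python variables with the @ prefix, which keeps thresholds out of the string; a small sketch:

```python
# Reference local variables inside a query string with @
a_min, b_max = 0, 100
df.query('a > @a_min and b < @b_max')
```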
Tip 4: Read Large Files in Chunks
```python
chunk_size = 100000
results = []

# Stream the file in pieces instead of loading it all at once
for chunk in pd.read_csv('huge_file.csv', chunksize=chunk_size):
    processed = chunk.groupby('category')['value'].sum()
    results.append(processed)

# Merge the per-chunk aggregates into the final answer
final_result = pd.concat(results).groupby(level=0).sum()
```
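One caveat: only additive statistics (sum, count) combine this cleanly. A mean has to be rebuilt from its parts; a sketch using the same hypothetical huge_file.csv:

```python
# Accumulate sum and count per chunk, then divide at the end
sums, counts = [], []
for chunk in pd.read_csv('huge_file.csv', chunksize=chunk_size):
    grouped = chunk.groupby('category')['value']
    sums.append(grouped.sum())
    counts.append(grouped.count())

total_sum = pd.concat(sums).groupby(level=0).sum()
total_count = pd.concat(counts).groupby(level=0).sum()
mean_per_category = total_sum / total_count
```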
Tip 5: Use Efficient Iterators
```python
# Slowest: iterrows builds a full Series object for every row
for index, row in df.iterrows():
    print(row['column'])

# Faster: itertuples yields lightweight namedtuples
for row in df.itertuples():
    print(row.column)

# Fastest for a single column: iterate the column directly
for value in df['column']:
    print(value)
```
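If you genuinely must iterate and need several columns, zipping the underlying NumPy arrays avoids building a row object entirely; a sketch with placeholder column names:

```python
# Fast multi-column iteration over raw arrays (col_a/col_b are placeholders)
for a, b in zip(df['col_a'].to_numpy(), df['col_b'].to_numpy()):
    pass  # per-row work goes here
```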
Tip 6: Release Memory Promptly
```python
# Delete DataFrames you no longer need
del large_df

# Ask the garbage collector to reclaim the memory now
import gc
gc.collect()

# Keep only the columns you actually use
df = df[['col1', 'col2', 'col3']]
```
Tip 7: Use More Efficient File Formats
```python
# Parquet is columnar, compressed, and preserves dtypes
df.to_parquet('data.parquet', compression='snappy')
df = pd.read_parquet('data.parquet')
```
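Because Parquet is columnar, you can also load just a subset of columns; a sketch, assuming pyarrow or fastparquet is installed and col1/col2 exist in the file:

```python
# Read only the columns you need from a Parquet file
subset = pd.read_parquet('data.parquet', columns=['col1', 'col2'])
```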
Hands-On: A Complete Optimization Workflow
```python
import pandas as pd
import numpy as np

def optimize_dataframe(df):
    """Automatically downcast a DataFrame's dtypes."""
    # Integers: pick the smallest type that covers the value range
    for col in df.select_dtypes(include=['int']).columns:
        c_min = df[col].min()
        c_max = df[col].max()
        if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
            df[col] = df[col].astype(np.int8)
        elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
            df[col] = df[col].astype(np.int16)
        elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
            df[col] = df[col].astype(np.int32)

    # Floats: float32 halves the footprint (at reduced precision)
    for col in df.select_dtypes(include=['float']).columns:
        df[col] = df[col].astype(np.float32)

    # Strings: convert low-cardinality object columns to category
    for col in df.select_dtypes(include=['object']).columns:
        num_unique = df[col].nunique()
        num_total = len(df[col])
        if num_unique / num_total < 0.5:
            df[col] = df[col].astype('category')

    return df

print("Memory before:", df.memory_usage(deep=True).sum() / 1024**2, "MB")
df = optimize_dataframe(df)
print("Memory after:", df.memory_usage(deep=True).sum() / 1024**2, "MB")
```
Performance Comparison: Vectorization vs. Loops
```python
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'a': np.random.randn(1000000),
    'b': np.random.randn(1000000)
})

# IPython/Jupyter magics: time each approach
%timeit [row['a'] + row['b'] for _, row in df.iterrows()]
%timeit df.apply(lambda row: row['a'] + row['b'], axis=1)
%timeit df['a'] + df['b']
```
| Method | Time for 1M rows | When to use |
|---|---|---|
| iterrows | 60 s | avoid whenever possible |
| itertuples | 5 s | only when row-by-row is unavoidable |
| apply | 5 s | simple per-row logic |
| vectorized | 0.001 s | first choice |
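A note on the timings above: %timeit only works inside IPython/Jupyter. In a plain script you can get rough numbers with time.perf_counter; a minimal sketch reusing the df above:

```python
import time

# Rough timing outside of IPython
start = time.perf_counter()
_ = df['a'] + df['b']
print(f"vectorized: {time.perf_counter() - start:.4f} s")

start = time.perf_counter()
_ = df.apply(lambda row: row['a'] + row['b'], axis=1)
print(f"apply:      {time.perf_counter() - start:.4f} s")
```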
Advanced Usage
Memory optimization techniques
```python
# Inspect memory usage column by column
print(df.memory_usage(deep=True))
print(f"Total memory: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Downcast numeric columns
df['id'] = df['id'].astype('int32')
df['value'] = df['value'].astype('float32')

# Convert repeated strings to category
df['city'] = df['city'].astype('category')

# Drop columns you don't need
df = df.drop(columns=['unused_col1', 'unused_col2'])

# Read huge files in chunks
for chunk in pd.read_csv('huge.csv', chunksize=100000):
    process(chunk)
```
Speeding things up with eval and query
```python
# eval can assign a new column from an expression string
result = df.eval('total = a * b + c / d')

# query handles compound conditions, including `in`
result = df.query('age > 30 and salary > 15000 and city in ["北京", "上海"]')
```
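Column names that aren't valid Python identifiers (spaces, punctuation) can be wrapped in backticks inside query; a sketch with a hypothetical column name:

```python
# Backticks let query reference non-identifier column names
result = df.query('`annual salary` > 15000')
```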
Pitfalls to Avoid
❌ Pitfall 1: Modifying data through iterrows
```python
# Wrong: row is a copy, so this never changes the DataFrame
for idx, row in df.iterrows():
    row['value'] = 100

# Right: write back through df.at
for idx in df.index:
    df.at[idx, 'value'] = 100
```
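And when the assignment doesn't actually depend on per-row state, skip the loop entirely:

```python
# Best: no loop at all, just a single vectorized assignment
df['value'] = 100

# Conditional version (the threshold 50 is illustrative)
df.loc[df['value'] > 50, 'value'] = 100
```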
❌ Pitfall 2: Reading a huge file in one go
```python
# Risky: may exhaust memory
df = pd.read_csv('huge.csv')

# Better: process in chunks
for chunk in pd.read_csv('huge.csv', chunksize=100000):
    result = process(chunk)

# Better: read only the columns you need
df = pd.read_csv('huge.csv', usecols=['id', 'name', 'amount'])

# Better: declare compact dtypes up front
dtypes = {'id': 'int32', 'amount': 'float32'}
df = pd.read_csv('huge.csv', dtype=dtypes)
```
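These mitigations compose; a sketch combining usecols, dtype, and chunksize in a single read_csv call (process() is the placeholder from above):

```python
# Combine column pruning, compact dtypes, and chunking in one call
reader = pd.read_csv(
    'huge.csv',
    usecols=['id', 'amount'],
    dtype={'id': 'int32', 'amount': 'float32'},
    chunksize=100000,
)
for chunk in reader:
    result = process(chunk)
```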
Case Study: Processing a Million Rows of E-Commerce Data
```python
import pandas as pd
import numpy as np

np.random.seed(42)
n = 1_000_000

df = pd.DataFrame({
    'order_id': range(n),
    'user_id': np.random.randint(1, 100000, n),
    'product_id': np.random.randint(1, 1000, n),
    'quantity': np.random.randint(1, 10, n),
    'price': np.random.uniform(10, 5000, n),
    'date': pd.date_range('2024-01-01', periods=n, freq='30s'),
    'city': np.random.choice(['北京', '上海', '广州', '深圳', '成都', '杭州', '重庆', '武汉'], n)
})

print(f"Original memory: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Downcast each column based on its value range
df['order_id'] = df['order_id'].astype('int32')
df['user_id'] = df['user_id'].astype('int32')
df['product_id'] = df['product_id'].astype('int16')
df['quantity'] = df['quantity'].astype('int8')
df['price'] = df['price'].astype('float32')
df['city'] = df['city'].astype('category')

print(f"Optimized memory: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Vectorized column arithmetic
df['amount'] = df['quantity'] * df['price']

# Named aggregation, grouped by city
result = df.groupby('city').agg(
    total_sales=('amount', 'sum'),
    avg_order=('amount', 'mean'),
    order_count=('order_id', 'count')
).round(2).sort_values('total_sales', ascending=False)

print("\n=== Sales Ranking by City ===")
print(result)
```
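As a follow-up, the same frame supports fast time-based aggregation through the .dt accessor; a small sketch:

```python
# Monthly sales trend from the same DataFrame
monthly = df.groupby(df['date'].dt.to_period('M'))['amount'].sum().round(2)
print(monthly.head())
```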
More Performance Optimization Techniques
```python
# 1. category dtype for repeated strings
df['city'] = df['city'].astype('category')

# 2. pd.eval for compound arithmetic expressions
result = pd.eval("df['a'] * df['b'] + df['c'] / df['d']")

# 3. Drop down to raw NumPy arrays for speed
df['col'].values
np.where(df['col'].values > 0, 1, 0)

# 4. swifter (third-party): parallelized apply
import swifter
df['new_col'] = df['old_col'].swifter.apply(complex_function)

# 5. Dask (third-party): out-of-core / parallel DataFrames
import dask.dataframe as dd
ddf = dd.read_csv('huge_*.csv')
result = ddf.groupby('city')['sales'].sum().compute()
```
End-to-End Memory Optimization
```python
def optimize_df(df):
    """One-shot DataFrame memory optimizer."""
    start_mem = df.memory_usage(deep=True).sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype
        if col_type == 'object':
            # Low-cardinality strings -> category
            num_unique = df[col].nunique()
            if num_unique / len(df) < 0.5:
                df[col] = df[col].astype('category')
        elif col_type == 'int64':
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif col_type == 'float64':
            df[col] = pd.to_numeric(df[col], downcast='float')

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f'Memory optimized: {start_mem:.1f}MB → {end_mem:.1f}MB '
          f'(down {(start_mem - end_mem) / start_mem * 100:.1f}%)')
    return df
```
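A quick usage check with a tiny synthetic frame (the columns here are made up for the demo):

```python
import pandas as pd
import numpy as np

# Build a small frame with int64, float64, and object columns
demo = pd.DataFrame({
    'id': np.arange(10000, dtype='int64'),
    'ratio': np.random.rand(10000),                    # float64
    'city': np.random.choice(['北京', '上海'], 10000),  # object
})
demo = optimize_df(demo)
print(demo.dtypes)  # expect int16, float32, category
```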
Next Up
In the next lesson we move on to data visualization, starting with Matplotlib basics.
👉 Continue reading: Matplotlib Basics - Drawing Your First Chart
💬 Join the Study Group
Scan the QR code to join the Python study group and make progress with thousands of fellow learners:
👉 Click to join the group
The group occasionally shares:
- Hands-on data analysis case studies
- Python learning materials
- Job-hunting and interview experience
- The latest industry news
Recommended: AI Python Data Analysis Bootcamp
🎁 Limited-time bonus: a free print copy of 《利用Python进行数据分析》 (Python for Data Analysis)
👉 Click for details
Course Navigation
Previous: Pandas String Processing Techniques
Next: Matplotlib Basics - Drawing Your First Chart
PS: Performance optimization is an essential skill for leveling up. Remember: prefer vectorization over loops, and prefer category over object.
📚 Recommended Textbook
Main textbook: 《Excel+Python 飞速搞定数据分析与处理》 (a Turing Press title)
💬 Contact Me
Main services: AI programming training, corporate training, technical consulting
🎓 Hands-On AI Programming Course
Want to learn AI programming systematically? 程序员晚枫's hands-on AI programming course takes you from zero to up and running!