Pandas数据清洗：我总结了类型转换的5个坑，帮你避开90%的错误

大家好，我是正在实战各种AI项目的程序员晚枫。

今天继续数据清洗——类型转换与异常值处理。

数据类型不对会导致计算错误，异常值会影响统计结果。掌握这些技巧，让你的数据质量更上一层楼。

常见数据类型问题

import pandas as pd

# Sample orders that deliberately contain the dirty values cleaned later on.
raw_orders = {
    '订单号': ['A001', 'A002', 'A003'],
    '金额': ['1500.50', '2300', 'invalid'],  # stored as text, not numbers
    '日期': ['2024-01-15', '2024/02/20', '无效日期'],
    '数量': [10, -5, 1000],  # includes a negative (anomalous) quantity
    '类别': ['A', 'B', 'A'],
}
df = pd.DataFrame(raw_orders)

# Show the dtype pandas inferred for every column.
print(df.dtypes)

数值类型转换

字符串转数值

# Basic conversion: errors='coerce' turns values that cannot be parsed into
# NaN instead of raising.
df['金额_num'] = pd.to_numeric(df['金额'], errors='coerce')

# Thousands separators: remove the commas, then coerce. A plain
# .astype(float) would raise ValueError on leftovers such as 'invalid'.
df['金额_clean'] = pd.to_numeric(
    df['金额'].str.replace(',', '', regex=False),
    errors='coerce',
)

# Currency symbols: '$' is a regex metacharacter, so regex=False is passed
# explicitly to make the replacement literal whatever the pandas default is.
# The column is optional in this walkthrough, hence the guard.
if '价格' in df.columns:
    price_text = (
        df['价格']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    df['价格'] = pd.to_numeric(price_text, errors='coerce')

# Convert several columns in one go, skipping any the frame does not have
# (indexing a missing column would raise KeyError).
numeric_cols = ['金额', '数量', '单价']
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

整数 vs 浮点数

import numpy as np

# Work on a numeric view of the column; unparseable text becomes NaN.
amount = pd.to_numeric(df['金额'], errors='coerce')

# float -> int by truncation (1500.50 -> 1500). A plain int column cannot
# hold NaN, so the nullable Int64 dtype is used for the result.
df['金额_int'] = np.trunc(amount).astype('Int64')

# Round to the nearest whole number first, then convert.
df['金额_round'] = amount.round().astype('Int64')

# Int64 (capital "I") is the nullable integer dtype: missing values stay <NA>.
# Fractional floats must be rounded (or truncated) before the cast; casting
# 1500.5 directly raises TypeError.
df['金额_Int64'] = amount.round().astype('Int64')

日期时间转换

基础转换

# Let pandas infer the format. NOTE(review): on pandas >= 2.0 the format is
# inferred from the first parseable value, so entries written in a different
# layout may become NaT under errors='coerce' — confirm against your version.
df['日期_dt'] = pd.to_datetime(df['日期'], errors='coerce')

# Explicit format (faster); values that do not match become NaT.
df['日期_dt'] = pd.to_datetime(df['日期'], format='%Y-%m-%d', errors='coerce')

# Assemble a date from separate component columns. to_datetime only
# recognises English component names (year / month / day), so the Chinese
# column names are renamed on the fly. Guarded because the component columns
# may not have been created yet.
date_parts = {'年': 'year', '月': 'month', '日': 'day'}
if set(date_parts).issubset(df.columns):
    df['完整时间'] = pd.to_datetime(
        df[list(date_parts)].rename(columns=date_parts)
    )

时间戳转换

# Both variants read the same raw epoch column.
epoch = df['timestamp']

# Unix timestamps expressed in seconds
df['日期'] = pd.to_datetime(epoch, unit='s')

# Unix timestamps expressed in milliseconds
df['日期'] = pd.to_datetime(epoch, unit='ms')

提取日期组件

# Pull the calendar parts out of the datetime column through the .dt accessor.
parts = df['日期'].dt
df['年'] = parts.year
df['月'] = parts.month
df['日'] = parts.day
weekday = parts.dayofweek  # 0 = Monday
df['星期'] = weekday
df['季度'] = parts.quarter
df['是否周末'] = weekday >= 5

类别型数据（Category）

为什么用Category？

# Memory saving. deep=True is required for a fair comparison: without it the
# object column only reports its pointer array, not the Python strings it
# refers to.
print(df['类别'].memory_usage(deep=True))  # plain object column
df['类别_cat'] = df['类别'].astype('category')
# Typically much smaller for long, low-cardinality columns.
print(df['类别_cat'].memory_usage(deep=True))

# Category columns also speed up grouping and filtering.

有序类别

from pandas.api.types import CategoricalDtype

# Grades listed from lowest to highest; ordered=True makes them comparable.
grade_levels = ['C', 'B', 'A', 'S']
grade_type = CategoricalDtype(categories=grade_levels, ordered=True)
df['等级'] = df['等级'].astype(grade_type)

# Ordered categories support comparisons: select the rows graded above 'B'
# (that is, A and S).
above_b = df['等级'] > 'B'
df[above_b]

异常值检测与处理

统计方法

# Z-score method
from scipy import stats
import numpy as np

# Coerce to numbers and score only the values that are present: a single NaN
# would otherwise turn every z-score into NaN and hide all outliers.
amount = pd.to_numeric(df['金额'], errors='coerce')
has_value = amount.notna().to_numpy()
z_scores = np.full(len(df), np.nan)
z_scores[has_value] = np.abs(stats.zscore(amount.to_numpy()[has_value]))
outliers = df[z_scores > 3]  # |z| above 3 is treated as an outlier

# IQR method (interquartile range); quantile() skips missing values.
Q1 = amount.quantile(0.25)
Q3 = amount.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows with a missing amount fail both comparisons, so they land in neither
# `outliers` nor `normal`.
outliers = df[(amount < lower_bound) | (amount > upper_bound)]
normal = df[(amount >= lower_bound) & (amount <= upper_bound)]

业务规则方法

# Rules that come from domain knowledge rather than statistics.
# Ages outside 0-120 are treated as missing.
age = df['年龄']
impossible_age = (age < 0) | (age > 120)
df.loc[impossible_age, '年龄'] = np.nan

# Discount rates must lie within [0, 1]; anything else becomes missing.
bad_discount = ~df['折扣率'].between(0, 1)
df.loc[bad_discount, '折扣率'] = np.nan

# Flag a row as anomalous when any single rule fires.
is_minor = df['年龄'] < 18
negative_spend = df['消费金额'] < 0
future_order = df['订单日期'] > pd.Timestamp.now()
df['是否异常'] = is_minor | negative_spend | future_order

异常值处理策略

# Strategy 1: drop the flagged rows
flagged = df['是否异常']
df_clean = df[~flagged]

# Strategy 2: clamp values to the allowed range
df['金额'] = df['金额'].clip(lower=0, upper=100000)

# Strategy 3: replace flagged values with the median of the rows that are
# NOT flagged, so the anomalies cannot skew their own replacement value.
df.loc[flagged, '金额'] = df.loc[~flagged, '金额'].median()

# Strategy 4: set the flagged rows aside for manual review
outliers_df = df[flagged].copy()
print("异常订单需要人工审核:")
print(outliers_df)

文本数据处理

清理空白字符

df['姓名'] = df['姓名'].str.strip()  # trim leading and trailing whitespace
df['姓名'] = df['姓名'].str.replace(r'\s+', ' ', regex=True)  # collapse whitespace runs into one space

统一大小写

df['邮箱'] = df['邮箱'].str.lower()  # e-mail addresses are normalised to lower case
df['城市'] = df['城市'].str.title()  # capitalise the first letter of each word

提取信息

# Carrier prefix: the first three characters of the phone number.
df['运营商'] = df['手机号'].str.slice(stop=3)

# E-mail domain: the part after the '@'.
df['邮箱域名'] = df['邮箱'].str.split('@').str.get(1)

# Year: the first four characters of the hire-date text.
df['入职年份'] = df['入职日期'].str.slice(0, 4)

完整清洗流程示例

def clean_dataframe(df):
    """Run a basic end-to-end cleaning pass and return a new DataFrame.

    Steps, in order: copy the input, strip whitespace from string values,
    drop duplicate and all-empty rows, convert the known numeric, date and
    category columns, then blank out negative amounts.

    Whitespace is stripped first so that values differing only by padding
    (for example ``'A '`` and ``'A'``) are deduplicated and end up in the same
    category.

    Args:
        df: Input DataFrame. It is not modified. The columns ``金额``, ``数量``,
            ``单价``, ``日期`` and ``类别`` are processed only when present.

    Returns:
        The cleaned copy.
    """
    # 1. Work on a copy so the caller's frame stays untouched.
    df = df.copy()

    # 2. Text cleanup. Only actual str values are stripped; other objects
    #    stored in object columns (numbers, None, NaN) pass through unchanged
    #    instead of raising or being turned into NaN by the .str accessor.
    text_cols = df.select_dtypes(include=['object']).columns
    for col in text_cols:
        df[col] = df[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )

    # 3. Drop rows that are exact duplicates.
    df = df.drop_duplicates()

    # 4. Missing values: drop rows in which every column is empty.
    df = df.dropna(how='all')

    # 5. Type conversion.
    # Numeric columns: unparseable values become NaN.
    numeric_cols = ['金额', '数量', '单价']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Date column: unparseable values become NaT.
    if '日期' in df.columns:
        df['日期'] = pd.to_datetime(df['日期'], errors='coerce')

    # Category column.
    if '类别' in df.columns:
        df['类别'] = df['类别'].astype('category')

    # 6. Anomalies: an amount can never be negative.
    if '金额' in df.columns:
        df.loc[df['金额'] < 0, '金额'] = np.nan

    return df

# Usage
df_clean = clean_dataframe(df)
print("清洗完成！")
# info() prints its report itself and returns None, so wrapping it in print()
# would add a stray "None" line.
df_clean.info()

性能对比：类型转换对内存的影响

import pandas as pd
import numpy as np

# Build a one-million-row sample frame.
n_rows = 1000000
df = pd.DataFrame({
    'id': np.random.randint(1, 1000000, n_rows),
    'category': np.random.choice(['A', 'B', 'C', 'D'], n_rows),
    'value': np.random.randn(n_rows),
})

print("转换前:")
print(df.memory_usage(deep=True))

# Target dtype for each column:
#   id       -> int32     (half the width of int64)
#   category -> category  (large saving for repeated strings)
#   value    -> float32   (half the width of float64)
dtype_plan = {'id': 'int32', 'category': 'category', 'value': 'float32'}
for column, target in dtype_plan.items():
    df[column] = df[column].astype(target)

print("\n转换后:")
print(df.memory_usage(deep=True))
# The article reports a 60-80% reduction in memory.

进阶用法

智能类型推断

# Let pandas infer the best available dtype for every column
df = df.convert_dtypes()  # new in pandas 1.0+

# Or optimise manually
def optimize_dtypes(df):
    """Shrink column dtypes in place and return the same DataFrame.

    Text (object / string) columns are tried as numeric, then as datetime,
    and finally converted to ``category`` when fewer than half of the values
    are distinct. ``int64`` and ``float64`` columns are downcast to the
    smallest dtype that holds their values.

    Numeric parsing is attempted before datetime parsing because
    ``pd.to_datetime`` may accept purely numeric text and turn it into dates.

    Args:
        df: DataFrame to optimise. It is modified in place.

    Returns:
        The same ``df`` object, for convenient chaining.
    """
    import warnings

    for col in df.columns:
        dtype = df[col].dtype
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            # Try numeric first.
            try:
                df[col] = pd.to_numeric(df[col])
                continue
            except (ValueError, TypeError):
                pass
            # Then datetime. The format-inference warning is silenced because
            # failing here is an expected outcome, not a problem.
            try:
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore')
                    df[col] = pd.to_datetime(df[col])
                continue
            except (ValueError, TypeError, OverflowError):
                pass
            # Finally category, when the column is repetitive enough.
            if len(df) > 0 and df[col].nunique() / len(df) < 0.5:
                df[col] = df[col].astype('category')
        elif dtype == 'int64':
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif dtype == 'float64':
            df[col] = pd.to_numeric(df[col], downcast='float')
    return df

异常值检测方法

# Method 1: IQR rule (the principle behind a box plot)
def detect_outliers_iqr(df, col):
    """Return the rows of ``df`` whose ``col`` value lies outside the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, where Q1 and Q3 are the
    25% and 75% quantiles of the column.
    """
    values = df[col]
    first_q = values.quantile(0.25)
    third_q = values.quantile(0.75)
    spread = third_q - first_q
    low_fence = first_q - 1.5 * spread
    high_fence = third_q + 1.5 * spread
    beyond_fences = (values < low_fence) | (values > high_fence)
    return df[beyond_fences]

# Method 2: z-score rule (three-sigma)
from scipy import stats
def detect_outliers_zscore(df, col, threshold=3):
    """Return the rows of ``df`` whose ``col`` z-score exceeds ``threshold``.

    Z-scores are computed from the non-missing values only. The resulting
    mask is positional and always has ``len(df)`` entries, so rows with a
    missing value are never flagged and never break the row selection.

    Args:
        df: Input DataFrame.
        col: Name of the numeric column to test.
        threshold: Absolute z-score above which a row counts as an outlier.

    Returns:
        The outlier rows (possibly empty), with their original index.
    """
    values = df[col].to_numpy(dtype='float64', na_value=np.nan)
    present = ~np.isnan(values)
    flagged = np.zeros(len(df), dtype=bool)
    if present.any():
        flagged[present] = np.abs(stats.zscore(values[present])) > threshold
    return df[flagged]

# Method 3: percentile rule
def detect_outliers_percentile(df, col, lower_pct=1, upper_pct=99):
    """Return the rows of ``df`` whose ``col`` value falls outside a percentile band.

    ``lower_pct`` and ``upper_pct`` are percentages (0-100); rows strictly
    below the lower percentile or strictly above the upper one are returned.
    """
    values = df[col]
    floor_value = values.quantile(lower_pct / 100)
    ceiling_value = values.quantile(upper_pct / 100)
    too_low = values < floor_value
    too_high = values > ceiling_value
    return df[too_low | too_high]

避坑指南

❌ 坑1：混合类型列

# A column that mixes numeric text with a non-numeric placeholder
df = pd.DataFrame({'value': ['100', '200', 'N/A', '300']})

# A direct cast fails on the placeholder. The error is caught here so the
# example keeps running and the fix below is actually reached.
try:
    df['value'].astype(float)
except ValueError as exc:
    print(f"astype(float) failed: {exc}")

# Use to_numeric with errors='coerce': 'N/A' becomes NaN
df['value'] = pd.to_numeric(df['value'], errors='coerce')

❌ 坑2：日期格式混乱

# One column holding several date layouts
dates = ['2025-01-15', '01/15/2025', 'Jan 15, 2025', '20250115']
df = pd.DataFrame({'date': dates})
raw_dates = df['date']

# format='mixed' (pandas >= 2.0) parses every value on its own, so different
# layouts can live in the same column.
df['date'] = pd.to_datetime(raw_dates, format='mixed')

# When only one layout is acceptable, give the format explicitly; values that
# do not match it become NaT under errors='coerce'. This is run against the
# raw text, not against the already parsed column.
strict_dates = pd.to_datetime(raw_dates, format='%Y-%m-%d', errors='coerce')

实战案例：清洗零售销售数据

import pandas as pd
import numpy as np

np.random.seed(42)
n = 10000

# Simulated retail data with several kinds of deliberately injected problems
df = pd.DataFrame({
    'order_id': [f'ORD{i:06d}' for i in range(n)],
    'amount': np.where(np.random.rand(n) > 0.05,
                       np.random.uniform(10, 5000, n).round(2),
                       np.random.choice([-999, 0, 99999], n)),  # injected anomalies
    'quantity': np.where(np.random.rand(n) > 0.03,
                         np.random.randint(1, 100, n),
                         np.random.choice([-5, 0, 999], n)),  # injected anomalies
    'customer_type': np.random.choice(['VIP', 'Normal', 'vip', 'normal', 'VIP ', '正常'], n),
    'date': np.random.choice(pd.date_range('2025-01-01', '2025-12-31'), n)
})

# 1. Normalise customer_type; any spelling missing from the map becomes '未知'
type_map = {'VIP': 'VIP', 'vip': 'VIP', 'VIP ': 'VIP',
            'Normal': '普通', 'normal': '普通', '正常': '普通'}
df['customer_type'] = df['customer_type'].map(type_map).fillna('未知')

# 2. Detect anomalies: business rule + IQR fences.
#    The IQR fences alone are too wide to catch the injected -999 / 0 / -5
#    values, so non-positive values are flagged by an explicit rule and are
#    excluded from the quartile estimate.
for col in ['amount', 'quantity']:
    invalid = df[col] <= 0
    Q1, Q3 = df.loc[~invalid, col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    outlier_mask = invalid | (df[col] < Q1 - 1.5*IQR) | (df[col] > Q3 + 1.5*IQR)
    print(f"{col}异常值: {outlier_mask.sum()}条")
    # Replace anomalies with the median of the normal rows. For an integer
    # column the median is rounded so a fractional value is never written
    # into it.
    replacement = df.loc[~outlier_mask, col].median()
    if pd.api.types.is_integer_dtype(df[col]):
        replacement = int(round(replacement))
    df.loc[outlier_mask, col] = replacement

# 3. Dtype optimisation
df['customer_type'] = df['customer_type'].astype('category')
df['date'] = pd.to_datetime(df['date'])

print("\n清洗后数据概览:")
print(df.describe())
print("\n客户类型分布:")
print(df['customer_type'].value_counts())

类型转换常见场景

# Scenario 1: the amount column holds text such as "¥1,234.56"
df['price'] = (
    df['price']
    .str.replace('¥', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Scenario 2: the date column holds text such as "2025年1月15日"
df['date'] = pd.to_datetime(df['date'], format='%Y年%m月%d日')

# Scenario 3: percentages stored as text such as "85%"
df['rate'] = df['rate'].str.rstrip('%').astype(float) / 100

# Scenario 4: a yes/no column stored as "是" / "否"
df['flag'] = df['flag'].map({'是': True, '否': False})

# Scenario 5: ratings such as "5星" — pull out the digits.
# expand=False makes extract return a Series rather than a one-column
# DataFrame, and the nullable Int64 dtype keeps rows without any digits as
# <NA> instead of failing the integer cast.
rating_digits = df['rating'].str.extract(r'(\d+)', expand=False)
df['rating'] = pd.to_numeric(rating_digits, errors='coerce').astype('Int64')

异常值处理策略对比

策略	优点	缺点	适用场景
删除	简单	丢数据	异常值少
替换为边界值	保留数据量	压缩分布	有明确合理范围
替换为中位数	鲁棒	改变分布	异常值不影响中位数
对数变换	压缩极端值	不能处理0/负值	右偏分布
分箱	简单	损失精度	模型输入