Pandas时间序列：我总结了8个时间处理技巧，轻松搞定日期数据分析

大家好，我是正在实战各种AI项目的程序员晚枫。

今天学习时间序列数据处理，这是金融、销售、运营等领域必备的技能。

无论是分析日活趋势、计算同比环比，还是预测未来销量，都需要掌握时间序列的处理方法。

DatetimeIndex：时间索引

创建时间序列

import pandas as pd
import numpy as np

# 方式1：从字符串转换
dates = pd.to_datetime(['2024-01-01', '2024-01-02', '2024-01-03'])

# 方式2：生成日期范围
dates = pd.date_range(start='2024-01-01', periods=10, freq='D')  # 每日
dates = pd.date_range(start='2024-01-01', end='2024-12-31', freq='M')  # 每月
dates = pd.date_range(start='2024-01-01', periods=12, freq='MS')  # 每月初

# 常见频率
# D: 日历日, B: 工作日
# W: 周, M: 月末, MS: 月初
# Q: 季末, A: 年末
# H: 小时, T/min: 分钟, S: 秒

创建时间序列DataFrame

# Simulated sales: 365 consecutive days starting 2024-01-01, with a random
# integer amount in [1000, 5000) per day
sales_dates = pd.date_range('2024-01-01', periods=365, freq='D')
sales_values = np.random.randint(1000, 5000, 365)
df = pd.DataFrame({'日期': sales_dates, '销售额': sales_values})

# Promote the date column to the index
df = df.set_index('日期')
print(df.head())

时间索引切片

# Select rows by date using partial-string indexing on the DatetimeIndex.
# Row selection has to go through .loc: df['2024-01'] is treated as a column
# lookup and raises KeyError on pandas >= 2.0.
df.loc['2024-01']                   # every row in January 2024
df.loc['2024-01':'2024-03']         # January through March (both ends inclusive)
df.loc['2024-01-15':'2024-01-20']   # an explicit date range

# A whole year
df.loc['2024']

# A single month
df.loc['2024-02']

提取时间组件

# Derive calendar components from the DatetimeIndex, one column per component.
# dayofweek is numbered 0 = Monday ... 6 = Sunday.
idx = df.index
for column, attribute in (
    ('年', 'year'),
    ('月', 'month'),
    ('日', 'day'),
    ('星期', 'dayofweek'),
    ('季度', 'quarter'),
):
    df[column] = getattr(idx, attribute)
df['是否周末'] = idx.dayofweek >= 5

# Start and end timestamp of the month each row falls in
month_period = idx.to_period('M')
df['月初'] = month_period.to_timestamp()
df['月末'] = month_period.to_timestamp(how='end')

重采样Resample

# Aggregate the sales column only. If df also carries helper columns
# (calendar parts, month start/end timestamps), resampling the whole frame
# would try to sum those too, and datetime columns cannot be summed.
sales_only = df[['销售额']]

# Daily -> monthly. 'ME' (month end) replaces the 'M' alias deprecated in
# pandas 2.2; on older pandas versions use 'M'.
monthly = sales_only.resample('ME').sum()      # monthly total
monthly = sales_only.resample('ME').mean()     # monthly average

# Weekly statistics
weekly = sales_only.resample('W').sum()        # weekly total, weeks end on Sunday
weekly = sales_only.resample('W-MON').sum()    # weekly total, weeks end on Monday

# Quarterly statistics ('QE' replaces the deprecated 'Q')
quarterly = sales_only.resample('QE').sum()

# Several aggregations at once
stats = sales_only.resample('ME').agg({
    '销售额': ['sum', 'mean', 'max', 'min']
})

移动窗口计算

sales_col = df['销售额']
# One 7-row window object, reused for every 7-row statistic below
weekly_window = sales_col.rolling(window=7)

# Moving averages over 7 and 30 rows
df['MA7'] = weekly_window.mean()
df['MA30'] = sales_col.rolling(window=30).mean()

# Exponentially weighted mean (recent rows weigh more)
df['EMA'] = sales_col.ewm(span=7).mean()

# Other window statistics: rolling sum and standard deviation
df['7天总和'] = weekly_window.sum()
df['7天标准差'] = weekly_window.std()

滞后与差分

sales_col = df['销售额']
prev_day = sales_col.shift(1)    # value one row earlier
prev_week = sales_col.shift(7)   # value seven rows earlier

# Lagged copies of the sales column
df['昨日销售'] = prev_day
df['上周同日'] = prev_week

# Percentage change against the lagged values
df['日环比'] = (sales_col - prev_day) / prev_day * 100
df['周同比'] = (sales_col - prev_week) / prev_week * 100

# First difference (current row minus the previous row)
df['日增量'] = sales_col.diff()

实战：完整的销售时间分析

import pandas as pd
import numpy as np

# 生成一年的销售数据
np.random.seed(42)
dates = pd.date_range('2024-01-01', '2024-12-31', freq='D')
base_sales = 3000
trend = np.linspace(0, 1000, len(dates))  # 增长趋势
seasonal = 500 * np.sin(2 * np.pi * np.arange(len(dates)) / 365.25)  # 季节性
noise = np.random.normal(0, 200, len(dates))  # 随机波动

sales = base_sales + trend + seasonal + noise
sales = np.maximum(sales, 1000)  # 最小值1000

df = pd.DataFrame({'销售额': sales}, index=dates)

print("=== 时间序列分析报告 ===\n")

# 1. 基础统计
print("【年度概览】")
print(f"总销售额: ¥{df['销售额'].sum():,.0f}")
print(f"日均销售: ¥{df['销售额'].mean():,.0f}")
print(f"最高单日: ¥{df['销售额'].max():,.0f} ({df['销售额'].idxmax().date()})")
print(f"最低单日: ¥{df['销售额'].min():,.0f} ({df['销售额'].idxmin().date()})")

# 2. 月度趋势
print("\n【月度销售】")
monthly = df.resample('M')['销售额'].sum()
print(monthly)

# 3. 星期模式
print("\n【星期销售模式】")
df['星期'] = df.index.day_name()
weekly_pattern = df.groupby('星期')['销售额'].mean()
print(weekly_pattern)

# 4. 移动平均趋势
print("\n【7日移动平均（最近10天）】")
df['MA7'] = df['销售额'].rolling(7).mean()
print(df[['销售额', 'MA7']].tail(10))

# 5. 同比增长（假设有去年数据）
# df['去年今日'] = ...
# df['同比增长'] = (df['销售额'] - df['去年今日']) / df['去年今日']

性能对比：不同时间处理方式

import pandas as pd
import numpy as np

import timeit

# 100,000 hourly observations. 'h' replaces the 'H' alias deprecated in
# pandas 2.2; on older pandas versions use 'H'.
dates = pd.date_range('2020-01-01', periods=100000, freq='h')
df = pd.DataFrame({'value': np.random.randn(100000)}, index=dates)

# The timeit module is used so the snippet runs as a plain Python script.
# Inside IPython/Jupyter the shorthand is `%timeit <expression>`.
_REPEATS = 5

# resample: hourly -> daily mean (the article quotes about 5 ms per call)
resample_seconds = timeit.timeit(lambda: df.resample('D').mean(), number=_REPEATS) / _REPEATS
print(f"resample('D').mean(): {resample_seconds * 1000:.1f} ms")

# rolling: 7-day time-based window mean (the article quotes about 10 ms per call)
rolling_seconds = timeit.timeit(lambda: df.rolling('7D').mean(), number=_REPEATS) / _REPEATS
print(f"rolling('7D').mean(): {rolling_seconds * 1000:.1f} ms")

# A hand-written Python loop over the rows:
# the article estimates tens of seconds to several minutes

进阶用法

重采样详解

# Daily bars -> monthly bars. 'ME' (month end) replaces the 'M' alias
# deprecated in pandas 2.2; on older pandas versions use 'M'.
monthly = df.resample('ME').agg({
    'open': 'first',    # first open of the month
    'close': 'last',    # last close of the month
    'high': 'max',      # highest high within the month
    'low': 'min',       # lowest low within the month
    'volume': 'sum'     # total volume within the month
})

# Other resampling frequencies
df.resample('2W')    # every two weeks
df.resample('QS')    # quarter start
df.resample('BYE')   # last business day of the year ('BA' before pandas 2.2)

滚动窗口高级用法

price_col = df['price']

# Several statistics over the same 20-row rolling window; each dict key names
# an output column and maps to the aggregation of the same name
stat_names = ('mean', 'std', 'min', 'max')
rolling_stats = price_col.rolling(20).agg({stat: stat for stat in stat_names})

# Exponentially weighted moving averages (EMA) with spans 12 and 26
for span in (12, 26):
    df[f'ema_{span}'] = price_col.ewm(span=span).mean()

# Expanding window: running maximum from the first row up to the current row
df['cummax'] = price_col.expanding().max()

避坑指南

❌ 坑1：时区问题

# A Timestamp built from a plain string carries no timezone
ts = pd.Timestamp('2025-01-01 10:00')
print(ts.tz)  # None

# Attach a timezone to the naive timestamp (wall-clock time is unchanged)
ts = ts.tz_localize(tz='Asia/Shanghai')

# Express the same instant in another timezone
ts_utc = ts.tz_convert(tz='UTC')
print(ts_utc)  # 2025-01-01 02:00:00+00:00

❌ 坑2：resample的label和closed

# The defaults of `closed` and `label` depend on the frequency: month-end,
# quarter-end, year-end and weekly frequencies default to closed='right',
# label='right'; every other frequency defaults to closed='left', label='left'.
# ('ME' replaces the 'M' alias deprecated in pandas 2.2.)
df.resample('ME').mean()  # January's bin is labelled 01-31 (right edge)

# Label each bin with its left edge instead
df.resample('ME', label='left').mean()  # January's bin is labelled 12-31 of the previous year

# Right-closed, right-labelled bins spelled out explicitly
# (identical to the month-end default above)
df.resample('ME', closed='right', label='right').mean()

实战案例：分析股票价格数据

import pandas as pd
import numpy as np

np.random.seed(42)
# 模拟股票日K线数据
dates = pd.bdate_range('2024-01-01', '2025-12-31')  # 工作日
n = len(dates)
price = 100 * np.cumprod(1 + np.random.normal(0.001, 0.02, n))

df = pd.DataFrame({
    'open': price * (1 + np.random.uniform(-0.01, 0.01, n)),
    'close': price,
    'high': price * (1 + np.abs(np.random.uniform(0, 0.02, n))),
    'low': price * (1 - np.abs(np.random.uniform(0, 0.02, n))),
    'volume': np.random.randint(1000000, 10000000, n)
}, index=dates)

# 1. 计算技术指标
df['MA5'] = df['close'].rolling(5).mean()
df['MA20'] = df['close'].rolling(20).mean()
df['EMA12'] = df['close'].ewm(span=12).mean()
df['EMA26'] = df['close'].ewm(span=26).mean()
df['MACD'] = df['EMA12'] - df['EMA26']
df['returns'] = df['close'].pct_change()

# 2. 月度汇总
monthly = df.resample('M').agg({
    'open': 'first', 'close': 'last',
    'high': 'max', 'low': 'min', 'volume': 'sum'
})
monthly['change_pct'] = monthly['close'].pct_change() * 100

print("=== 月度收益 ===")
print(monthly[['close', 'change_pct']].tail(12).round(2))

# 3. 波动率分析
df['volatility_20d'] = df['returns'].rolling(20).std() * np.sqrt(252)  # 年化波动率
print(f"\n当前20日年化波动率: {df['volatility_20d'].iloc[-1]*100:.1f}%")

时间序列处理速查

# Create a daily time index. periods=len(df) sizes the range to the frame;
# a hard-coded 365 raises a length-mismatch ValueError for any other row count.
df['date'] = pd.date_range('2025-01-01', periods=len(df), freq='D')
df = df.set_index('date')

# Extract date components from the index
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
df['weekday'] = df.index.day_name()  # Monday, Tuesday...
df['is_weekend'] = df.index.dayofweek >= 5  # dayofweek: 0 = Monday, so 5 and 6 are the weekend
df['quarter'] = df.index.quarter

# Shifted dates
df['yesterday'] = df.index - pd.Timedelta(days=1)
df['next_month'] = df.index + pd.DateOffset(months=1)
# MonthEnd(0) rolls forward to the month end and leaves dates already on it unchanged
df['month_end'] = df.index + pd.offsets.MonthEnd(0)

# Whole days elapsed between each index date and now
df['days_since'] = (pd.Timestamp('today') - df.index).days

时区处理

# Attach a timezone to the naive index
shanghai_index = df.index.tz_localize('Asia/Shanghai')
df.index = shanghai_index

# Convert the timezone-aware index to UTC
utc_index = shanghai_index.tz_convert('UTC')
df.index = utc_index

# Drop the timezone information again (keeps the UTC wall-clock times)
df.index = utc_index.tz_localize(None)

时间序列处理场景速查

# Scenario 1: convert a date column to datetime
df['date'] = pd.to_datetime(df['date'])

# Scenario 2: extract year / month / weekday name
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['weekday'] = df['date'].dt.day_name()

# Scenario 3: whole days elapsed between each date and now
df['days_since'] = (pd.Timestamp('today') - df['date']).dt.days

# Scenario 4: monthly totals. 'ME' (month end) replaces the 'M' alias
# deprecated in pandas 2.2; on older pandas versions use 'M'.
df.resample('ME', on='date')['sales'].sum()

# Scenario 5: moving average over 7 rows (7 days when there is one row per day)
df['ma7'] = df['sales'].rolling(7).mean()

# Scenario 6: year-over-year growth. pct_change(12) compares each row with the
# row 12 positions earlier, so it is year-over-year only when every row is one
# month; aggregate to monthly first if the data is daily.
df['yoy'] = df['sales'].pct_change(12)

# Scenario 7: period-over-period growth against the previous row
# (month-over-month only when every row is one month)
df['mom'] = df['sales'].pct_change(1)

# Scenario 8: generate a date range of business days
dates = pd.date_range('2025-01-01', '2025-12-31', freq='B')