

大家好,我是正在实战各种AI项目的程序员晚枫。
今天继续学习NumPy进阶内容:统计函数、线性代数运算、随机数生成,以及性能优化技巧。
这些函数是数据分析的核心工具,掌握它们,你就能处理90%的数值计算需求。
统计函数
基础统计量
import numpy as np

# One-dimensional sample used to demonstrate the basic descriptive statistics.
data = np.array([85, 92, 78, 90, 88, 95, 82])

print(np.sum(data))     # total of all elements
print(np.mean(data))    # arithmetic mean
print(np.median(data))  # middle value of the sorted data
print(np.std(data))     # standard deviation (population form, ddof=0 by default)
print(np.var(data))     # variance (population form, ddof=0 by default)
print(np.min(data), np.max(data))        # smallest and largest values
print(np.argmin(data), np.argmax(data))  # indices of those two extremes
print(np.ptp(data))     # "peak to peak" range: max - min
百分位数
scores = np.array([65, 70, 75, 80, 85, 90, 95, 98, 100])

# Print the 25th, 50th (median), 75th and 90th percentiles in turn.
# np.percentile interpolates linearly between neighbours by default.
for q in (25, 50, 75, 90):
    print(np.percentile(scores, q))
多维数组统计
# 3 rows x 4 columns of scores.
scores = np.array([
    [85, 92, 78, 90],
    [88, 85, 92, 87],
    [90, 88, 85, 95],
])

print(np.mean(scores, axis=0))  # axis=0 collapses the rows: one mean per column
print(np.sum(scores, axis=1))   # axis=1 collapses the columns: one total per row
print(np.max(scores, axis=0))   # largest value in each column
常用数学函数
四舍五入与取整
arr = np.array([3.14, 2.718, 1.414, 4.669])

# NOTE: np.round rounds exact halves to the nearest even value, which differs
# from schoolbook "round half up" for inputs such as 0.5 or 2.5.
print(np.round(arr, 2))  # round to 2 decimal places
print(np.floor(arr))     # round down to the nearest integer value
print(np.ceil(arr))      # round up to the nearest integer value
print(np.trunc(arr))     # drop the fractional part (round toward zero)
条件统计
sales = np.array([120, 150, 80, 200, 95, 180, 110])

# Summing a boolean mask counts the True entries: how many values exceed 100.
print(np.sum(sales > 100))

# Indexing with the mask keeps only those values, so this is their total.
print(np.sum(sales[sales > 100]))

# Filter first, then aggregate: mean of the values that are at least 50.
filtered = sales[sales >= 50]
print(np.mean(filtered))
累积运算
revenue = np.array([100, 120, 90, 150, 130])

print(np.cumsum(revenue))  # running total
# The running product reaches 21,060,000,000, which does not fit in 32 bits.
# Pin the accumulator to int64 so the result is correct even where NumPy's
# default integer is 32-bit.
print(np.cumprod(revenue, dtype=np.int64))
print(np.diff(revenue))    # difference between consecutive elements
线性代数运算
矩阵创建
# Two 2x2 matrices represented as 2-D arrays.
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

print(A)
矩阵运算
# A and B are the 2x2 matrices created in the matrix-creation snippet.
print(A @ B)             # matrix product (not element-wise multiplication)
print(A.T)               # transpose
print(np.linalg.inv(A))  # inverse; raises LinAlgError if A is singular
print(np.linalg.det(A))  # determinant

# eig returns the eigenvalues and a matrix whose columns are the matching
# eigenvectors.
eigenvalues, eigenvectors = np.linalg.eig(A)
解线性方程组
# Solve the linear system A @ x = b, i.e.
#   2x + y = 5
#    x - y = 1
A = np.array([[2, 1], [1, -1]])
b = np.array([5, 1])

# solve() is preferred over inv(A) @ b: it avoids forming the inverse.
solution = np.linalg.solve(A, b)
print(solution)
随机数生成
# Seed the global generator so the calls below are reproducible.
np.random.seed(42)

# The results of these calls are not stored; they only illustrate the API.
np.random.rand(3, 3)         # 3x3 uniform samples in [0, 1)
np.random.uniform(1, 10, 5)  # 5 uniform samples in [1, 10)

np.random.randn(1000)        # 1000 standard-normal samples
np.random.normal(0, 1, 100)  # 100 normal samples with mean 0, std 1

np.random.randint(1, 7, 10)  # 10 integers in [1, 7), i.e. 1..6

# Weighted sampling: p gives the probability of each option.
np.random.choice(['A', 'B', 'C'], size=5, p=[0.5, 0.3, 0.2])

arr = np.arange(10)
np.random.shuffle(arr)       # shuffles in place and returns None
实战:销售数据分析
import numpy as np

# Simulate 30 days of sales: normal(mean=1000, std=200), truncated to integers.
np.random.seed(42)
daily_sales = np.random.normal(1000, 200, 30).astype(int)

# Headline statistics.
print("=== 销售数据报告 ===")
print(f"总销售额: ¥{np.sum(daily_sales):,}")
print(f"日均销售: ¥{np.mean(daily_sales):.0f}")
print(f"最高单日: ¥{np.max(daily_sales)}")
print(f"最低单日: ¥{np.min(daily_sales)}")
print(f"标准差: ¥{np.std(daily_sales):.0f} (波动程度)")

# Days at or above the 90th percentile, i.e. roughly the best 10% of days.
top_10_threshold = np.percentile(daily_sales, 90)
top_days = daily_sales[daily_sales >= top_10_threshold]
print(f"\n优秀天数(前10%): {len(top_days)}天")
print(f"优秀标准: ¥{top_10_threshold:.0f}以上")

# NOTE(review): reshape(5, 6) groups the 30 days into five blocks of SIX days
# each, which the report labels as weeks — confirm that this is intended.
weekly_sales = daily_sales.reshape(5, 6).sum(axis=1)
print(f"\n每周销售: {weekly_sales}")
print(f"最佳周: 第{np.argmax(weekly_sales)+1}周")
性能优化技巧
# NOTE: `arr` and `complex_operation` are not defined in this snippet; `arr`
# is expected to be an existing NumPy array and `complex_operation` stands in
# for per-element logic that cannot be vectorized.

# Slow: doubling each element in a Python-level loop.
result = []
for i in range(len(arr)):
    result.append(arr[i] * 2)

# Fast: the same doubling as one vectorized expression.
result = arr * 2

# Vectorized standardization: subtract the mean, divide by the std.
# This divides by zero when every element of arr is identical.
standardized = (arr - np.mean(arr)) / np.std(arr)

# When a loop is unavoidable, allocate the output once up front instead of
# growing a list. zeros_like keeps arr's dtype, so values stored into an
# integer array are cast to integers.
result = np.zeros_like(arr)
for i in range(len(arr)):
    result[i] = complex_operation(arr[i])
性能对比:NumPy vs 纯Python实现
import math
import time

import numpy as np

# One million samples, held both as an ndarray and as a plain Python list.
data = np.random.randn(1000000)
py_list = data.tolist()

# Time with perf_counter(): it is monotonic and high-resolution. time.time()
# can be too coarse to register the short NumPy run, leaving np_time == 0.0
# and making the speed-up ratio below raise ZeroDivisionError.
start = time.perf_counter()
mean_val = sum(py_list) / len(py_list)
var_val = sum((x - mean_val) ** 2 for x in py_list) / len(py_list)
std_val = math.sqrt(var_val)
py_time = time.perf_counter() - start

start = time.perf_counter()
np_std = np.std(data)
np_time = time.perf_counter() - start

print(f"纯Python: {py_time:.4f}秒, 结果={std_val:.6f}")
print(f"NumPy: {np_time:.4f}秒, 结果={np_std:.6f}")
print(f"NumPy快了{py_time/np_time:.0f}倍")
进阶用法
矩阵分解与应用
# Thin SVD of a random 5x3 matrix: A = U @ diag(S) @ Vt.
A = np.random.randn(5, 3)
U, S, Vt = np.linalg.svd(A, full_matrices=False)
print("奇异值:", S)

# Multiplying the three factors back together recovers A up to rounding.
A_reconstructed = U @ np.diag(S) @ Vt
print("还原误差:", np.linalg.norm(A - A_reconstructed))

# Rank-k approximation: keep only the k largest singular values and vectors.
k = 2
A_low_rank = U[:, :k] @ np.diag(S[:k]) @ Vt[:k, :]
print("压缩后误差:", np.linalg.norm(A - A_low_rank))
# Ratio of original element count to elements stored in the truncated factors.
# For a matrix this small the factors hold 18 numbers against A's 15, so the
# printed ratio is below 1.
print(f"压缩率: {A.size / (U[:,:k].size + S[:k].size + Vt[:k,:].size):.1f}倍")
多维数组的轴操作
# 3-D sample array of shape (30, 24, 5).
# NOTE(review): the variable names below suggest the axes are
# (day, hour, metric) — confirm against the original article.
data = np.random.randn(30, 24, 5)

daily_mean = data.mean(axis=1)          # collapse axis 1 -> shape (30, 5)
hourly_mean = data.mean(axis=0)         # collapse axis 0 -> shape (24, 5)
max_per_metric = data.max(axis=(0, 1))  # collapse axes 0 and 1 -> shape (5,)

# Average over the last axis, then take the index of the largest value along
# axis 1 for each entry of axis 0 -> shape (30,).
best_hour_per_day = data.mean(axis=2).argmax(axis=1)
向量化实现移动平均
def moving_avg_cumsum(arr, window):
    """Return the simple moving average of *arr* using a running sum.

    Args:
        arr: 1-D sequence or array of numbers.
        window: Number of consecutive elements averaged per output value;
            must be at least 1.

    Returns:
        Array of length ``len(arr) - window + 1`` (empty when *window* is
        larger than the input).

    Raises:
        ValueError: If *window* is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")
    # np.cumsum always allocates a new array, so the in-place update below
    # never modifies the caller's data.
    running = np.cumsum(arr)
    # Subtracting the running sum from `window` positions earlier turns each
    # entry from index `window` onward into the sum of the last `window` items.
    running[window:] = running[window:] - running[:-window]
    # The first complete window ends at index window - 1.
    return running[window - 1:] / window
def moving_avg_convolve(arr, window):
    """Return the simple moving average of *arr* using a convolution.

    Args:
        arr: 1-D sequence or array of numbers.
        window: Number of consecutive elements averaged per output value;
            must be at least 1.

    Returns:
        Array of length ``len(arr) - window + 1`` (empty when *window* is
        larger than the input).

    Raises:
        ValueError: If *window* is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")
    values = np.asarray(arr)
    if window > values.size:
        # With mode='valid', np.convolve swaps its operands when the kernel is
        # longer than the data and would return a meaningless non-empty result;
        # no complete window exists, so return an empty array instead.
        return np.empty(0, dtype=float)
    # A box kernel whose weights sum to 1 averages each window.
    kernel = np.ones(window) / window
    return np.convolve(values, kernel, mode='valid')
def moving_avg_loop(arr, window):
    """Return the simple moving average of *arr* with an explicit loop.

    This is the straightforward reference implementation: it averages every
    window separately, so it is much slower than the vectorized variants.

    Args:
        arr: 1-D sequence or array of numbers.
        window: Number of consecutive elements averaged per output value;
            must be at least 1.

    Returns:
        Array of length ``len(arr) - window + 1`` (empty when *window* is
        larger than the input).

    Raises:
        ValueError: If *window* is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")
    # asarray lets plain lists work too; slices of a list have no .mean().
    values = np.asarray(arr)
    n_windows = len(values) - window + 1
    return np.array(
        [values[start:start + window].mean() for start in range(n_windows)]
    )
import timeit

data = np.random.randn(100000)
window = 20

# `%timeit` is an IPython magic and is a syntax error in a plain .py file, so
# time each implementation with the standard-library timeit module instead.
# Taking the best of a few single runs keeps the slow loop version affordable.
for fn in (moving_avg_cumsum, moving_avg_convolve, moving_avg_loop):
    best = min(timeit.repeat(lambda fn=fn: fn(data, window), number=1, repeat=3))
    print(f"{fn.__name__}: {best * 1e3:.3f} ms per call")
避坑指南
❌ 坑1:除以零的警告
arr = np.array([1, 2, 0, 4])

# Pitfall: dividing by an array that contains 0 emits a RuntimeWarning and
# produces inf at that position.
result = 1 / arr
print(result)

# Fix: suppress the warning only around the division, then overwrite the
# entries that came from a zero divisor with a chosen fallback value (0 here).
with np.errstate(divide='ignore', invalid='ignore'):
    result = np.divide(1, arr)
result[arr == 0] = 0
❌ 坑2:浮点数精度问题
# Pitfall: 0.1 + 0.2 is not exactly 0.3 in binary floating point, so an exact
# equality test prints False.
a = np.array([0.1, 0.2])
print(a.sum() == 0.3)

# Fix: compare within a tolerance.
print(np.isclose(a.sum(), 0.3))

# allclose is the array-wide form: True only if every element is close.
b = np.array([0.3])
print(np.allclose(a.sum(), b))
❌ 坑3:随机数的可重复性
# No seed is set in this snippet, so this output depends on whatever state the
# global generator is already in.
print(np.random.rand(3))

# Seeding the global generator makes the following draws reproducible.
np.random.seed(42)
print(np.random.rand(3))

# A dedicated RandomState keeps its own state, separate from the global one.
rng = np.random.RandomState(42)
print(rng.rand(3))

# default_rng returns a Generator, the interface NumPy recommends for new code.
# Its stream differs from RandomState's even for the same seed.
rng = np.random.default_rng(42)
print(rng.random(3))
实战案例:分析电商平台用户消费数据
import numpy as np

np.random.seed(42)
n_users = 10000

# Synthetic per-user attributes. The order of the random draws matters: with
# a fixed seed it determines every value below.
user_id = np.arange(1, n_users + 1)
register_days = np.random.exponential(365, n_users).astype(int)
total_spend = np.random.exponential(2000, n_users)
order_count = np.random.poisson(8, n_users)
# Clamp the divisor to at least 1 so users with zero orders do not cause a
# division by zero.
avg_order_value = total_spend / np.maximum(order_count, 1)
last_purchase_days = np.random.exponential(30, n_users).astype(int)

# Overall spending summary.
print("=" * 50)
print("电商平台用户消费分析报告")
print("=" * 50)
print(f"总用户数: {n_users:,}")
print(f"平均消费: ¥{total_spend.mean():,.0f}")
print(f"消费中位数: ¥{np.median(total_spend):,.0f}")
print(f"消费标准差: ¥{total_spend.std():,.0f}")

# Two boolean flags per user: purchased within the last 30 days, and spend
# above the 80th percentile.
is_active = last_purchase_days < 30
is_high_value = total_spend > np.percentile(total_spend, 80)

# The two flags split the users into four mutually exclusive segments.
vip = is_active & is_high_value
potential = is_active & ~is_high_value
at_risk = ~is_active & is_high_value
lost = ~is_active & ~is_high_value

# The mean of a boolean mask is the fraction of True entries.
print(f"\nVIP用户: {vip.sum()} ({vip.mean()*100:.1f}%)")
print(f"潜力用户: {potential.sum()} ({potential.mean()*100:.1f}%)")
print(f"风险用户: {at_risk.sum()} ({at_risk.mean()*100:.1f}%)")
print(f"流失用户: {lost.sum()} ({lost.mean()*100:.1f}%)")

# Average order value within each segment.
for name, mask in [("VIP", vip), ("潜力", potential), ("风险", at_risk), ("流失", lost)]:
    print(f"{name}用户平均客单价: ¥{avg_order_value[mask].mean():.0f}")
下节预告
下一课我们将进入Pandas,这是数据分析最核心的库。
你将学会:
- Series和DataFrame数据结构
- 如何创建和操作表格数据
👉 继续阅读:Pandas入门-Series和DataFrame
💬 加入学习交流群
扫码加入Python学习交流群,和数千名同学一起进步:
👉 点击加入交流群
群里不定期分享:
- 数据分析实战案例
- Python学习资料
- 求职面试经验
- 行业最新动态
推荐:AI Python数据分析实战营
🎁 限时福利:送《利用Python进行数据分析》实体书
👉 点击了解详情
课程导航
上一篇: NumPy基础-数组操作
下一篇: Pandas入门-Series和DataFrame
PS:NumPy的统计和数学函数是数据分析的基础。熟练掌握,后面的学习会事半功倍。
📚 推荐教材
主教材:《Excel+Python 飞速搞定数据分析与处理(图灵出品)》
💬 联系我
主营业务:AI 编程培训、企业内训、技术咨询
🎓 AI 编程实战课程
想系统学习 AI 编程?程序员晚枫的 AI 编程实战课 帮你从零上手!