NumPy 与 Pandas 实战
数据分析两大利器——NumPy 快速数值运算,Pandas 灵活数据处理。
数据分析工具链
graph LR
%% Main pipeline: raw data -> Pandas cleaning -> NumPy computation -> visualization
RAW[原始数据] --> PANDAS[Pandas 清洗]
PANDAS --> NUMPY[NumPy 计算]
NUMPY --> VIZ[可视化]
%% Core Pandas objects
PANDAS --> DF[DataFrame]
PANDAS --> SERIES[Series]
%% Core NumPy object and capability
NUMPY --> ARRAY[ndarray]
NUMPY --> MATH[线性代数]
%% Visualization libraries
VIZ --> MPL[Matplotlib]
VIZ --> SNS[Seaborn]
%% Colour-code the three leading pipeline stages (orange / blue / green)
style RAW fill:#fff3e0,stroke:#f57c00,stroke-width:2px
style PANDAS fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
style NUMPY fill:#c8e6c9,stroke:#388e3c,stroke-width:2px
NumPy 基础
"""
NumPy:高性能数值计算
pip install numpy
"""
import numpy as np

# === Creating arrays ===
a = np.array([1, 2, 3, 4, 5])    # built from a Python list
b = np.zeros((3, 4))             # 3x4 matrix of zeros
c = np.ones((2, 3))              # 2x3 matrix of ones
d = np.arange(0, 10, 2)          # [0, 2, 4, 6, 8]
e = np.linspace(0, 1, 5)         # [0, 0.25, 0.5, 0.75, 1]
# Draw from a seeded Generator rather than the legacy global-state
# np.random.randn, so the sample is reproducible from run to run.
rng = np.random.default_rng(42)
f = rng.standard_normal((3, 3))  # 3x3 samples from the standard normal
print(f"形状: {a.shape}, 类型: {a.dtype}")
print(f"维度: {f.ndim}, 大小: {f.size}")

# === Array arithmetic (vectorized) ===
x = np.array([1, 2, 3, 4])
y = np.array([10, 20, 30, 40])
print(x + y)         # [11, 22, 33, 44]   element-wise addition
print(x * y)         # [10, 40, 90, 160]  element-wise multiplication
print(x ** 2)        # [1, 4, 9, 16]      element-wise square
print(np.sqrt(x))    # [1, 1.41, 1.73, 2] element-wise square root
print(np.dot(x, y))  # 300                dot product
# The operations above run inside NumPy's compiled loops. The equivalent
# pure-Python form, sum(p * q for p, q in zip(x.tolist(), y.tolist())),
# gives the same 300 but can be far slower on large arrays.

# === Indexing and slicing ===
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(matrix[0, 1])      # 2                single element
print(matrix[0])         # [1, 2, 3]        first row
print(matrix[:, 0])      # [1, 4, 7]        first column
print(matrix[0:2, 1:3])  # [[2, 3], [5, 6]] sub-matrix

# Boolean indexing: a boolean mask selects the elements where it is True.
data = np.array([15, 23, 8, 42, 31, 5, 19])
mask = data > 20
print(data[mask])       # [23, 42, 31]
print(data[data > 10])  # [15, 23, 42, 31, 19]

# === Statistics ===
scores = np.array([85, 92, 78, 95, 88, 73, 91])
print(f"均值: {scores.mean():.1f}")
# ndarray.std() defaults to ddof=0, i.e. the population standard deviation.
print(f"标准差: {scores.std():.1f}")
print(f"最大/最小: {scores.max()}/{scores.min()}")
print(f"中位数: {np.median(scores):.1f}")
print(f"百分位数 P90: {np.percentile(scores, 90):.1f}")

# === Reshaping ===
# NOTE: this section rebinds a, b, c and d from the array-creation section.
a = np.arange(12)
b = a.reshape(3, 4)  # reshape to 3x4
c = b.T              # transpose -> 4x3
d = b.flatten()      # flatten back to 1-D (returns a copy)
print(f"reshape: {b.shape}, transpose: {c.shape}")
Pandas 基础
"""
Pandas:数据分析核心
pip install pandas
"""
import pandas as pd
import numpy as np

# === Creating a DataFrame ===
df = pd.DataFrame({
    "姓名": ["张三", "李四", "王五", "赵六", "钱七"],
    "年龄": [25, 30, 28, 35, 22],
    "城市": ["北京", "上海", "广州", "北京", "深圳"],
    "薪资": [15000, 25000, 18000, 30000, 12000],
})
print(df)
print(f"\n形状: {df.shape}")  # (5, 4)
print(f"列名: {list(df.columns)}")
print("\n基本统计:")
print(df.describe())

# === Selecting and filtering ===
# Column selection
print(df["姓名"])            # a single label returns a Series
print(df[["姓名", "薪资"]])  # a list of labels returns a DataFrame

# Filter rows with a boolean condition
high_salary = df[df["薪资"] > 20000]
print("高薪员工:\n", high_salary)

# Combine conditions with & and parenthesize each comparison
bj_high = df[(df["城市"] == "北京") & (df["薪资"] > 20000)]
print("北京高薪:\n", bj_high)

# loc selects by label, iloc by integer position
print(df.loc[0, "姓名"])   # 张三
print(df.iloc[0:2, 0:2])   # first two rows, first two columns

# === Transforming data ===
# Add a derived column
df["税后"] = df["薪资"] * 0.85

# Sort (returns a new frame; df itself keeps its order)
df_sorted = df.sort_values("薪资", ascending=False)
print("按薪资排序:\n", df_sorted)

# Group and aggregate: several statistics for salary, one for age
city_stats = df.groupby("城市").agg({
    "薪资": ["mean", "max", "count"],
    "年龄": "mean",
})
print("城市统计:\n", city_stats)

# === Handling missing values ===
# Both target columns hold integers, which cannot represent NaN. Cast them to
# float up front instead of relying on the implicit int -> float upcast on
# assignment, which newer pandas versions warn about and are slated to reject.
df_with_nan = df.copy().astype({"薪资": "float64", "年龄": "float64"})
df_with_nan.loc[1, "薪资"] = np.nan
df_with_nan.loc[3, "年龄"] = np.nan
print(f"缺失值:\n{df_with_nan.isnull().sum()}")

# Fill with per-column replacement values. The salary median comes from the
# frame that actually has the gaps (median() skips NaN), because in real work
# there is no pristine copy to borrow statistics from.
# NOTE(review): 0 is not a plausible age; it only demonstrates a constant fill.
df_filled = df_with_nan.fillna({
    "薪资": df_with_nan["薪资"].median(),
    "年龄": 0,
})

# Drop every row that contains at least one missing value
df_dropped = df_with_nan.dropna()

# === Common operations ===
# value_counts: frequency of each distinct value
print(df["城市"].value_counts())

# apply: map each salary to a level (> 20000 高, > 15000 中, otherwise 低)
df["薪资等级"] = df["薪资"].apply(
    lambda x: "高" if x > 20000 else "中" if x > 15000 else "低"
)
print(df[["姓名", "薪资", "薪资等级"]])

# merge (table join): a left join keeps every row of df; names missing from
# dept get NaN in the 部门 column
dept = pd.DataFrame({
    "姓名": ["张三", "李四", "王五"],
    "部门": ["技术", "产品", "技术"],
})
merged = df.merge(dept, on="姓名", how="left")
print("合并后:\n", merged)
文件读写
"""
Pandas 读写各种格式
"""
# Cheat sheet of pandas read/write calls per file format. The values are code
# snippets stored as plain strings: they are printed below, never executed.
FORMATS = {
    "CSV": {
        "读取": 'pd.read_csv("data.csv", encoding="utf-8")',
        "写入": 'df.to_csv("out.csv", index=False, encoding="utf-8-sig")',
    },
    "Excel": {
        "读取": 'pd.read_excel("data.xlsx", sheet_name="Sheet1")',
        "写入": 'df.to_excel("out.xlsx", index=False)',
    },
    "JSON": {
        "读取": 'pd.read_json("data.json")',
        "写入": 'df.to_json("out.json", orient="records", force_ascii=False)',
    },
    "Parquet": {
        "读取": 'pd.read_parquet("data.parquet")',
        "写入": 'df.to_parquet("out.parquet")',
    },
    "SQL": {
        "读取": 'pd.read_sql("SELECT * FROM users", conn)',
        "写入": 'df.to_sql("users", conn, if_exists="replace")',
    },
}

# Print one section per format, listing its read and write snippet. The loop
# bodies must be indented; written flush-left they raise IndentationError.
for fmt, ops in FORMATS.items():
    print(f"\n{fmt}:")
    for op, code in ops.items():
        print(f" {op}: {code}")
本章小结
| 工具 | 核心对象 | 适用场景 | 安装 |
|---|---|---|---|
| NumPy | ndarray | 数值计算、矩阵运算 | pip install numpy |
| Pandas | DataFrame | 数据清洗、分析 | pip install pandas |
| Matplotlib | Figure | 基础绘图 | pip install matplotlib |
下一章:Web 开发——FastAPI 构建 RESTful API。