NumPy 与 Pandas 实战
High Contrast
Dark Mode
Light Mode
Sepia
Forest
1 min read · 98 words

NumPy 与 Pandas 实战

数据分析两大利器——NumPy 快速数值运算,Pandas 灵活数据处理。

数据分析工具链

graph LR
    RAW[原始数据] --> PANDAS[Pandas 清洗]
    PANDAS --> NUMPY[NumPy 计算]
    NUMPY --> VIZ[可视化]
    PANDAS --> DF[DataFrame]
    PANDAS --> SERIES[Series]
    NUMPY --> ARRAY[ndarray]
    NUMPY --> MATH[线性代数]
    VIZ --> MPL[Matplotlib]
    VIZ --> SNS[Seaborn]
    style RAW fill:#fff3e0,stroke:#f57c00,stroke-width:2px
    style PANDAS fill:#e3f2fd,stroke:#1565c0,stroke-width:2px
    style NUMPY fill:#c8e6c9,stroke:#388e3c,stroke-width:2px

NumPy 基础

"""
NumPy:高性能数值计算
pip install numpy
"""
import numpy as np
# === Array construction ===
# Each line shows a different ndarray factory; the trailing comment gives the
# expected contents.
a = np.array([1, 2, 3, 4, 5])                 # built from a Python list
b = np.zeros(shape=(3, 4))                    # 3x4 matrix of zeros
c = np.ones(shape=(2, 3))                     # 2x3 matrix of ones
d = np.arange(0, 10, 2)                       # [0, 2, 4, 6, 8]
e = np.linspace(0, 1, num=5)                  # [0, 0.25, 0.5, 0.75, 1]
f = np.random.standard_normal(size=(3, 3))    # 3x3 standard-normal samples
# Inspect the basic metadata every ndarray carries.
print(f"形状: {a.shape}, 类型: {a.dtype}")
print(f"维度: {f.ndim}, 大小: {f.size}")
# === Vectorised arithmetic ===
# Every operation below is applied element by element, without a Python loop.
x = np.array([1, 2, 3, 4])
y = np.array([10, 20, 30, 40])
print(np.add(x, y))        # [11, 22, 33, 44]  element-wise sum
print(np.multiply(x, y))   # [10, 40, 90, 160] element-wise product
print(np.square(x))        # [1, 4, 9, 16]     element-wise square
print(np.sqrt(x))          # [1, 1.41, 1.73, 2] element-wise square root
print(x.dot(y))            # 300  dot product
# Pure-Python equivalent of the dot product, for comparison. The original
# note puts it at roughly 100x slower; that figure is not measured here.
# python_sum = sum([a*b for a,b in zip(list_x, list_y)])
# === Indexing and slicing ===
matrix = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
print(matrix[0][1])      # 2        single element (row 0, column 1)
print(matrix[0, :])      # [1,2,3]  first row
print(matrix[:, 0])      # [1,4,7]  first column
print(matrix[:2, 1:])    # [[2,3],[5,6]] sub-matrix: rows 0-1, columns 1-2
# Boolean indexing: a comparison yields a boolean array that selects the
# elements where it is True.
data = np.array([15, 23, 8, 42, 31, 5, 19])
mask = np.greater(data, 20)
print(data[mask])                    # [23, 42, 31]
print(data[np.greater(data, 10)])    # [15, 23, 42, 31, 19]
# === Descriptive statistics ===
scores = np.array([85, 92, 78, 95, 88, 73, 91])
print(f"均值: {np.mean(scores):.1f}")
# np.std is called without ddof, so NumPy's default (ddof=0) applies.
print(f"标准差: {np.std(scores):.1f}")
print(f"最大/最小: {np.max(scores)}/{np.min(scores)}")
print(f"中位数: {np.median(scores):.1f}")
print(f"百分位数 P90: {np.percentile(scores, 90):.1f}")
# === Reshaping ===
a = np.arange(12)
b = np.reshape(a, (3, 4))    # 12 elements viewed as 3 rows x 4 columns
c = np.transpose(b)          # transpose -> 4x3
d = b.flatten()              # flattened back to a 1-D array
print(f"reshape: {b.shape}, transpose: {c.shape}")

Pandas 基础

"""
Pandas:数据分析核心
pip install pandas
"""
import pandas as pd
import numpy as np
# === Build a DataFrame ===
# Column-oriented construction: each dict key becomes one column.
df = pd.DataFrame({
    "姓名": ["张三", "李四", "王五", "赵六", "钱七"],
    "年龄": [25, 30, 28, 35, 22],
    "城市": ["北京", "上海", "广州", "北京", "深圳"],
    "薪资": [15000, 25000, 18000, 30000, 12000],
})
print(df)
print(f"\n形状: {df.shape}")    # (5, 4)
print(f"列名: {list(df.columns)}")
print("\n基本统计:")
print(df.describe())

# === Selecting and filtering ===
# Column selection
print(df["姓名"])               # single label -> Series
print(df[["姓名", "薪资"]])     # list of labels -> DataFrame
# Boolean-mask filter. Frames are embedded in an f-string rather than passed
# as a second print() argument: print's separator would otherwise put a stray
# space in front of the header row and misalign it.
high_salary = df[df["薪资"] > 20000]
print(f"高薪员工:\n{high_salary}")
# Several conditions: combine masks with &, parenthesising each comparison.
bj_high = df[(df["城市"] == "北京") & (df["薪资"] > 20000)]
print(f"北京高薪:\n{bj_high}")
# loc selects by label, iloc by integer position.
print(df.loc[0, "姓名"])        # 张三
print(df.iloc[0:2, 0:2])        # first two rows, first two columns

# === Transforming data ===
# Derived column
df["税后"] = df["薪资"] * 0.85
# Sorting (returns a new frame; df itself keeps its row order)
df_sorted = df.sort_values("薪资", ascending=False)
print(f"按薪资排序:\n{df_sorted}")
# Group-by aggregation: several statistics for salary, one for age.
city_stats = df.groupby("城市").agg({
    "薪资": ["mean", "max", "count"],
    "年龄": "mean",
})
print(f"城市统计:\n{city_stats}")

# === Missing values ===
# Cast the integer columns to float BEFORE inserting NaN. Writing NaN into an
# int64 column relies on a silent upcast that pandas has deprecated (it warns
# on 2.1+ and is slated to become an error). astype() returns a new frame, so
# df itself is left untouched.
df_with_nan = df.astype({"薪资": "float64", "年龄": "float64"})
df_with_nan.loc[1, "薪资"] = np.nan
df_with_nan.loc[3, "年龄"] = np.nan
print(f"缺失值:\n{df_with_nan.isnull().sum()}")
# Fill: the salary median comes from the frame that actually has the gaps
# (median() skips NaN), not from the pristine df, which would not exist in a
# real clean-up. Age keeps the example's placeholder value of 0.
df_filled = df_with_nan.fillna({"薪资": df_with_nan["薪资"].median(), "年龄": 0})
# Drop: remove every row that contains at least one NaN.
df_dropped = df_with_nan.dropna()

# === Everyday operations ===
# value_counts: frequency of each distinct value
print(df["城市"].value_counts())


# apply: run a Python function on every element of a column
def _salary_grade(salary):
    """Return the grade label for a salary: >20000 "高", >15000 "中", else "低"."""
    if salary > 20000:
        return "高"
    if salary > 15000:
        return "中"
    return "低"


df["薪资等级"] = df["薪资"].apply(_salary_grade)
print(df[["姓名", "薪资", "薪资等级"]])
# merge (table join): a left join keeps every row of df; names that are
# missing from dept end up with NaN in the "部门" column.
dept = pd.DataFrame({
    "姓名": ["张三", "李四", "王五"],
    "部门": ["技术", "产品", "技术"],
})
merged = df.merge(dept, on="姓名", how="left")
print(f"合并后:\n{merged}")

文件读写

"""
Pandas 读写各种格式
"""
# Catalogue of pandas I/O one-liners, keyed by file format. Each entry maps
# the operation label ("读取" = read, "写入" = write) to a code snippet stored
# as a plain string: the snippets are only printed, never executed here.
FORMATS = {
    "CSV": {
        "读取": 'pd.read_csv("data.csv", encoding="utf-8")',
        "写入": 'df.to_csv("out.csv", index=False, encoding="utf-8-sig")',
    },
    "Excel": {
        "读取": 'pd.read_excel("data.xlsx", sheet_name="Sheet1")',
        "写入": 'df.to_excel("out.xlsx", index=False)',
    },
    "JSON": {
        "读取": 'pd.read_json("data.json")',
        "写入": 'df.to_json("out.json", orient="records", force_ascii=False)',
    },
    "Parquet": {
        "读取": 'pd.read_parquet("data.parquet")',
        "写入": 'df.to_parquet("out.parquet")',
    },
    "SQL": {
        "读取": 'pd.read_sql("SELECT * FROM users", conn)',
        "写入": 'df.to_sql("users", conn, if_exists="replace")',
    },
}
# Print the catalogue: one heading per format, then its indented snippets.
for fmt, ops in FORMATS.items():
    print(f"\n{fmt}:")
    for op, code in ops.items():
        print(f"  {op}: {code}")

本章小结

工具 核心对象 适用场景 安装
NumPy ndarray 数值计算、矩阵运算 pip install numpy
Pandas DataFrame 数据清洗、分析 pip install pandas
Matplotlib Figure 基础绘图 pip install matplotlib

下一章:Web 开发——FastAPI 构建 RESTful API。