数据治理与主数据管理
High Contrast
Dark Mode
Light Mode
Sepia
Forest
2 min read452 words

数据治理与主数据管理

同一个供应商在 ERP 里叫"深圳精密",在财务系统叫"深圳精密制造有限公司"——这不只是命名问题,它会导致对账失败、报表错误和系统无法打通。主数据管理解决的就是这类"数据真实性"问题。

供应链主数据体系

graph TD MDM[主数据管理 MDM] --> PRODUCT[物料主数据] MDM --> SUPPLIER[供应商主数据] MDM --> CUSTOMER[客户主数据] MDM --> LOCATION[地点主数据] PRODUCT --> P1[SKU编码\n描述/规格/单位] PRODUCT --> P2[BOM物料清单] PRODUCT --> P3[价格/成本] SUPPLIER --> S1[供应商编码\n名称/税号/银行] SUPPLIER --> S2[认证资质] SUPPLIER --> S3[联系人/评级] CUSTOMER --> C1[客户编码\n信用额度] LOCATION --> L1[仓库/库位\n地址/时区] style MDM fill:#e3f2fd,stroke:#1565c0,stroke-width:2px style PRODUCT fill:#c8e6c9,stroke:#388e3c,stroke-width:2px

数据质量检查系统

"""
主数据质量检查器
"""
from dataclasses import dataclass
from typing import Any
@dataclass
class DataRecord:
"""数据记录"""
id: str
fields: dict[str, Any]
source_system: str
class DataQualityChecker:
"""数据质量检查器(5 维度)"""
@staticmethod
def completeness(
records: list[DataRecord],
required_fields: list[str],
) -> dict:
"""完整性:必填字段是否都有值"""
issues = []
for r in records:
missing = [
f for f in required_fields
if not r.fields.get(f)
]
if missing:
issues.append({
"记录": r.id,
"缺失字段": missing,
})
score = (len(records) - len(issues)) / len(records) if records else 0
return {
"维度": "完整性",
"得分": f"{score*100:.1f}%",
"问题数": len(issues),
"问题样本": issues[:3],
}
@staticmethod
def uniqueness(
records: list[DataRecord],
unique_field: str,
) -> dict:
"""唯一性:关键字段是否有重复"""
seen = {}
duplicates = []
for r in records:
val = r.fields.get(unique_field)
if val:
if val in seen:
duplicates.append({
"重复值": val,
"记录": [seen[val], r.id],
"来源": [
seen.get(f"src_{val}", "unknown"),
r.source_system,
],
})
else:
seen[val] = r.id
seen[f"src_{val}"] = r.source_system
score = (len(records) - len(duplicates)) / len(records) if records else 0
return {
"维度": "唯一性",
"字段": unique_field,
"得分": f"{score*100:.1f}%",
"重复数": len(duplicates),
"重复样本": duplicates[:3],
}
@staticmethod
def consistency(
records: list[DataRecord],
consistency_rules: list[dict],
) -> dict:
"""一致性:跨字段逻辑是否一致"""
violations = []
for r in records:
for rule in consistency_rules:
try:
if not rule["check"](r.fields):
violations.append({
"记录": r.id,
"规则": rule["name"],
"说明": rule["description"],
})
except (KeyError, TypeError):
pass
score = (len(records) - len(violations)) / len(records) if records else 0
return {
"维度": "一致性",
"得分": f"{score*100:.1f}%",
"违规数": len(violations),
"样本": violations[:3],
}
@classmethod
def full_report(
cls,
records: list[DataRecord],
config: dict,
) -> dict:
"""完整数据质量报告"""
results = []
if "required_fields" in config:
results.append(
cls.completeness(records, config["required_fields"])
)
if "unique_field" in config:
results.append(
cls.uniqueness(records, config["unique_field"])
)
if "consistency_rules" in config:
results.append(
cls.consistency(records, config["consistency_rules"])
)
scores = [
float(r["得分"][:-1]) for r in results if "得分" in r
]
overall = sum(scores) / len(scores) if scores else 0
return {
"总体质量评分": f"{overall:.1f}%",
"评级": (
"优秀" if overall >= 95
else "良好" if overall >= 85
else "需改善" if overall >= 70
else "需立即整治"
),
"维度报告": results,
}
# 演示 — 供应商主数据质量检查
supplier_records = [
DataRecord("SUP-001", {
"name": "深圳精密制造",
"tax_id": "91440300XXXXXXXX",
"bank_account": "6225XXXXXXXXXXXX",
"contact": "李明",
"email": "li@szmfg.com",
}, "ERP"),
DataRecord("SUP-002", {
"name": "东莞快捷电子",
"tax_id": "",               # 缺失!
"bank_account": "6228XXXXXXXXXXXX",
"contact": "",              # 缺失!
"email": "sales@dgjd.com",
}, "SRM"),
DataRecord("SUP-003", {
"name": "深圳精密制造有限公司",   # 与 SUP-001 重复!
"tax_id": "91440300XXXXXXXX",    # 税号重复!
"bank_account": "6225XXXXXXXXXXXX",
"contact": "王华",
"email": "wang@szmfg.com",
}, "FINANCE"),
]
checker = DataQualityChecker()
config = {
"required_fields": ["name", "tax_id", "bank_account", "contact", "email"],
"unique_field": "tax_id",
"consistency_rules": [
{
"name": "邮箱格式",
"description": "邮箱必须包含@符号",
"check": lambda f: "@" in (f.get("email") or ""),
},
],
}
print("=== 供应商主数据质量报告 ===")
report = checker.full_report(supplier_records, config)
print(f"\n总体评分: {report['总体质量评分']} [{report['评级']}]")
for dim in report["维度报告"]:
print(f"\n{dim['维度']}: {dim['得分']}")
if dim.get("问题数", 0) > 0 or dim.get("重复数", 0) > 0:
issues = dim.get("问题样本") or dim.get("重复样本") or dim.get("样本") or []
for issue in issues:
print(f"  问题: {issue}")

主数据治理框架

责任人 关键属性 唯一标识符
物料 供应链/技术 编码、描述、单位、重量、HS Code 物料编码
供应商 采购 名称、税号、银行、认证、评级 供应商编码
客户 销售/财务 名称、税号、信用额度、地址 客户编码
地点 运营 仓库编码、地址、时区、联系人 地点编码

主数据 Golden Record 策略

"""
多系统主数据合并 — Golden Record
"""
def merge_to_golden(records: list[dict], field_priority: dict) -> dict:
"""
多系统数据合并为黄金记录
field_priority: {field_name: [system_priority_order]}
"""
golden = {}
for field, priority_systems in field_priority.items():
for system in priority_systems:
val = next(
(r.get(field) for r in records if r.get("source") == system and r.get(field)),
None,
)
if val:
golden[field] = val
golden[f"{field}_source"] = system
break
else:
golden[field] = None
return golden
# 同一供应商的多系统记录
supplier_records_raw = [
{
"source": "ERP",
"name": "深圳精密制造",
"tax_id": "91440300XXXXXXXX",
"bank_account": None,
"rating": "A",
},
{
"source": "FINANCE",
"name": "深圳精密制造有限公司",
"tax_id": "91440300XXXXXXXX",
"bank_account": "6225XXXXXXXXXXXX",
"rating": None,
},
{
"source": "SRM",
"name": "深圳精密",
"tax_id": None,
"bank_account": "6225XXXXXXXXXXXX",
"rating": "A+",
},
]
priority = {
"name": ["FINANCE", "ERP", "SRM"],     # 财务系统名称最规范
"tax_id": ["ERP", "FINANCE", "SRM"],   # ERP 税号最可靠
"bank_account": ["FINANCE", "SRM", "ERP"],
"rating": ["SRM", "ERP", "FINANCE"],   # SRM 评级最新
}
golden = merge_to_golden(supplier_records_raw, priority)
print("=== Golden Record ===")
for k, v in golden.items():
if not k.endswith("_source"):
print(f"  {k}: {v} (来源: {golden.get(k+'_source', '无')})")

行动清单

下一章风险管理 → — 为不确定性做好准备。