mock_dashboards.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569
  1. from __future__ import annotations
  2. import argparse
  3. import hashlib
  4. import random
  5. from datetime import datetime, timedelta
  6. from pathlib import Path
  7. import pandas as pd
  8. ROOT = Path(__file__).resolve().parents[1]
  9. OUTPUT_PATH = ROOT / "data" / "dashboard.parquet"
  10. RANDOM_SEED = 20260506
  11. BRANCH_ID_MAP = {
  12. "总行": "00100",
  13. "北京分行": "00110",
  14. "广东分行": "00120",
  15. }
  16. BRANCH_DM_MAP = {
  17. "总行": "zh",
  18. "北京分行": "bjfh",
  19. "广东分行": "gdfh",
  20. }
  21. DEFAULT_BRANCH_NAMES = ["总行", "北京分行", "广东分行"]
  22. DOMAIN_CONFIGS = [
  23. {
  24. "domain": "零售",
  25. "level1": "零售金融部",
  26. "level2": "经营分析室",
  27. "level3": "客户经营团队",
  28. "topics": [
  29. "零售客户经营总览",
  30. "高价值客户增长分析",
  31. "私人银行客户资产监测",
  32. "私钻客户AUM提升分析",
  33. "财富客户资产配置看板",
  34. "养老金客户经营跟踪",
  35. "代发客群转化跟踪",
  36. "零售存款日均分析",
  37. "储蓄存款客群结构分析",
  38. "零售贷款投放监控",
  39. "个人住房贷款质量看板",
  40. "消费贷获客转化分析",
  41. "汽车分期贷款经营分析",
  42. "理财产品销售分析",
  43. "基金产品持仓透视",
  44. "保险产品销售监测",
  45. "客户资产提升作战图",
  46. "网点零售业绩排名",
  47. "MGM获客转化分析",
  48. ],
  49. "folder": "零售数据门户",
  50. },
  51. {
  52. "domain": "对公",
  53. "level1": "公司金融部",
  54. "level2": "公司客户经营室",
  55. "level3": "对公经营分析团队",
  56. "topics": [
  57. "对公客户经营总览",
  58. "战略客户贡献分析",
  59. "机构客户综合贡献分析",
  60. "上市公司客群经营看板",
  61. "对公存款增长看板",
  62. "单位活期存款结构分析",
  63. "保证金存款监测",
  64. "代发业务拓展监测",
  65. "对公信贷投放监控",
  66. "项目融资投放分析",
  67. "小微企业贷款分析",
  68. "供应链金融业务看板",
  69. "信用证业务分析",
  70. "保函业务风险监测",
  71. "票据贴现经营分析",
  72. "现金管理客户活跃分析",
  73. "交易银行客户活跃分析",
  74. "公司客户流失预警",
  75. ],
  76. "folder": "公司金融数据门户",
  77. },
  78. {
  79. "domain": "信用卡",
  80. "level1": "信用卡中心",
  81. "level2": "经营管理部",
  82. "level3": "数据分析团队",
  83. "topics": [
  84. "信用卡新增客户分析",
  85. "信用卡交易额监测",
  86. "信用卡分期业务看板",
  87. "信用卡账单分期转化分析",
  88. "信用卡逾期风险预警",
  89. "信用卡活跃客户经营",
  90. "信用卡渠道获客分析",
  91. "信用卡额度使用分析",
  92. "信用卡客群画像看板",
  93. "信用卡权益活动效果分析",
  94. "信用卡商户交易监测",
  95. "信用卡睡眠客户唤醒看板",
  96. ],
  97. "folder": "信用卡经营分析",
  98. },
  99. {
  100. "domain": "风险",
  101. "level1": "风险管理部",
  102. "level2": "组合风险管理室",
  103. "level3": "模型监测团队",
  104. "topics": [
  105. "信贷资产质量监测",
  106. "不良贷款迁徙分析",
  107. "逾期客户风险预警",
  108. "关注类贷款压降监控",
  109. "拨备覆盖率经营看板",
  110. "重点行业风险排查",
  111. "授信集中度监控",
  112. "贷后检查进度跟踪",
  113. "资产分类变动分析",
  114. "抵质押品价值重估分析",
  115. "风险预警信号处置看板",
  116. "大额风险暴露监测",
  117. ],
  118. "folder": "风险管理驾驶舱",
  119. },
  120. {
  121. "domain": "运营",
  122. "level1": "运营管理部",
  123. "level2": "运营监控室",
  124. "level3": "流程管理团队",
  125. "topics": [
  126. "网点运营效率看板",
  127. "柜面业务量监测",
  128. "远程银行服务分析",
  129. "账户开立质量监测",
  130. "企业账户年检进度监控",
  131. "支付结算业务分析",
  132. "反洗钱可疑交易跟踪",
  133. "客户投诉处理看板",
  134. "运营风险事件监控",
  135. "集中作业处理时效分析",
  136. "现金库存与调拨监测",
  137. "电子回单服务分析",
  138. ],
  139. "folder": "运营管理专区",
  140. },
  141. {
  142. "domain": "财务",
  143. "level1": "计划财务部",
  144. "level2": "管理会计室",
  145. "level3": "财务分析团队",
  146. "topics": [
  147. "分行利润贡献分析",
  148. "FTP收支测算看板",
  149. "费用预算执行监控",
  150. "中间业务收入分析",
  151. "净利息收入贡献分析",
  152. "资本占用收益分析",
  153. "经营计划完成率看板",
  154. "资产负债结构分析",
  155. "税务成本监测看板",
  156. "经济利润EVA分析",
  157. "分产品收益率分析",
  158. "管理会计分摊结果看板",
  159. ],
  160. "folder": "财务管理驾驶舱",
  161. },
  162. {
  163. "domain": "渠道",
  164. "level1": "网络经营服务部",
  165. "level2": "渠道管理室",
  166. "level3": "数字渠道团队",
  167. "topics": [
  168. "手机银行活跃分析",
  169. "网银交易监测",
  170. "网点客流热力分析",
  171. "自助设备运行看板",
  172. "远程渠道转化分析",
  173. "企业网银客户经营",
  174. "开放银行接口监测",
  175. "渠道协同营销看板",
  176. "数字人民币交易分析",
  177. "小程序渠道转化分析",
  178. "线上预约到店分析",
  179. "渠道客户体验监测",
  180. ],
  181. "folder": "渠道经营分析",
  182. },
  183. {
  184. "domain": "普惠",
  185. "level1": "普惠金融部",
  186. "level2": "普惠经营管理室",
  187. "level3": "小微客户团队",
  188. "topics": [
  189. "普惠贷款投放监控",
  190. "小微客户增长看板",
  191. "普惠首贷户拓展分析",
  192. "涉农贷款经营监测",
  193. "科技型企业贷款分析",
  194. "普惠风险补偿跟踪",
  195. "普惠延期还本付息监测",
  196. "科创小微客户经营分析",
  197. "个体工商户贷款看板",
  198. "普惠贷款利率定价分析",
  199. ],
  200. "folder": "普惠金融专区",
  201. },
  202. {
  203. "domain": "金融市场",
  204. "level1": "金融市场部",
  205. "level2": "投资交易管理室",
  206. "level3": "市场风险与经营分析团队",
  207. "topics": [
  208. "债券投资组合分析",
  209. "同业负债成本监测",
  210. "资金头寸预测看板",
  211. "外汇交易损益分析",
  212. "衍生品估值监测",
  213. "票据转贴现业务分析",
  214. "理财投资资产穿透看板",
  215. "金融市场限额占用监控",
  216. "市场价格波动预警",
  217. ],
  218. "folder": "金融市场经营专区",
  219. },
  220. {
  221. "domain": "国际业务",
  222. "level1": "国际业务部",
  223. "level2": "跨境金融管理室",
  224. "level3": "国际结算分析团队",
  225. "topics": [
  226. "跨境结算业务分析",
  227. "贸易融资投放监控",
  228. "进口信用证业务看板",
  229. "出口托收业务跟踪",
  230. "外汇存款结构分析",
  231. "结售汇客户贡献分析",
  232. "跨境人民币业务监测",
  233. "国际业务风险预警",
  234. ],
  235. "folder": "国际业务分析专区",
  236. },
  237. {
  238. "domain": "合规",
  239. "level1": "法律合规部",
  240. "level2": "反洗钱与制裁合规管理团队",
  241. "level3": "合规数据应用室",
  242. "topics": [
  243. "受益所有人信息缺失监测",
  244. "客户尽职调查进度看板",
  245. "反洗钱名单命中分析",
  246. "可疑交易报告质效分析",
  247. "员工异常行为排查",
  248. "监管报送质量监控",
  249. "合规检查问题整改跟踪",
  250. "制裁筛查处理时效分析",
  251. ],
  252. "folder": "合规管理专区",
  253. },
  254. {
  255. "domain": "人力",
  256. "level1": "人力资源部",
  257. "level2": "绩效薪酬管理室",
  258. "level3": "人力数据分析团队",
  259. "topics": [
  260. "机构人均产能分析",
  261. "客户经理绩效看板",
  262. "支行奖金分配测算",
  263. "岗位编制与人员缺口分析",
  264. "员工培训完成率监测",
  265. "人才盘点与梯队建设看板",
  266. "一线人员工作量分析",
  267. ],
  268. "folder": "人力资源分析专区",
  269. },
  270. ]
  271. VERSION_SUFFIXES = [
  272. "",
  273. " V1.0",
  274. " V2.0",
  275. " V3.0",
  276. " V4.0",
  277. "(机构版)",
  278. "(管理端)",
  279. "(支行版)",
  280. "(客户经理版)",
  281. "(测试版)",
  282. "月报",
  283. "日报",
  284. "周报",
  285. "快报",
  286. "大屏",
  287. ]
  288. def make_id(prefix: str, ordinal: int) -> str:
  289. raw = f"{prefix}-{ordinal}-{RANDOM_SEED}".encode("utf-8")
  290. return hashlib.md5(raw).hexdigest()[:24]
  291. def parse_branch_names(raw_branch_names: list[str] | None) -> list[str]:
  292. if not raw_branch_names:
  293. return DEFAULT_BRANCH_NAMES
  294. branch_names: list[str] = []
  295. for raw_value in raw_branch_names:
  296. for branch_name in raw_value.split(","):
  297. branch_name = branch_name.strip()
  298. if branch_name and branch_name not in branch_names:
  299. branch_names.append(branch_name)
  300. unknown_branch_names = [name for name in branch_names if name not in BRANCH_DM_MAP]
  301. if unknown_branch_names:
  302. supported = "、".join(BRANCH_DM_MAP)
  303. unknown = "、".join(unknown_branch_names)
  304. raise ValueError(f"unsupported branch name: {unknown}. Supported branch names: {supported}")
  305. return branch_names
  306. def build_branch_pool(branch_names: list[str]) -> list[tuple[str, str, str]]:
  307. return [(BRANCH_ID_MAP[name], name, BRANCH_DM_MAP[name]) for name in branch_names]
  308. def choose_branch(
  309. rng: random.Random,
  310. domain: str,
  311. branch_pool: list[tuple[str, str, str]],
  312. ) -> tuple[str, str, str]:
  313. total_branch = next((branch for branch in branch_pool if branch[1] == "总行"), None)
  314. if total_branch and domain in {"风险", "财务", "信用卡"} and rng.random() < 0.36:
  315. return total_branch
  316. return rng.choice(branch_pool)
  317. def make_timestamp(rng: random.Random, start: datetime, end: datetime) -> str:
  318. seconds = int((end - start).total_seconds())
  319. value = start + timedelta(seconds=rng.randint(0, seconds))
  320. microsecond = rng.choice([0, 152000, 281000, 371000, 495000, 659000, 749000, 911000])
  321. return value.replace(microsecond=microsecond).strftime("%Y-%m-%d %H:%M:%S.%f")
  322. def build_folder_route(rng: random.Random, branch_name: str, config: dict[str, object], topic: str) -> str:
  323. roots = [
  324. f"根目录/{config['folder']}/{config['domain']}/{topic}",
  325. f"根目录/业务网仪表板/{config['domain']}条线/{branch_name}/{topic}",
  326. f"根目录/总行/经营管理驾驶舱/{config['folder']}/{topic}",
  327. f"根目录/用户开发报表区/{branch_name}/{config['folder']}/{topic}",
  328. f"根目录/应用市场下载目录/分析模板/{config['domain']}经营/{topic}",
  329. ]
  330. return rng.choice(roots)
  331. def make_dashboard_name(
  332. rng: random.Random,
  333. ordinal: int,
  334. branch_name: str,
  335. topic: str,
  336. used_names: set[str],
  337. ) -> str:
  338. suffix = rng.choice(VERSION_SUFFIXES)
  339. branch_prefix = branch_name if rng.random() < 0.34 and branch_name != "总行" else ""
  340. base_name = f"{branch_prefix}{topic}{suffix}"
  341. dash_name = base_name
  342. if dash_name in used_names:
  343. dash_name = f"{base_name}(第{ordinal:03d}期)"
  344. retry_index = 2
  345. while dash_name in used_names:
  346. dash_name = f"{base_name}(第{ordinal:03d}-{retry_index}期)"
  347. retry_index += 1
  348. used_names.add(dash_name)
  349. return dash_name
  350. def build_record(
  351. rng: random.Random,
  352. ordinal: int,
  353. config: dict[str, object],
  354. topic: str,
  355. branch_pool: list[tuple[str, str, str]],
  356. used_names: set[str],
  357. ) -> dict[str, object]:
  358. bbk_id, bbk_name, dm_nm = choose_branch(rng, str(config["domain"]), branch_pool)
  359. dash_name = make_dashboard_name(rng, ordinal, bbk_name, topic, used_names)
  360. level1 = str(config["level1"])
  361. level2 = str(config["level2"])
  362. level3 = str(config["level3"])
  363. level4_candidates = ["数据应用室", "经营推动组", "业务支持组", "指标管理组", ""]
  364. level4 = rng.choice(level4_candidates)
  365. dept_fname_parts = ["XX银行", bbk_name, level1, level2, level3]
  366. if level4:
  367. dept_fname_parts.append(level4)
  368. chart_cnt = rng.randint(20, 50)
  369. author_cnt = rng.randint(8, 360)
  370. first_visit = make_timestamp(rng, datetime(2023, 1, 1), datetime(2026, 2, 28))
  371. recent_start = datetime.strptime(first_visit, "%Y-%m-%d %H:%M:%S.%f") + timedelta(days=1)
  372. recent_end = datetime(2026, 5, 5, 23, 59, 59)
  373. recent_visit = make_timestamp(rng, recent_start, recent_end) if recent_start < recent_end else first_visit
  374. efficiency = rng.choices(["高效", "中效", "低效"], weights=[0.34, 0.46, 0.20], k=1)[0]
  375. visit_ranges = {
  376. "高效": (360, 6800),
  377. "中效": (80, 1200),
  378. "低效": (0, 180),
  379. }
  380. visit_low, visit_high = visit_ranges[efficiency]
  381. return {
  382. "dash_id": make_id("dash", ordinal),
  383. "dash_dsply_name": dash_name,
  384. "folder_all_route": build_folder_route(rng, bbk_name, config, topic),
  385. "build_main": rng.choices(["总行建设", "分行自建"], weights=[0.48, 0.52], k=1)[0],
  386. "dept_cls": rng.choices(["业务部门", "信息技术部", "风险合规部门", "管理部门"], weights=[0.66, 0.18, 0.09, 0.07], k=1)[0],
  387. "dash_rat": efficiency,
  388. "bbk_name": bbk_name,
  389. "bbk_id": bbk_id,
  390. "dept_fname": "/".join(dept_fname_parts),
  391. "level1_dept_name": level1,
  392. "level2_dept_name": level2,
  393. "level3_dept_name": level3,
  394. "level4_dept_name": level4,
  395. "chart_cnt": chart_cnt,
  396. "be_author_cnt": author_cnt,
  397. "fir_be_visit_tm": first_visit,
  398. "recnt_be_visit_tm": recent_visit,
  399. "m3_vis_cnt": rng.randint(visit_low, visit_high),
  400. "mon_add_idf": rng.choices(["是", "否"], weights=[0.08, 0.92], k=1)[0],
  401. "dm_nm": dm_nm,
  402. }
  403. def make_dashboards(
  404. target_count: int = 200,
  405. start_ordinal: int = 1,
  406. branch_pool: list[tuple[str, str, str]] | None = None,
  407. used_names: set[str] | None = None,
  408. ) -> pd.DataFrame:
  409. rng = random.Random(RANDOM_SEED + start_ordinal - 1)
  410. branch_pool = branch_pool or build_branch_pool(DEFAULT_BRANCH_NAMES)
  411. used_names = used_names if used_names is not None else set()
  412. records: list[dict[str, object]] = []
  413. topic_pairs: list[tuple[dict[str, object], str]] = []
  414. for config in DOMAIN_CONFIGS:
  415. for topic in config["topics"]:
  416. topic_pairs.append((config, topic))
  417. while len(records) < target_count:
  418. rng.shuffle(topic_pairs)
  419. for config, topic in topic_pairs:
  420. records.append(
  421. build_record(
  422. rng,
  423. start_ordinal + len(records),
  424. config,
  425. topic,
  426. branch_pool,
  427. used_names,
  428. )
  429. )
  430. if len(records) >= target_count:
  431. break
  432. columns = [
  433. "dash_id",
  434. "dash_dsply_name",
  435. "folder_all_route",
  436. "build_main",
  437. "dept_cls",
  438. "dash_rat",
  439. "bbk_name",
  440. "bbk_id",
  441. "dept_fname",
  442. "level1_dept_name",
  443. "level2_dept_name",
  444. "level3_dept_name",
  445. "level4_dept_name",
  446. "chart_cnt",
  447. "be_author_cnt",
  448. "fir_be_visit_tm",
  449. "recnt_be_visit_tm",
  450. "m3_vis_cnt",
  451. "mon_add_idf",
  452. "dm_nm",
  453. ]
  454. return pd.DataFrame(records, columns=columns)
  455. def parse_args() -> argparse.Namespace:
  456. parser = argparse.ArgumentParser(description="Generate mock BI dashboard parquet data.")
  457. parser.add_argument(
  458. "--mode",
  459. choices=["overwrite", "append"],
  460. default="overwrite",
  461. help="overwrite replaces the parquet file; append inserts generated rows into the existing parquet file.",
  462. )
  463. parser.add_argument(
  464. "--count",
  465. type=int,
  466. default=200,
  467. help="number of dashboard rows to generate in this run.",
  468. )
  469. parser.add_argument(
  470. "--output",
  471. type=Path,
  472. default=OUTPUT_PATH,
  473. help="target parquet path.",
  474. )
  475. parser.add_argument(
  476. "--branch-name",
  477. action="append",
  478. help=(
  479. "branch names to generate, separated by comma or passed repeatedly. "
  480. "Defaults to 总行,北京分行,广东分行."
  481. ),
  482. )
  483. return parser.parse_args()
  484. def main() -> None:
  485. args = parse_args()
  486. if args.count <= 0:
  487. raise ValueError("--count must be a positive integer")
  488. output_path = args.output.resolve()
  489. output_path.parent.mkdir(parents=True, exist_ok=True)
  490. branch_names = parse_branch_names(args.branch_name)
  491. branch_pool = build_branch_pool(branch_names)
  492. existing = pd.DataFrame()
  493. if args.mode == "append" and output_path.exists():
  494. existing = pd.read_parquet(output_path)
  495. used_names = set(existing["dash_dsply_name"]) if not existing.empty else set()
  496. new_dashboards = make_dashboards(
  497. args.count,
  498. start_ordinal=len(existing) + 1,
  499. branch_pool=branch_pool,
  500. used_names=used_names,
  501. )
  502. dashboards = pd.concat([existing, new_dashboards], ignore_index=True) if not existing.empty else new_dashboards
  503. dashboards.to_parquet(output_path, index=False)
  504. duplicate_count = int(dashboards["dash_id"].duplicated().sum())
  505. duplicate_name_count = int(dashboards["dash_dsply_name"].duplicated().sum())
  506. print(f"mode: {args.mode}")
  507. print(f"branch names: {', '.join(branch_names)}")
  508. print(f"generated rows: {len(new_dashboards)}")
  509. print(f"wrote total rows: {len(dashboards)} to {output_path}")
  510. print(f"duplicate dash_id: {duplicate_count}")
  511. print(f"duplicate dash_dsply_name: {duplicate_name_count}")
  512. print(f"columns: {', '.join(dashboards.columns)}")
  513. print("dash_rat:", dashboards["dash_rat"].value_counts().to_dict())
  514. print("chart_cnt:", int(dashboards["chart_cnt"].min()), int(dashboards["chart_cnt"].max()))
  515. print("level1_dept_name:", dashboards["level1_dept_name"].value_counts().to_dict())
  516. if __name__ == "__main__":
  517. main()