mock_cards.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
  1. from __future__ import annotations
  2. import argparse
  3. import hashlib
  4. import json
  5. import random
  6. from pathlib import Path
  7. import pandas as pd
  8. from card_blueprints import DOMAIN_KEYWORDS, LEVEL1_DOMAIN_MAP, get_card_blueprints
  9. ROOT = Path(__file__).resolve().parents[1]
  10. DASHBOARD_PATH = ROOT / "data" / "dashboard.parquet"
  11. OUTPUT_PATH = ROOT / "data" / "card_info.parquet"
  12. RANDOM_SEED = 20260507
  13. CARD_COLUMNS = [
  14. "card_id",
  15. "bbk_id",
  16. "bbk_name",
  17. "card_name",
  18. "ds_id",
  19. "ds_name",
  20. "dash_id",
  21. "dash_name",
  22. "card_type_cd",
  23. "field_alias",
  24. "field_name",
  25. "filters_field_name",
  26. "filters_field_value",
  27. "sort_field_name",
  28. "sort_way",
  29. "num_value_field_name",
  30. "num_value_field_alias",
  31. "num_value_field_merge_way",
  32. "folder_name",
  33. "folder_all_route",
  34. ]
  35. def to_json(values: list[str]) -> str:
  36. return json.dumps(values, ensure_ascii=False)
  37. def make_id(prefix: str, ordinal: int) -> str:
  38. raw = f"{prefix}-{ordinal}-{RANDOM_SEED}".encode("utf-8")
  39. return hashlib.md5(raw).hexdigest()[:24]
  40. def parse_branch_names(raw_branch_names: list[str] | None, available_branch_names: set[str]) -> list[str] | None:
  41. if not raw_branch_names:
  42. return None
  43. branch_names: list[str] = []
  44. for raw_value in raw_branch_names:
  45. for branch_name in raw_value.split(","):
  46. branch_name = branch_name.strip()
  47. if branch_name and branch_name not in branch_names:
  48. branch_names.append(branch_name)
  49. unknown_branch_names = [name for name in branch_names if name not in available_branch_names]
  50. if unknown_branch_names:
  51. supported = "、".join(sorted(available_branch_names))
  52. unknown = "、".join(unknown_branch_names)
  53. raise ValueError(f"unsupported branch name: {unknown}. Available branch names in dashboards: {supported}")
  54. return branch_names
  55. def infer_domain(dashboard: pd.Series) -> str:
  56. level1_name = str(dashboard.get("level1_dept_name", ""))
  57. if level1_name in LEVEL1_DOMAIN_MAP:
  58. return LEVEL1_DOMAIN_MAP[level1_name]
  59. dash_name = str(dashboard.get("dash_dsply_name", ""))
  60. folder_route = str(dashboard.get("folder_all_route", ""))
  61. text = f"{level1_name} {dash_name} {folder_route}"
  62. for domain, keywords in DOMAIN_KEYWORDS.items():
  63. if any(keyword in text for keyword in keywords):
  64. return domain
  65. return "零售"
  66. def make_dataset_id(dataset_prefix: str, dataset_name: str) -> str:
  67. return make_id(f"dataset-{dataset_prefix}-{dataset_name}", 1)
  68. def resolve_filter_value(filter_spec: dict[str, str], dashboard: pd.Series) -> str:
  69. if "value" in filter_spec:
  70. return str(filter_spec["value"])
  71. value_from = filter_spec.get("value_from")
  72. if value_from == "dashboard.bbk_id":
  73. return str(dashboard["bbk_id"])
  74. if value_from == "dashboard.bbk_name":
  75. return str(dashboard["bbk_name"])
  76. value_kind = filter_spec.get("value_kind")
  77. field_name = str(filter_spec["field"])
  78. if value_kind == "month_end":
  79. return "2026-05" if "月份" in field_name else "2026-05-31"
  80. raise ValueError(f"unsupported filter spec: {filter_spec}")
  81. def build_card_record(
  82. global_ordinal: int,
  83. dashboard: pd.Series,
  84. spec: dict[str, object],
  85. ) -> dict[str, str]:
  86. dataset_prefix = str(spec["dataset_prefix"])
  87. dataset_name = str(spec["dataset"])
  88. fields = list(spec["fields"])
  89. metrics = list(spec["metrics"])
  90. filters = list(spec["filters"])
  91. sort = list(spec["sort"])
  92. filter_names = [str(item["field"]) for item in filters]
  93. filter_values = [resolve_filter_value(item, dashboard) for item in filters]
  94. metric_names = [str(item["field"]) for item in metrics]
  95. metric_aliases = [str(item["alias"]) for item in metrics]
  96. metric_aggs = [str(item["agg"]) for item in metrics]
  97. sort_names = [str(item["field"]) for item in sort]
  98. sort_ways = [str(item["way"]) for item in sort]
  99. folder_parts = str(dashboard["folder_all_route"]).split("/")
  100. folder_name = folder_parts[-1] if folder_parts else infer_domain(dashboard)
  101. return {
  102. "card_id": make_id("card", global_ordinal),
  103. "bbk_id": str(dashboard["bbk_id"]),
  104. "bbk_name": str(dashboard["bbk_name"]),
  105. "card_name": str(spec["card_name"]),
  106. "ds_id": make_dataset_id(dataset_prefix, dataset_name),
  107. "ds_name": f"{dataset_prefix}_{str(dashboard['bbk_id'])}_{dataset_name}",
  108. "dash_id": str(dashboard["dash_id"]),
  109. "dash_name": str(dashboard["dash_dsply_name"]),
  110. "card_type_cd": "图表",
  111. "field_alias": to_json(fields),
  112. "field_name": to_json(fields),
  113. "filters_field_name": to_json(filter_names),
  114. "filters_field_value": to_json(filter_values),
  115. "sort_field_name": to_json(sort_names),
  116. "sort_way": to_json(sort_ways),
  117. "num_value_field_name": to_json(metric_names),
  118. "num_value_field_alias": to_json(metric_aliases),
  119. "num_value_field_merge_way": to_json(metric_aggs),
  120. "folder_name": folder_name,
  121. "folder_all_route": str(dashboard["folder_all_route"]),
  122. }
  123. def select_dashboards(dashboards: pd.DataFrame, branch_names: list[str] | None, count: int | None) -> pd.DataFrame:
  124. selected = dashboards
  125. if branch_names:
  126. selected = selected[selected["bbk_name"].isin(branch_names)]
  127. if count is not None:
  128. selected = selected.head(count)
  129. return selected.reset_index(drop=True)
  130. def choose_card_specs(rng: random.Random, domain: str) -> list[dict[str, object]]:
  131. specs = get_card_blueprints(domain)
  132. card_count = rng.randint(10, min(20, len(specs)))
  133. return rng.sample(specs, card_count)
  134. def make_cards(
  135. dashboards: pd.DataFrame,
  136. start_ordinal: int = 1,
  137. ) -> pd.DataFrame:
  138. rng = random.Random(RANDOM_SEED + start_ordinal - 1)
  139. records: list[dict[str, str]] = []
  140. global_ordinal = start_ordinal
  141. for _, dashboard in dashboards.iterrows():
  142. domain = infer_domain(dashboard)
  143. for spec in choose_card_specs(rng, domain):
  144. records.append(build_card_record(global_ordinal, dashboard, spec))
  145. global_ordinal += 1
  146. cards = pd.DataFrame(records, columns=CARD_COLUMNS)
  147. validate_generated_cards(cards)
  148. return cards
  149. def validate_generated_cards(cards: pd.DataFrame) -> None:
  150. for row in cards.itertuples(index=False):
  151. metric_names = json.loads(row.num_value_field_name)
  152. metric_aliases = json.loads(row.num_value_field_alias)
  153. metric_aggs = json.loads(row.num_value_field_merge_way)
  154. filter_names = json.loads(row.filters_field_name)
  155. filter_values = json.loads(row.filters_field_value)
  156. sort_names = json.loads(row.sort_field_name)
  157. sort_ways = json.loads(row.sort_way)
  158. if not (len(metric_names) == len(metric_aliases) == len(metric_aggs)):
  159. raise ValueError(f"metric length mismatch in card {row.card_id}")
  160. if len(filter_names) != len(filter_values):
  161. raise ValueError(f"filter length mismatch in card {row.card_id}")
  162. if len(sort_names) != len(sort_ways):
  163. raise ValueError(f"sort length mismatch in card {row.card_id}")
  164. def parse_args() -> argparse.Namespace:
  165. parser = argparse.ArgumentParser(description="Generate mock BI card parquet data.")
  166. parser.add_argument(
  167. "--mode",
  168. choices=["overwrite", "append"],
  169. default="overwrite",
  170. help="overwrite replaces the parquet file; append inserts generated rows into the existing parquet file.",
  171. )
  172. parser.add_argument(
  173. "--count",
  174. type=int,
  175. default=None,
  176. help="number of dashboards to generate cards for. Defaults to all selected dashboards.",
  177. )
  178. parser.add_argument(
  179. "--output",
  180. type=Path,
  181. default=OUTPUT_PATH,
  182. help="target parquet path.",
  183. )
  184. parser.add_argument(
  185. "--branch-name",
  186. action="append",
  187. help=(
  188. "branch names to generate, separated by comma or passed repeatedly. "
  189. "Defaults to all branches in dashboard parquet."
  190. ),
  191. )
  192. parser.add_argument(
  193. "--dashboard-input",
  194. type=Path,
  195. default=DASHBOARD_PATH,
  196. help="source dashboard parquet path.",
  197. )
  198. return parser.parse_args()
  199. def main() -> None:
  200. args = parse_args()
  201. if args.count is not None and args.count <= 0:
  202. raise ValueError("--count must be a positive integer")
  203. dashboard_path = args.dashboard_input.resolve()
  204. if not dashboard_path.exists():
  205. raise FileNotFoundError(f"dashboard parquet not found: {dashboard_path}")
  206. dashboards = pd.read_parquet(dashboard_path)
  207. branch_names = parse_branch_names(args.branch_name, set(dashboards["bbk_name"]))
  208. selected_dashboards = select_dashboards(dashboards, branch_names, args.count)
  209. if selected_dashboards.empty:
  210. raise ValueError("no dashboards selected for card generation")
  211. output_path = args.output.resolve()
  212. output_path.parent.mkdir(parents=True, exist_ok=True)
  213. existing = pd.DataFrame()
  214. if args.mode == "append" and output_path.exists():
  215. existing = pd.read_parquet(output_path)
  216. new_cards = make_cards(selected_dashboards, start_ordinal=len(existing) + 1)
  217. cards = pd.concat([existing, new_cards], ignore_index=True) if not existing.empty else new_cards
  218. validate_generated_cards(cards)
  219. cards.to_parquet(output_path, index=False)
  220. duplicate_card_id = int(cards["card_id"].duplicated().sum())
  221. card_count_by_dash = new_cards.groupby("dash_id")["card_id"].count()
  222. selected_branch_names = branch_names or sorted(selected_dashboards["bbk_name"].unique())
  223. print(f"mode: {args.mode}")
  224. print(f"dashboard input: {dashboard_path}")
  225. print(f"branch names: {', '.join(selected_branch_names)}")
  226. print(f"selected dashboards: {len(selected_dashboards)}")
  227. print(f"generated rows: {len(new_cards)}")
  228. print(f"wrote total rows: {len(cards)} to {output_path}")
  229. print(f"duplicate card_id: {duplicate_card_id}")
  230. print(f"cards per dashboard: {int(card_count_by_dash.min())} {int(card_count_by_dash.max())}")
  231. print("bbk_name:", new_cards["bbk_name"].value_counts().to_dict())
  232. if __name__ == "__main__":
  233. main()