{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# DuckDB 仿真数据生成\n", "\n", "根据题目给定的五张表结构生成不少于 20 条的测试数据,并写入 `data/duckdb/bi_metadata.duckdb`。代码仅依赖标准库与 duckdb,避免外部网络。" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "import random\n", "import json\n", "\n", "import duckdb\n", "\n", "# 路径与随机种子\n", "BASE_DIR = Path('data')\n", "DB_PATH = BASE_DIR / 'duckdb' / 'bi_metadata.duckdb'\n", "DB_PATH.parent.mkdir(parents=True, exist_ok=True)\n", "random.seed(2024)\n", "DB_PATH" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 构造模拟数据\n", "bbk_ids = [f'BBK{idx:03d}' for idx in range(1, 6)]\n", "dimensions = ['region', 'channel', 'product_line', 'customer_tier', 'province']\n", "group_dims = ['stat_date', 'region', 'channel']\n", "desc_samples = ['收入趋势', '用户留存', '渠道质量', '机构画像', '产品分析']\n", "\n", "dashboards = []\n", "cards = []\n", "mappings = []\n", "filters = []\n", "datasets = []\n", "\n", "card_counter = 1\n", "dataset_counter = 1\n", "dataset_max = 25 # 保证数据集条数 >=20\n", "\n", "for dash_idx in range(1, 6):\n", " dashboard_id = f'DB{dash_idx:03d}'\n", " dashboard_name = f'运营仪表板{dash_idx:02d}'\n", " dashboards.append({\n", " 'dashboard_id': dashboard_id,\n", " 'dashboard_name': dashboard_name,\n", " 'dashboard_desc': random.choice(desc_samples) + '概览',\n", " 'folder_path': f'/bbk/{dashboard_id.lower()}/folder_{dash_idx:02d}',\n", " })\n", "\n", " for _ in range(5): # 每个仪表板 5 张卡片,共 25 行\n", " card_id = f'C{card_counter:04d}'\n", " dataset_id = f'DS{dataset_counter:04d}'\n", " dataset_counter = dataset_counter % dataset_max + 1\n", " card_name = f'卡片{card_counter:02d}'\n", " bbk_id = random.choice(bbk_ids)\n", " select_field = random.choice(group_dims)\n", " sql_select = f'SELECT {select_field}, SUM(metric_value) AS metric_value FROM {dataset_id}'\n", " sql_groupby = f'GROUP BY {select_field}'\n", " sql_where = \"stat_date >= date '2023-01-01'\"\n", "\n", " cards.append({\n", " 'card_id': card_id,\n", " 'card_name': card_name,\n", " 'card_desc': f'{card_name} 指标解析',\n", " 'sql_select': sql_select,\n", " 'sql_where': sql_where,\n", " 'sql_groupby': sql_groupby,\n", " })\n", "\n", " mappings.append({\n", " 'card_id': card_id,\n", " 'dashboard_id': dashboard_id,\n", " 'dataset_id': dataset_id,\n", " 'dashboard_name': dashboard_name,\n", " 'card_name': card_name,\n", " 'dataset_name': f'{dataset_id}_data',\n", " 'bbk_id': bbk_id,\n", " })\n", "\n", " dim = random.choice(dimensions)\n", " filters.append({\n", " 'card_id': card_id,\n", " 'filter_id': f'{dim.upper()}_{card_counter:02d}',\n", " 'filter_type': 'F',\n", " 'where_clause': f'{dim}_name = ?',\n", " 'default_value': 'ALL',\n", " 'options': json.dumps([f\"{dim.title()} {opt}\" for opt in range(1, 5)]),\n", " })\n", " filters.append({\n", " 'card_id': card_id,\n", " 'filter_id': f'DATE_{card_counter:02d}',\n", " 'filter_type': 'D',\n", " 'where_clause': \"stat_date >= date '2023-01-01'\",\n", " 'default_value': '2023-01-01',\n", " 'options': json.dumps([]),\n", " })\n", "\n", " datasets.append({\n", " 'dataset_id': dataset_id,\n", " 'dataset_ddl': f\"CREATE TABLE {dataset_id.lower()} (\\n bbk_id VARCHAR,\\n stat_date DATE,\\n metric_value DOUBLE,\\n dimension VARCHAR\\n);\",\n", " })\n", "\n", " card_counter += 1\n", "\n", "len(cards), len(mappings), len(filters), len(datasets), len(dashboards)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 创建表并插入数据\n", "con = duckdb.connect(str(DB_PATH))\n", "\n", "con.execute(\"\nCREATE OR REPLACE TABLE card_dataset_dashboard_mapping (\n card_id VARCHAR,\n dashboard_id VARCHAR,\n dataset_id VARCHAR,\n dashboard_name VARCHAR,\n card_name VARCHAR,\n dataset_name VARCHAR,\n bbk_id VARCHAR\n);\n\")\n", "\n", "con.execute(\"\nCREATE OR REPLACE TABLE card_info (\n card_id VARCHAR,\n card_name VARCHAR,\n card_desc VARCHAR,\n sql_select VARCHAR,\n sql_where VARCHAR,\n sql_groupby VARCHAR\n);\n\")\n", "\n", "con.execute(\"\nCREATE OR REPLACE TABLE card_filter_info (\n card_id VARCHAR,\n filter_id VARCHAR,\n filter_type VARCHAR,\n where_clause VARCHAR,\n default_value VARCHAR,\n options VARCHAR\n);\n\")\n", "\n", "con.execute(\"\nCREATE OR REPLACE TABLE dashboard_info (\n dashboard_id VARCHAR,\n dashboard_name VARCHAR,\n dashboard_desc VARCHAR,\n folder_path VARCHAR\n);\n\")\n", "\n", "con.execute(\"\nCREATE OR REPLACE TABLE dataset_ddl (\n dataset_id VARCHAR,\n dataset_ddl VARCHAR\n);\n\")\n", "\n", "con.executemany(\"INSERT INTO card_dataset_dashboard_mapping VALUES (?,?,?,?,?,?,?)\",\n [(m['card_id'], m['dashboard_id'], m['dataset_id'], m['dashboard_name'], m['card_name'], m['dataset_name'], m['bbk_id']) for m in mappings])\n", "con.executemany(\"INSERT INTO card_info VALUES (?,?,?,?,?,?)\",\n [(c['card_id'], c['card_name'], c['card_desc'], c['sql_select'], c['sql_where'], c['sql_groupby']) for c in cards])\n", "con.executemany(\"INSERT INTO card_filter_info VALUES (?,?,?,?,?,?)\",\n [(f['card_id'], f['filter_id'], f['filter_type'], f['where_clause'], f['default_value'], f['options']) for f in filters])\n", "con.executemany(\"INSERT INTO dashboard_info VALUES (?,?,?,?)\",\n [(d['dashboard_id'], d['dashboard_name'], d['dashboard_desc'], d['folder_path']) for d in dashboards])\n", "con.executemany(\"INSERT INTO dataset_ddl VALUES (?,?)\",\n [(d['dataset_id'], d['dataset_ddl']) for d in datasets])\n", "\n", "con.close()\n", "'tables created'\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 校验行数\n", "with duckdb.connect(str(DB_PATH)) as con:\n", " for tbl in ['card_dataset_dashboard_mapping', 'card_info', 'card_filter_info', 'dashboard_info', 'dataset_ddl']:\n", " cnt = con.execute(f'SELECT COUNT(*) FROM {tbl}').fetchone()[0]\n", " print(tbl, cnt)\n", " print('示例行:', con.execute('SELECT * FROM card_dataset_dashboard_mapping LIMIT 3').fetchall())\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.11" } }, "nbformat": 4, "nbformat_minor": 5 }