ysl2007 2 tháng trước cách đây
mục cha
commit
f28eef1ec9
1 tập tin đã thay đổi với 27 bổ sung và 5 xóa
  1. 27 5
      carddef2sql.py

+ 27 - 5
carddef2sql.py

@@ -173,7 +173,7 @@ def build_with_part(new_date_fields, new_dimension_fields, dataset_fid_name_map,
     return sql_part
 
 # 处理计算字段
-def process_calculation_fields(measure_fields, measure_aggs, calculation_fields, card_id, card_name):
+def process_measure_fields(measure_fields, measure_aggs, calculation_fields, card_id, card_name):
     ## 数值字段数量 小于 聚合函数数量,不合法
     if len(measure_fields) < len(measure_aggs):
         print(f"警告: 卡片 {card_id} {card_name}: 数值字段数量小于聚合函数数量,不合法")
@@ -214,6 +214,17 @@ def process_calculation_fields(measure_fields, measure_aggs, calculation_fields,
                 new_measure_aggs.append(measure_aggs.pop(0))
     return new_measure_fields, new_measure_aggs, agg_flag
 
+# Deduplicate SQL fragments: keep the first occurrence of each in input order, skipping falsy (e.g. empty) entries
+def dedupe_sql_parts(parts):
+    deduped = []
+    seen = set()
+    for part in parts:
+        if not part or part in seen:
+            continue
+        seen.add(part)
+        deduped.append(part)
+    return deduped
+
 def quote_identifier(identifier, formula=False):
     if not QUOTE_FLAG:
         return identifier
@@ -492,6 +503,10 @@ def build_sql_query(card_data, added_fields_info, dataset_fid_name_map):
 
     # 处理字段重命名关系
     fields_rename_map = get_fields_rename_map(card_data.get("field_info", ""))
+
+    # 处理field_id与重命名关系,用于筛选Order by子句中的字段
+    # 需要处理的只有日期转换类型,将转换前的原始字段名加入map
+    # 只需要更新有重命名的字段即可
     selected_fid_alias_map = dict(zip(dimension_fids+measure_fids, dimension_fields+measure_fields))
 
     # 构建WITH
@@ -516,6 +531,7 @@ def build_sql_query(card_data, added_fields_info, dataset_fid_name_map):
     # 构建SELECT
     select_parts = []
     has_aggregation = False
+    non_aggregated_select_parts = []
     
     # 添加维度字段
     for field in dimension_fields:
@@ -529,7 +545,7 @@ def build_sql_query(card_data, added_fields_info, dataset_fid_name_map):
             selected_fid_alias_map[fid] = field
     
     # 加工计算字段
-    new_measure_fields, measure_aggs, agg_flag = process_calculation_fields(measure_fields, measure_aggs, added_fields_info, card_id, card_name)
+    new_measure_fields, measure_aggs, agg_flag = process_measure_fields(measure_fields, measure_aggs, added_fields_info, card_id, card_name)
     if agg_flag:
         has_aggregation = True
     for i, field in enumerate(new_measure_fields):
@@ -540,6 +556,8 @@ def build_sql_query(card_data, added_fields_info, dataset_fid_name_map):
             if not alias or alias == "null":
                 alias = measure_fields[i]
             select_parts.append(f"{field} AS {quote_identifier(alias)}")
+            if field and re.search(r"\b(sum|avg|count|max|min|stddev|variance|collect_list|collect_set|percentile|percentile_approx)|\s*\(", field, flags=re.IGNORECASE) is None:
+                non_aggregated_select_parts.append(field)
             selected_fid_alias_map[fid] = alias
         else:
             has_aggregation = True
@@ -578,9 +596,12 @@ def build_sql_query(card_data, added_fields_info, dataset_fid_name_map):
 
     # 构建GROUPBY
     group_by_clause = ""
-    if has_aggregation and dimension_fields:
+    if has_aggregation:
         group_by_parts = [quote_identifier(field) for field in dimension_fields]
-        group_by_clause = "GROUP BY " + ", ".join(group_by_parts)
+        group_by_parts.extend(non_aggregated_select_parts)
+        group_by_parts = dedupe_sql_parts(group_by_parts)
+        if group_by_parts:
+            group_by_clause = "GROUP BY " + ", ".join(group_by_parts)
     
     # 构建ORDERBY
     order_by_clause = ""
@@ -597,7 +618,7 @@ def build_sql_query(card_data, added_fields_info, dataset_fid_name_map):
     
     # 组装SQL
     sql_parts = [with_part, select_clause, from_clause]
-
+    # 返回 select, where, groupby, orderby
     return ("\n".join(sql_parts)).strip(), json.dumps(filter_conditions, ensure_ascii=False), group_by_clause, order_by_clause
 
 def generate():
@@ -638,3 +659,4 @@ def generate():
 if __name__ == "__main__":
     df = generate()
     df.to_parquet("output/sql.parquet")
+    df.to_excel("output/sql.xlsx")  # NOTE(review): presumably a pandas DataFrame; .xlsx output then needs an Excel writer engine (e.g. openpyxl) installed — confirm