Skip to content

Redundant Joins with Time Spine and Multiple Fact Tables #9499

@tlangton3

Description

@tlangton3

Describe the bug
I've identified a pattern of redundant joins when querying across multiple fact tables using a time spine for conformance. Employing a time spine for this purpose is a fairly standard approach in analytics, ensuring consistent reporting across different datasets at a specific grain (e.g., daily). When I issue a query against the time_spine requesting measures from both fact_table_1 and fact_table_2, the generated SQL unnecessarily carries out the join to each fact table multiple times. This redundancy significantly increases query complexity and negatively impacts performance.

SELECT
  q_0.`time_spine__date_day`,
  `fact_table_1__value_sum` `fact_table_1__value_sum`,
  `fact_table_2__value_sum` `fact_table_2__value_sum`
FROM
  (
    SELECT
      `keys`.`time_spine__date_day`,
      sum(`fact_table_1_key__fact_table_1`.value) `fact_table_1__value_sum`
    FROM
      (
        SELECT
          DISTINCT DATETIME_TRUNC(
            DATETIME(
              timestamp(`fact_table_1_key__time_spine`.date),
              'UTC'
            ),
            DAY
          ) `time_spine__date_day`,
          `fact_table_1_key__fact_table_1`.surrogate_key `fact_table_1__surrogate_key`
        FROM
          (
            select
              date
            from
              unnest(generate_date_array('2025-04-19', '2025-04-22')) as date
          ) AS `fact_table_1_key__time_spine`
          LEFT JOIN (
            select
              date('2025-04-19') as record_date,
              10 as value,
              'a' as surrogate_key
            union all
            select
              date('2025-04-19') as record_date,
              10 as value,
              'b' as surrogate_key
          ) AS `fact_table_1_key__fact_table_1` ON DATETIME_TRUNC(
            DATETIME(
              timestamp(`fact_table_1_key__time_spine`.date),
              'UTC'
            ),
            DAY
          ) = DATETIME_TRUNC(
            DATETIME(
              timestamp(`fact_table_1_key__fact_table_1`.record_date),
              'UTC'
            ),
            DAY
          )
          LEFT JOIN (
            select
              date('2025-04-19') as record_date,
              20 as value,
              'c' as surrogate_key
            union all
            select
              date('2025-04-19') as record_date,
              20 as value,
              'd' as surrogate_key
          ) AS `fact_table_1_key__fact_table_2` ON DATETIME_TRUNC(
            DATETIME(
              timestamp(`fact_table_1_key__time_spine`.date),
              'UTC'
            ),
            DAY
          ) = DATETIME_TRUNC(
            DATETIME(
              timestamp(`fact_table_1_key__fact_table_2`.record_date),
              'UTC'
            ),
            DAY
          )
      ) AS `keys`
      LEFT JOIN (
        select
          date('2025-04-19') as record_date,
          10 as value,
          'a' as surrogate_key
        union all
        select
          date('2025-04-19') as record_date,
          10 as value,
          'b' as surrogate_key
      ) AS `fact_table_1_key__fact_table_1` ON `keys`.`fact_table_1__surrogate_key` = `fact_table_1_key__fact_table_1`.surrogate_key
    GROUP BY
      1
  ) as q_0
  INNER JOIN (
    SELECT
      `keys`.`time_spine__date_day`,
      sum(`fact_table_2_key__fact_table_2`.value) `fact_table_2__value_sum`
    FROM
      (
        SELECT
          DISTINCT DATETIME_TRUNC(
            DATETIME(
              timestamp(`fact_table_2_key__time_spine`.date),
              'UTC'
            ),
            DAY
          ) `time_spine__date_day`,
          `fact_table_2_key__fact_table_2`.surrogate_key `fact_table_2__surrogate_key`
        FROM
          (
            select
              date
            from
              unnest(generate_date_array('2025-04-19', '2025-04-22')) as date
          ) AS `fact_table_2_key__time_spine`
          LEFT JOIN (
            select
              date('2025-04-19') as record_date,
              10 as value,
              'a' as surrogate_key
            union all
            select
              date('2025-04-19') as record_date,
              10 as value,
              'b' as surrogate_key
          ) AS `fact_table_2_key__fact_table_1` ON DATETIME_TRUNC(
            DATETIME(
              timestamp(`fact_table_2_key__time_spine`.date),
              'UTC'
            ),
            DAY
          ) = DATETIME_TRUNC(
            DATETIME(
              timestamp(`fact_table_2_key__fact_table_1`.record_date),
              'UTC'
            ),
            DAY
          )
          LEFT JOIN (
            select
              date('2025-04-19') as record_date,
              20 as value,
              'c' as surrogate_key
            union all
            select
              date('2025-04-19') as record_date,
              20 as value,
              'd' as surrogate_key
          ) AS `fact_table_2_key__fact_table_2` ON DATETIME_TRUNC(
            DATETIME(
              timestamp(`fact_table_2_key__time_spine`.date),
              'UTC'
            ),
            DAY
          ) = DATETIME_TRUNC(
            DATETIME(
              timestamp(`fact_table_2_key__fact_table_2`.record_date),
              'UTC'
            ),
            DAY
          )
      ) AS `keys`
      LEFT JOIN (
        select
          date('2025-04-19') as record_date,
          20 as value,
          'c' as surrogate_key
        union all
        select
          date('2025-04-19') as record_date,
          20 as value,
          'd' as surrogate_key
      ) AS `fact_table_2_key__fact_table_2` ON `keys`.`fact_table_2__surrogate_key` = `fact_table_2_key__fact_table_2`.surrogate_key
    GROUP BY
      1
  ) as q_1 ON (
    q_0.`time_spine__date_day` = q_1.`time_spine__date_day`
    OR (
      q_0.`time_spine__date_day` IS NULL
      AND q_1.`time_spine__date_day` IS NULL
    )
  )
ORDER BY
  1 ASC
LIMIT
  10000

To Reproduce
See screenshots and cube schema provided.

Expected behavior
Ideally, the query should perform the join from the time_spine to each fact table only once to retrieve the necessary data for all the requested measures. This would lead to a more efficient and less complex SQL query.

The current behavior of repeating the joins makes this sort of pattern difficult to implement efficiently, leading to significant performance issues, especially when querying across multiple fact tables, even with moderately sized datasets. In one instance in my project, the generated query took ten minutes to run. Manually optimising the query to remove the redundant joins reduced the runtime to under five seconds, demonstrating the significant overhead caused by this issue.

Screenshots

Image

Minimally reproducible Cube Schema
*My engine is BQ.

cubes:
  - name: time_spine
    sql: select date from unnest(generate_date_array('2025-04-19', '2025-04-22')) as date
    public: true
    joins:
      - name: fact_table_1
        sql: >
          {date.day} = {fact_table_1.record_date.day}
        relationship: one_to_many

      - name: fact_table_2
        sql: >
          {date.day} = {fact_table_2.record_date.day}
        relationship: one_to_many

    dimensions:
      - name: date
        sql: "timestamp({CUBE}.date)"
        type: time
        primary_key: true


  - name: fact_table_1
    sql: >
      select date('2025-04-19') as record_date, 10 as value, 'a' as surrogate_key
      union all
      select date('2025-04-19') as record_date, 10 as value, 'b' as surrogate_key
    public: true

    dimensions:
      - name: surrogate_key
        sql: "{CUBE}.surrogate_key"
        type: string
        primary_key: true

      - name: record_date
        sql: "timestamp({CUBE}.record_date)"
        type: time
      
      - name: value
        sql: "{CUBE}.value"
        type: number

    measures:
      - name: value_sum
        sql: "{value}"
        type: sum


  - name: fact_table_2
    sql: >
      select date('2025-04-19') as record_date, 20 as value, 'c' as surrogate_key
      union all
      select date('2025-04-19') as record_date, 20 as value, 'd' as surrogate_key
    public: true

    dimensions:
      - name: surrogate_key
        sql: "{CUBE}.surrogate_key"
        type: string
        primary_key: true

      - name: record_date
        sql: "timestamp({CUBE}.record_date)"
        type: time
      
      - name: value
        sql: "{CUBE}.value"
        type: number

    measures:
      - name: value_sum
        sql: "{value}"
        type: sum

Version:
1.3.5

Additional context

Metadata

Metadata

Assignees

No one assigned

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions