
fastflowtransform.executors.bigquery.base

BigQueryBaseExecutor

Bases: SqlIdentifierMixin, BaseExecutor[TFrame]

Shared BigQuery executor logic (SQL, incremental, meta, DQ helpers).

Subclasses are responsible for:
  • frame type (pandas / BigFrames / ...)
  • _read_relation()
  • _materialize_relation()
  • _is_frame()
  • _frame_name()
Source code in src/fastflowtransform/executors/bigquery/base.py
class BigQueryBaseExecutor(SqlIdentifierMixin, BaseExecutor[TFrame]):
    """
    Shared BigQuery executor logic (SQL, incremental, meta, DQ helpers).

    Subclasses are responsible for:
      - frame type (pandas / BigFrames / ...)
      - _read_relation()
      - _materialize_relation()
      - _is_frame()
      - _frame_name()
    """

    # Subclasses override ENGINE_NAME ("bigquery", "bigquery_batch", ...)
    ENGINE_NAME = "bigquery_base"
    runtime_query_stats: BigQueryQueryStatsRuntime
    runtime_budget: BigQueryBudgetRuntime

    def __init__(
        self,
        project: str,
        dataset: str,
        location: str | None = None,
        client: Client | None = None,
        allow_create_dataset: bool = False,
    ):
        self.project = project
        self.dataset = dataset
        self.location = location
        self.allow_create_dataset = allow_create_dataset
        self.client: Client = client or bigquery.Client(
            project=self.project,
            location=self.location,
        )
        self.runtime_query_stats = BigQueryQueryStatsRuntime(self)
        self.runtime_budget = BigQueryBudgetRuntime(self)
        self.snapshot_runtime = BigQuerySnapshotRuntime(self)

    # ---- Identifier helpers ----
    def _bq_quote(self, value: str) -> str:
        return value.replace("`", "\\`")

    def _quote_identifier(self, ident: str) -> str:
        return self._bq_quote(ident)

    def _default_schema(self) -> str | None:
        return self.dataset

    def _default_catalog(self) -> str | None:
        return self.project

    def _should_include_catalog(
        self, catalog: str | None, schema: str | None, *, explicit: bool
    ) -> bool:
        # BigQuery always expects a project + dataset.
        return True

    def _qualify_identifier(
        self,
        ident: str,
        *,
        schema: str | None = None,
        catalog: str | None = None,
        quote: bool = True,
    ) -> str:
        proj = self._clean_part(catalog) or self._default_catalog()
        dset = self._clean_part(schema) or self._default_schema()
        normalized = self._normalize_identifier(ident)
        parts = [proj, dset, normalized]
        if not quote:
            return ".".join(p for p in parts if p)
        return f"`{'.'.join(self._bq_quote(p) for p in parts if p)}`"

    def _qualified_identifier(
        self, relation: str, project: str | None = None, dataset: str | None = None
    ) -> str:
        return self._qualify_identifier(relation, schema=dataset, catalog=project)

    def _qualified_api_identifier(
        self, relation: str, project: str | None = None, dataset: str | None = None
    ) -> str:
        """
        Build an API-safe identifier (project.dataset.table) without backticks.
        """
        return self._qualify_identifier(
            relation,
            schema=dataset,
            catalog=project,
            quote=False,
        )

    def _ensure_dataset(self) -> None:
        ds_id = f"{self.project}.{self.dataset}"
        try:
            self.client.get_dataset(ds_id)
            return
        except NotFound:
            if not getattr(self, "allow_create_dataset", False):
                raise

        ds_obj = bigquery.Dataset(ds_id)
        if getattr(self, "location", None):
            ds_obj.location = self.location
        self.client.create_dataset(ds_obj, exists_ok=True)

    def execute_test_sql(self, stmt: Any) -> Any:
        """
        Execute lightweight SQL for DQ tests using the BigQuery client.
        """

        def _infer_param_type(value: Any) -> str:
            if isinstance(value, bool):
                return "BOOL"
            if isinstance(value, int) and not isinstance(value, bool):
                return "INT64"
            if isinstance(value, float):
                return "FLOAT64"
            return "STRING"

        def _run_job(sql: str, params: dict[str, Any] | None = None) -> Any:
            job_config = bigquery.QueryJobConfig()
            if self.dataset:
                job_config.default_dataset = bigquery.DatasetReference(self.project, self.dataset)
            if params:
                job_config.query_parameters = [
                    bigquery.ScalarQueryParameter(k, _infer_param_type(v), v)
                    for k, v in params.items()
                ]
            return self.client.query(sql, job_config=job_config, location=self.location)

        def _run_one(s: Any) -> Any:
            statement_len = 2
            if (
                isinstance(s, tuple)
                and len(s) == statement_len
                and isinstance(s[0], str)
                and isinstance(s[1], dict)
            ):
                return _run_job(s[0], s[1]).result()
            if isinstance(s, str):
                # Use guarded execution path for simple statements
                return self._execute_sql(s).result()
            if isinstance(s, Iterable) and not isinstance(s, (bytes, bytearray, str)):
                res = None
                for item in s:
                    res = _run_one(item)
                return res
            return _run_job(str(s)).result()

        return make_fetchable(_run_one(stmt))

    def compute_freshness_delay_minutes(self, table: str, ts_col: str) -> tuple[float | None, str]:
        sql = (
            f"select cast(TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), max({ts_col}), MINUTE) as float64) "
            f"as delay_min from {table}"
        )
        res = self.execute_test_sql(sql)
        delay = getattr(res, "fetchone", lambda: None)()
        val = delay[0] if delay else None
        return (float(val) if val is not None else None, sql)

    def _execute_sql_basic(self, sql: str) -> _TrackedQueryJob:
        job_config = bigquery.QueryJobConfig()
        if self.dataset:
            # Let unqualified tables resolve to project.dataset.table
            job_config.default_dataset = bigquery.DatasetReference(self.project, self.dataset)

        job = self.client.query(
            sql,
            job_config=job_config,
            location=self.location,
        )
        return self.runtime_query_stats.wrap_job(job)

    def _execute_sql(self, sql: str) -> _TrackedQueryJob:
        """
        Central BigQuery query runner.

        - All 'real' SQL statements in this executor should go through here.
        - Returns the QueryJob so callers can call .result().
        """

        def _exec() -> _TrackedQueryJob:
            return self._execute_sql_basic(sql)

        return self.runtime_budget.run_sql(
            sql,
            exec_fn=_exec,
            stats_runtime=self.runtime_query_stats,
            record_stats=False,
        )

    # ---- DQ test table formatting (fft test) ----
    def _format_test_table(self, table: str | None) -> str | None:
        """
        Ensure tests use fully-qualified BigQuery identifiers in fft test.
        """
        table = super()._format_test_table(table)
        if not isinstance(table, str):
            return table
        stripped = table.strip()
        if not stripped or stripped.startswith("`"):
            return stripped
        if "." in stripped:
            return stripped
        return self._qualified_identifier(stripped)

    # ---- SQL hooks ----
    def _this_identifier(self, node: Node) -> str:
        """
        Ensure {{ this }} renders as a fully-qualified identifier so BigQuery
        incremental SQL (e.g., subqueries against {{ this }}) includes project
        and dataset.
        """
        return self._qualify_identifier(relation_for(node.name))

    def _apply_sql_materialization(
        self,
        node: Node,
        target_sql: str,
        select_body: str,
        materialization: str,
    ) -> None:
        self._ensure_dataset()
        try:
            super()._apply_sql_materialization(node, target_sql, select_body, materialization)
        except BadRequest as e:
            raise RuntimeError(
                f"BigQuery SQL failed for {target_sql}:\n{select_body}\n\n{e}"
            ) from e

    def _create_or_replace_view(self, target_sql: str, select_body: str, node: Node) -> None:
        self._execute_sql_basic(f"CREATE OR REPLACE VIEW {target_sql} AS {select_body}").result()

    def _create_or_replace_table(self, target_sql: str, select_body: str, node: Node) -> None:
        self._execute_sql(f"CREATE OR REPLACE TABLE {target_sql} AS {select_body}").result()

    def _create_or_replace_view_from_table(
        self,
        view_name: str,
        backing_table: str,
        node: Node,
    ) -> None:
        view_id = self._qualified_identifier(view_name)
        back_id = self._qualified_identifier(backing_table)
        self._ensure_dataset()
        self._execute_sql_basic(
            f"CREATE OR REPLACE VIEW {view_id} AS SELECT * FROM {back_id}"
        ).result()

    # ---- Meta hook ----
    def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None:
        """
        Write/update dataset._ff_meta after a successful build.
        Both pandas + BigFrames executors use the logical engine key 'bigquery'.
        """
        ensure_meta_table(self)
        upsert_meta(self, node.name, relation, fingerprint, "bigquery")

    # ── Incremental API (shared across BigQuery executors) ───────────────
    def exists_relation(self, relation: str) -> bool:
        """
        Check presence in INFORMATION_SCHEMA for tables/views.
        """
        proj = self.project
        dset = self.dataset
        rel = relation
        q = f"""
        SELECT 1
        FROM `{proj}.{dset}.INFORMATION_SCHEMA.TABLES`
        WHERE LOWER(table_name)=LOWER(@rel)
        UNION ALL
        SELECT 1
        FROM `{proj}.{dset}.INFORMATION_SCHEMA.VIEWS`
        WHERE LOWER(table_name)=LOWER(@rel)
        LIMIT 1
        """
        job = self.client.query(
            q,
            job_config=bigquery.QueryJobConfig(
                query_parameters=[bigquery.ScalarQueryParameter("rel", "STRING", rel)]
            ),
            location=self.location,
        )
        return bool(list(job.result()))

    def create_table_as(self, relation: str, select_sql: str) -> None:
        """
        CREATE TABLE AS with cleaned SELECT body (no trailing semicolons).
        """
        self._ensure_dataset()
        body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
        target = self._qualified_identifier(
            relation,
            project=self.project,
            dataset=self.dataset,
        )
        self._execute_sql(f"CREATE TABLE {target} AS {body}").result()

    def incremental_insert(self, relation: str, select_sql: str) -> None:
        """
        INSERT INTO with cleaned SELECT body.
        """
        self._ensure_dataset()
        body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
        target = self._qualified_identifier(
            relation,
            project=self.project,
            dataset=self.dataset,
        )
        self._execute_sql(f"INSERT INTO {target} {body}").result()

    def incremental_merge(self, relation: str, select_sql: str, unique_key: list[str]) -> None:
        """
        Portable fallback without native MERGE:
          - DELETE collisions via WHERE EXISTS against the cleaned SELECT body
          - INSERT new rows from the same body
        """
        self._ensure_dataset()
        body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
        target = self._qualified_identifier(
            relation,
            project=self.project,
            dataset=self.dataset,
        )
        pred = " AND ".join([f"t.{k}=s.{k}" for k in unique_key]) or "FALSE"

        delete_sql = f"""
        DELETE FROM {target} t
        WHERE EXISTS (SELECT 1 FROM ({body}) s WHERE {pred})
        """
        self._execute_sql(delete_sql).result()

        insert_sql = f"INSERT INTO {target} SELECT * FROM ({body})"
        self._execute_sql(insert_sql).result()

    def alter_table_sync_schema(
        self,
        relation: str,
        select_sql: str,
        *,
        mode: str = "append_new_columns",
    ) -> None:
        """
        Best-effort additive schema sync:
          - infer select schema via LIMIT 0 query
          - add missing columns as NULLABLE using inferred BigQuery types
        """
        if mode not in {"append_new_columns", "sync_all_columns"}:
            return
        self._ensure_dataset()

        body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")

        # Infer schema using a no-row query (lets BigQuery type the expressions)
        probe = self.client.query(
            f"SELECT * FROM ({body}) WHERE 1=0",
            job_config=bigquery.QueryJobConfig(dry_run=False, use_query_cache=False),
            location=self.location,
        )
        probe.result()
        out_fields = {f.name: f for f in (probe.schema or [])}

        # Existing table schema
        table_ref = f"{self.project}.{self.dataset}.{relation}"
        try:
            tbl = self.client.get_table(table_ref)
        except NotFound:
            return
        existing_cols = {f.name for f in (tbl.schema or [])}

        to_add = [name for name in out_fields if name not in existing_cols]
        if not to_add:
            return

        target = self._qualified_identifier(
            relation,
            project=self.project,
            dataset=self.dataset,
        )
        for col in to_add:
            f = out_fields[col]
            typ = str(f.field_type) if hasattr(f, "field_type") else "STRING"
            self._execute_sql_basic(f"ALTER TABLE {target} ADD COLUMN {col} {typ}").result()

    # ── Snapshots API (shared for pandas + BigFrames) ─────────────────────

    def execute_hook_sql(self, sql: str) -> None:
        """
        Execute one SQL statement for pre/post/on_run hooks.
        """
        self._execute_sql(sql).result()

    # ---- Snapshot runtime delegation (shared for pandas + BigFrames) ----
    def run_snapshot_sql(self, node: Node, env: Any) -> None:
        self.snapshot_runtime.run_snapshot_sql(node, env)

    def snapshot_prune(
        self,
        relation: str,
        unique_key: list[str],
        keep_last: int,
        *,
        dry_run: bool = False,
    ) -> None:
        self.snapshot_runtime.snapshot_prune(
            relation,
            unique_key,
            keep_last,
            dry_run=dry_run,
        )

    def _introspect_columns_metadata(
        self,
        table: str,
        *,
        column: str | None = None,
    ) -> list[tuple[str, str]]:
        """
        Internal helper: return [(column_name_lower, data_type_upper), ...]
        for a BigQuery table using INFORMATION_SCHEMA.COLUMNS.

        Accepts:
          - `table` as "table" or "dataset.table" or "project.dataset.table"
          - optional `column` to restrict to a single column
        """
        project = self.project
        dataset = self.dataset
        table_name = table

        parts = table.split(".")
        if len(parts) == 3:
            project, dataset, table_name = parts
        elif len(parts) == 2:
            dataset, table_name = parts

        table_name = table_name.strip("`")
        dataset = dataset.strip("`") if dataset else dataset
        project = project.strip("`") if project else project

        if not table_name:
            return []

        where = ["lower(table_name) = lower(@t)"]
        params = [bigquery.ScalarQueryParameter("t", "STRING", table_name)]

        if column is not None:
            where.append("lower(column_name) = lower(@c)")
            params.append(bigquery.ScalarQueryParameter("c", "STRING", column))

        sql = f"""
        select lower(column_name) as column_name, upper(data_type) as data_type
        from `{project}.{dataset}.INFORMATION_SCHEMA.COLUMNS`
        where {" and ".join(where)}
        order by ordinal_position
        """

        job = self.client.query(
            sql,
            job_config=bigquery.QueryJobConfig(
                query_parameters=params,
                default_dataset=bigquery.DatasetReference(project, dataset),
            ),
            location=self.location,
        )
        rows = list(job.result())
        return [(str(r[0]), str(r[1])) for r in rows]

    def introspect_column_physical_type(self, table: str, column: str) -> str | None:
        rows = self._introspect_columns_metadata(table, column=column)
        return rows[0][1] if rows else None

    def introspect_table_physical_schema(self, table: str) -> dict[str, str]:
        rows = self._introspect_columns_metadata(table, column=None)
        # keys are lowercased to match the DuckRuntimeContracts verify logic
        return {name: dtype for (name, dtype) in rows}

    def collect_docs_columns(self) -> dict[str, list[ColumnInfo]]:
        """
        Column metadata for docs (project+dataset scoped).
        """
        sql = f"""
        select table_name, column_name, data_type, is_nullable
        from `{self.project}.{self.dataset}.INFORMATION_SCHEMA.COLUMNS`
        order by table_name, ordinal_position
        """
        try:
            job = self.client.query(
                sql,
                job_config=bigquery.QueryJobConfig(
                    default_dataset=bigquery.DatasetReference(self.project, self.dataset)
                ),
                location=self.location,
            )
            rows = list(job.result())
        except Exception:
            return {}

        out: dict[str, list[ColumnInfo]] = {}
        for row in rows:
            table = str(row["table_name"])
            col = str(row["column_name"])
            dtype = str(row["data_type"])
            nullable = str(row["is_nullable"]).upper() == "YES"
            out.setdefault(table, []).append(ColumnInfo(col, dtype, nullable))
        return out

    def load_seed(self, table: str, df: Any, schema: str | None = None) -> tuple[bool, str, bool]:
        dataset_id = schema or self.dataset

        table_id = self._qualified_api_identifier(
            table,
            project=self.project,
            dataset=dataset_id,
        )
        full_name = table_id
        self._ensure_dataset()

        job_config = bigquery.LoadJobConfig(
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
        )

        load_job = self.client.load_table_from_dataframe(df, table_id, job_config=job_config)
        load_job.result()

        return True, full_name, False
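
The snippet below is a minimal, illustrative sketch of how a concrete subclass plugs into this base class, assuming a pandas-backed executor. The subclass name and the override signatures are assumptions for illustration; only the list of required overrides comes from the class docstring above.

import pandas as pd

class PandasBigQueryExecutor(BigQueryBaseExecutor[pd.DataFrame]):
    # Logical engine key reported for this executor.
    ENGINE_NAME = "bigquery"

    def _read_relation(self, relation: str) -> pd.DataFrame:
        # Read a fully-qualified relation into a pandas DataFrame.
        sql = f"SELECT * FROM {self._qualified_identifier(relation)}"
        return self.client.query(sql, location=self.location).to_dataframe()

    def _materialize_relation(self, relation: str, frame: pd.DataFrame, node) -> None:
        # Persist a DataFrame via a BigQuery load job.
        table_id = self._qualified_api_identifier(relation)
        self.client.load_table_from_dataframe(frame, table_id).result()

    def _is_frame(self, obj) -> bool:
        return isinstance(obj, pd.DataFrame)

    def _frame_name(self) -> str:
        return "pandas.DataFrame"

executor = PandasBigQueryExecutor(
    project="my-project",
    dataset="analytics",
    location="EU",
    allow_create_dataset=True,
)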

execute_test_sql

execute_test_sql(stmt)

Execute lightweight SQL for DQ tests using the BigQuery client.

Source code in src/fastflowtransform/executors/bigquery/base.py
def execute_test_sql(self, stmt: Any) -> Any:
    """
    Execute lightweight SQL for DQ tests using the BigQuery client.
    """

    def _infer_param_type(value: Any) -> str:
        if isinstance(value, bool):
            return "BOOL"
        if isinstance(value, int) and not isinstance(value, bool):
            return "INT64"
        if isinstance(value, float):
            return "FLOAT64"
        return "STRING"

    def _run_job(sql: str, params: dict[str, Any] | None = None) -> Any:
        job_config = bigquery.QueryJobConfig()
        if self.dataset:
            job_config.default_dataset = bigquery.DatasetReference(self.project, self.dataset)
        if params:
            job_config.query_parameters = [
                bigquery.ScalarQueryParameter(k, _infer_param_type(v), v)
                for k, v in params.items()
            ]
        return self.client.query(sql, job_config=job_config, location=self.location)

    def _run_one(s: Any) -> Any:
        statement_len = 2
        if (
            isinstance(s, tuple)
            and len(s) == statement_len
            and isinstance(s[0], str)
            and isinstance(s[1], dict)
        ):
            return _run_job(s[0], s[1]).result()
        if isinstance(s, str):
            # Use guarded execution path for simple statements
            return self._execute_sql(s).result()
        if isinstance(s, Iterable) and not isinstance(s, (bytes, bytearray, str)):
            res = None
            for item in s:
                res = _run_one(item)
            return res
        return _run_job(str(s)).result()

    return make_fetchable(_run_one(stmt))
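
A hedged usage sketch of the statement shapes accepted above; executor stands for a concrete BigQuery executor instance and the table/column names are placeholders. The return value is wrapped by make_fetchable, and compute_freshness_delay_minutes in this module reads it via fetchone(), so a DB-API-like fetch interface is assumed here.

# 1) Plain string: routed through the guarded _execute_sql path.
res = executor.execute_test_sql("SELECT COUNT(*) AS n FROM orders WHERE amount < 0")
row = res.fetchone()

# 2) (sql, params) tuple: values become ScalarQueryParameter objects,
#    typed via _infer_param_type (BOOL / INT64 / FLOAT64 / STRING).
res = executor.execute_test_sql(
    ("SELECT COUNT(*) AS n FROM orders WHERE status = @status", {"status": "open"})
)

# 3) Iterable of statements: each item is executed in order and the
#    result of the last one is returned.
res = executor.execute_test_sql([
    "SELECT COUNT(*) AS n FROM orders",
    ("SELECT COUNT(*) AS n FROM orders WHERE amount > @min", {"min": 100}),
])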

on_node_built

on_node_built(node, relation, fingerprint)

Write/update dataset._ff_meta after a successful build. Both pandas + BigFrames executors use the logical engine key 'bigquery'.

Source code in src/fastflowtransform/executors/bigquery/base.py
def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None:
    """
    Write/update dataset._ff_meta after a successful build.
    Both pandas + BigFrames executors use the logical engine key 'bigquery'.
    """
    ensure_meta_table(self)
    upsert_meta(self, node.name, relation, fingerprint, "bigquery")

exists_relation

exists_relation(relation)

Check presence in INFORMATION_SCHEMA for tables/views.

Source code in src/fastflowtransform/executors/bigquery/base.py
def exists_relation(self, relation: str) -> bool:
    """
    Check presence in INFORMATION_SCHEMA for tables/views.
    """
    proj = self.project
    dset = self.dataset
    rel = relation
    q = f"""
    SELECT 1
    FROM `{proj}.{dset}.INFORMATION_SCHEMA.TABLES`
    WHERE LOWER(table_name)=LOWER(@rel)
    UNION ALL
    SELECT 1
    FROM `{proj}.{dset}.INFORMATION_SCHEMA.VIEWS`
    WHERE LOWER(table_name)=LOWER(@rel)
    LIMIT 1
    """
    job = self.client.query(
        q,
        job_config=bigquery.QueryJobConfig(
            query_parameters=[bigquery.ScalarQueryParameter("rel", "STRING", rel)]
        ),
        location=self.location,
    )
    return bool(list(job.result()))

create_table_as

create_table_as(relation, select_sql)

CREATE TABLE AS with cleaned SELECT body (no trailing semicolons).

Source code in src/fastflowtransform/executors/bigquery/base.py
def create_table_as(self, relation: str, select_sql: str) -> None:
    """
    CREATE TABLE AS with cleaned SELECT body (no trailing semicolons).
    """
    self._ensure_dataset()
    body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
    target = self._qualified_identifier(
        relation,
        project=self.project,
        dataset=self.dataset,
    )
    self._execute_sql(f"CREATE TABLE {target} AS {body}").result()

incremental_insert

incremental_insert(relation, select_sql)

INSERT INTO with cleaned SELECT body.

Source code in src/fastflowtransform/executors/bigquery/base.py
def incremental_insert(self, relation: str, select_sql: str) -> None:
    """
    INSERT INTO with cleaned SELECT body.
    """
    self._ensure_dataset()
    body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
    target = self._qualified_identifier(
        relation,
        project=self.project,
        dataset=self.dataset,
    )
    self._execute_sql(f"INSERT INTO {target} {body}").result()
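
A sketch of how the shared incremental API composes from the caller side (the real dispatch lives in the incremental engine, which is not part of this module); executor, the relation name, and the SELECT are placeholders.

select_sql = "SELECT id, amount, updated_at FROM `my-project.raw.orders`"

if not executor.exists_relation("orders_incr"):
    # First run: full build via CREATE TABLE AS.
    executor.create_table_as("orders_incr", select_sql)
else:
    # Subsequent runs with an append-only strategy: INSERT INTO the
    # existing table from the cleaned SELECT body.
    executor.incremental_insert("orders_incr", select_sql)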

incremental_merge

incremental_merge(relation, select_sql, unique_key)

Portable fallback without native MERGE:
  • DELETE collisions via WHERE EXISTS against the cleaned SELECT body
  • INSERT new rows from the same body
Source code in src/fastflowtransform/executors/bigquery/base.py
def incremental_merge(self, relation: str, select_sql: str, unique_key: list[str]) -> None:
    """
    Portable fallback without native MERGE:
      - DELETE collisions via WHERE EXISTS against the cleaned SELECT body
      - INSERT new rows from the same body
    """
    self._ensure_dataset()
    body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
    target = self._qualified_identifier(
        relation,
        project=self.project,
        dataset=self.dataset,
    )
    pred = " AND ".join([f"t.{k}=s.{k}" for k in unique_key]) or "FALSE"

    delete_sql = f"""
    DELETE FROM {target} t
    WHERE EXISTS (SELECT 1 FROM ({body}) s WHERE {pred})
    """
    self._execute_sql(delete_sql).result()

    insert_sql = f"INSERT INTO {target} SELECT * FROM ({body})"
    self._execute_sql(insert_sql).result()
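
Illustration of the statements this fallback issues, assuming relation "orders_incr" and unique_key ["id"]; project and dataset names are placeholders and whitespace is simplified.

executor.incremental_merge(
    "orders_incr",
    "SELECT id, amount, updated_at FROM `my-project.raw.orders_delta`",
    unique_key=["id"],
)
# Roughly equivalent to running, in order:
#
#   DELETE FROM `my-project.analytics.orders_incr` t
#   WHERE EXISTS (SELECT 1 FROM (<select body>) s WHERE t.id=s.id)
#
#   INSERT INTO `my-project.analytics.orders_incr`
#   SELECT * FROM (<select body>)
#
# With an empty unique_key the DELETE predicate degrades to FALSE,
# so nothing is deleted and the call behaves like a plain insert.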

alter_table_sync_schema

alter_table_sync_schema(relation, select_sql, *, mode='append_new_columns')

Best-effort additive schema sync:
  • infer select schema via LIMIT 0 query
  • add missing columns as NULLABLE using inferred BigQuery types
Source code in src/fastflowtransform/executors/bigquery/base.py
def alter_table_sync_schema(
    self,
    relation: str,
    select_sql: str,
    *,
    mode: str = "append_new_columns",
) -> None:
    """
    Best-effort additive schema sync:
      - infer select schema via LIMIT 0 query
      - add missing columns as NULLABLE using inferred BigQuery types
    """
    if mode not in {"append_new_columns", "sync_all_columns"}:
        return
    self._ensure_dataset()

    body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")

    # Infer schema using a no-row query (lets BigQuery type the expressions)
    probe = self.client.query(
        f"SELECT * FROM ({body}) WHERE 1=0",
        job_config=bigquery.QueryJobConfig(dry_run=False, use_query_cache=False),
        location=self.location,
    )
    probe.result()
    out_fields = {f.name: f for f in (probe.schema or [])}

    # Existing table schema
    table_ref = f"{self.project}.{self.dataset}.{relation}"
    try:
        tbl = self.client.get_table(table_ref)
    except NotFound:
        return
    existing_cols = {f.name for f in (tbl.schema or [])}

    to_add = [name for name in out_fields if name not in existing_cols]
    if not to_add:
        return

    target = self._qualified_identifier(
        relation,
        project=self.project,
        dataset=self.dataset,
    )
    for col in to_add:
        f = out_fields[col]
        typ = str(f.field_type) if hasattr(f, "field_type") else "STRING"
        self._execute_sql_basic(f"ALTER TABLE {target} ADD COLUMN {col} {typ}").result()
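
A sketch of the additive sync in practice; names and the extra column are placeholders. Existing columns are never dropped or retyped; only missing ones are added.

# Suppose the table already exists without a `discount` column and the
# model's SELECT now produces one. The probe query infers its type and
# the method issues roughly:
#
#   ALTER TABLE `my-project.analytics.orders_incr` ADD COLUMN discount FLOAT64
executor.alter_table_sync_schema(
    "orders_incr",
    "SELECT id, amount, updated_at, 0.0 AS discount FROM `my-project.raw.orders_delta`",
    mode="append_new_columns",
)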

execute_hook_sql

execute_hook_sql(sql)

Execute one SQL statement for pre/post/on_run hooks.

Source code in src/fastflowtransform/executors/bigquery/base.py
def execute_hook_sql(self, sql: str) -> None:
    """
    Execute one SQL statement for pre/post/on_run hooks.
    """
    self._execute_sql(sql).result()

collect_docs_columns

collect_docs_columns()

Column metadata for docs (project+dataset scoped).

Source code in src/fastflowtransform/executors/bigquery/base.py
def collect_docs_columns(self) -> dict[str, list[ColumnInfo]]:
    """
    Column metadata for docs (project+dataset scoped).
    """
    sql = f"""
    select table_name, column_name, data_type, is_nullable
    from `{self.project}.{self.dataset}.INFORMATION_SCHEMA.COLUMNS`
    order by table_name, ordinal_position
    """
    try:
        job = self.client.query(
            sql,
            job_config=bigquery.QueryJobConfig(
                default_dataset=bigquery.DatasetReference(self.project, self.dataset)
            ),
            location=self.location,
        )
        rows = list(job.result())
    except Exception:
        return {}

    out: dict[str, list[ColumnInfo]] = {}
    for row in rows:
        table = str(row["table_name"])
        col = str(row["column_name"])
        dtype = str(row["data_type"])
        nullable = str(row["is_nullable"]).upper() == "YES"
        out.setdefault(table, []).append(ColumnInfo(col, dtype, nullable))
    return out
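
A short consumption sketch; executor is a placeholder instance and the example table and column values are illustrative. ColumnInfo is constructed positionally as ColumnInfo(name, data_type, nullable); its attribute names are not assumed here.

docs_cols = executor.collect_docs_columns()
# e.g. {"orders_incr": [ColumnInfo("id", "INT64", False), ...], ...}
for table, columns in docs_cols.items():
    # One entry per column, ordered by ordinal_position.
    print(table, len(columns))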

configure_contracts

configure_contracts(contracts, project_contracts)

Inject parsed contracts into this executor instance. The run engine should call this once at startup.

Source code in src/fastflowtransform/executors/base.py
def configure_contracts(
    self,
    contracts: Mapping[str, ContractsFileModel] | None,
    project_contracts: ProjectContractsModel | None,
) -> None:
    """
    Inject parsed contracts into this executor instance.
    The run engine should call this once at startup.
    """
    self._ff_contracts = contracts or {}
    self._ff_project_contracts = project_contracts

run_sql

run_sql(node, env)
Orchestrate SQL models:

  1) Render Jinja (ref/source/this) and strip leading {{ config(...) }}.
  2) If the SQL is full DDL (CREATE …), execute it verbatim (passthrough).
  3) Otherwise, normalize to CREATE OR REPLACE {TABLE|VIEW} AS <body>.
     The body is CTE-aware (keeps WITH … SELECT … intact).

On failure, raise ModelExecutionError with a helpful snippet.

Source code in src/fastflowtransform/executors/base.py
def run_sql(self, node: Node, env: Environment) -> None:
    """
    Orchestrate SQL models:
      1) Render Jinja (ref/source/this) and strip leading {{ config(...) }}.
      2) If the SQL is full DDL (CREATE …), execute it verbatim (passthrough).
      3) Otherwise, normalize to CREATE OR REPLACE {TABLE|VIEW} AS <body>.
         The body is CTE-aware (keeps WITH … SELECT … intact).
    On failure, raise ModelExecutionError with a helpful snippet.
    """
    meta = getattr(node, "meta", {}) or {}
    if self._meta_is_incremental(meta):
        # Delegates to incremental engine: render, schema sync, merge/insert, etc.
        return _ff_incremental.run_or_dispatch(self, node, env)

    if self._meta_is_snapshot(meta):
        # Snapshots are executed via the dedicated CLI: `fft snapshot run`.
        raise ModelExecutionError(
            node_name=node.name,
            relation=relation_for(node.name),
            message=(
                "Snapshot models cannot be executed via 'fft run'. "
                "Use 'fft snapshot run' instead."
            ),
            sql_snippet="",
        )

    sql_rendered = self.render_sql(
        node,
        env,
        ref_resolver=lambda name: self._resolve_ref(name, env),
        source_resolver=self._resolve_source,
    )
    sql = self._strip_leading_config(sql_rendered).strip()

    materialization = (node.meta or {}).get("materialized", "table")
    if materialization == "ephemeral":
        return

    # 1) Direct DDL passthrough (CREATE [OR REPLACE] {TABLE|VIEW} …)
    if self._looks_like_direct_ddl(sql):
        try:
            self._execute_sql_direct(sql, node)
            return
        except NotImplementedError:
            # Engine doesn't implement direct DDL → fall back to normalized materialization.
            pass
        except Exception as e:
            raise ModelExecutionError(
                node_name=node.name,
                relation=relation_for(node.name),
                message=str(e),
                sql_snippet=sql,
            ) from e

    # 2) Normalized materialization path (CTE-safe body)
    body = self._selectable_body(sql).rstrip(" ;\n\t")
    target_sql = self._format_relation_for_ref(node.name)

    # Centralized SQL preview logging (applies to ALL engines)
    preview = (
        f"=== MATERIALIZE ===\n"
        f"-- model: {node.name}\n"
        f"-- materialized: {materialization}\n"
        f"-- target: {target_sql}\n"
        f"{body}\n"
    )
    echo_debug(preview)

    try:
        runtime = getattr(self, "runtime_contracts", None)
        # contracts only for TABLE materialization for now
        if runtime is not None and materialization == "table":
            contracts = getattr(self, "_ff_contracts", {}) or {}
            project_contracts = getattr(self, "_ff_project_contracts", None)

            # keying: prefer the logical table name (contracts.table),
            # but node.name or relation_for(node.name) is usually what you want.
            logical_name = relation_for(node.name)
            contract = contracts.get(logical_name) or contracts.get(node.name)

            ctx = runtime.build_context(
                node=node,
                relation=logical_name,
                physical_table=target_sql,
                contract=contract,
                project_contracts=project_contracts,
                is_incremental=self._meta_is_incremental(meta),
            )
            # Engine-specific enforcement (verify/cast/off)
            runtime.apply_sql_contracts(ctx=ctx, select_body=body)
        else:
            # Old behavior
            self._apply_sql_materialization(node, target_sql, body, materialization)
    except Exception as e:
        preview = f"-- materialized={materialization}\n-- target={target_sql}\n{body}"
        raise ModelExecutionError(
            node_name=node.name,
            relation=relation_for(node.name),
            message=str(e),
            sql_snippet=preview,
        ) from e
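
Illustration of the two SQL paths described above, assuming a BigQuery target; both model bodies are placeholders.

# Path 1: the model renders to full DDL -> executed verbatim (passthrough).
ddl_model = """
CREATE OR REPLACE TABLE `my-project.analytics.daily_orders` AS
SELECT order_date, COUNT(*) AS n
FROM `my-project.raw.orders`
GROUP BY order_date
"""

# Path 2: the model renders to a SELECT (CTEs allowed) -> normalized to
#   CREATE OR REPLACE TABLE <target> AS <body>   (materialized: table)
#   CREATE OR REPLACE VIEW  <target> AS <body>   (materialized: view)
select_model = """
WITH recent AS (
    SELECT * FROM `my-project.raw.orders` WHERE order_date >= '2024-01-01'
)
SELECT order_date, COUNT(*) AS n FROM recent GROUP BY order_date
"""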

configure_query_budget_limit

configure_query_budget_limit(limit)

Inject a configured per-query byte limit (e.g. from budgets.yml).

Source code in src/fastflowtransform/executors/base.py
def configure_query_budget_limit(self, limit: int | None) -> None:
    """
    Inject a configured per-query byte limit (e.g. from budgets.yml).
    """
    if limit is None:
        self._ff_configured_query_limit = None
        return
    try:
        iv = int(limit)
    except Exception:
        self._ff_configured_query_limit = None
        return
    self._ff_configured_query_limit = iv if iv > 0 else None

reset_node_stats

reset_node_stats()

Reset per-node statistics buffer.

The run engine calls this before executing a model so that all stats recorded via _record_query_stats(...) belong to that node.

Source code in src/fastflowtransform/executors/base.py
def reset_node_stats(self) -> None:
    """
    Reset per-node statistics buffer.

    The run engine calls this before executing a model so that all
    stats recorded via `_record_query_stats(...)` belong to that node.
    """
    # just clear the buffer; next recording will re-create it
    self._ff_query_stats_buffer = []

get_node_stats

get_node_stats()

Aggregate buffered QueryStats into a simple dict:

{
  "bytes_scanned": <sum>,
  "rows": <sum>,
  "query_duration_ms": <sum>,
}

Called by the run engine after a node finishes.

Source code in src/fastflowtransform/executors/base.py
def get_node_stats(self) -> dict[str, int]:
    """
    Aggregate buffered QueryStats into a simple dict:

        {
          "bytes_scanned": <sum>,
          "rows": <sum>,
          "query_duration_ms": <sum>,
        }

    Called by the run engine after a node finishes.
    """
    stats_list = self._drain_query_stats()
    if not stats_list:
        return {}

    total_bytes = 0
    total_rows = 0
    total_duration = 0

    for s in stats_list:
        if s.bytes_processed is not None:
            total_bytes += int(s.bytes_processed)
        if s.rows is not None:
            total_rows += int(s.rows)
        if s.duration_ms is not None:
            total_duration += int(s.duration_ms)

    return {
        "bytes_scanned": total_bytes,
        "rows": total_rows,
        "query_duration_ms": total_duration,
    }
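
The intended call pattern from the run engine, as a sketch; the node execution itself is elided and the numbers are illustrative.

executor.reset_node_stats()      # clear the per-node buffer before the model runs
# ... execute the node's model (run_sql / run_python) ...
stats = executor.get_node_stats()
# e.g. {"bytes_scanned": 123456, "rows": 42, "query_duration_ms": 87}
# An empty dict means no QueryStats were recorded for this node.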

run_python

run_python(node)

Execute the Python model for a given node and materialize its result.

Source code in src/fastflowtransform/executors/base.py
def run_python(self, node: Node) -> None:
    """Execute the Python model for a given node and materialize its result."""
    func = REGISTRY.py_funcs[node.name]
    deps = REGISTRY.nodes[node.name].deps or []

    self._reset_http_ctx(node)

    args, argmap = self._build_python_inputs(node, deps)
    requires = REGISTRY.py_requires.get(node.name, {})
    if deps:
        # Required-columns check works against the mapping
        self._validate_required(node.name, argmap, requires)

    # out = self._execute_python_func(func, arg, node)
    out = self._execute_python_func(func, args, node)

    target = relation_for(node.name)
    meta = getattr(node, "meta", {}) or {}
    mat = self._resolve_materialization_strategy(meta)

    # ---------- Runtime contracts for Python models ----------
    runtime = getattr(self, "runtime_contracts", None)
    ctx = None
    took_over = False

    if runtime is not None:
        contracts = getattr(self, "_ff_contracts", {}) or {}
        project_contracts = getattr(self, "_ff_project_contracts", None)

        logical = target  # usually relation_for(node.name)
        contract = contracts.get(logical) or contracts.get(node.name)

        if contract is not None or project_contracts is not None:
            physical_table = self._format_relation_for_ref(node.name)
            ctx = runtime.build_context(
                node=node,
                relation=logical,
                physical_table=physical_table,
                contract=contract,
                project_contracts=project_contracts,
                is_incremental=(mat == "incremental"),
            )

            # Optional pre-coercion (default is no-op).
            if hasattr(runtime, "coerce_frame_schema"):
                out = runtime.coerce_frame_schema(out, ctx)

            # Allow engine-specific runtime to take over Python materialization
            if mat == "table" and hasattr(runtime, "materialize_python"):
                took_over = bool(runtime.materialize_python(ctx=ctx, df=out))

    # ---------- Materialization ----------
    if not took_over:
        if mat == "incremental":
            self._materialize_incremental(target, out, node, meta)
        elif mat == "view":
            self._materialize_view(target, out, node)
        else:
            self._materialize_relation(target, out, node)

    if ctx is not None and runtime is not None:
        runtime.verify_after_materialization(ctx=ctx)

    self._snapshot_http_ctx(node)

utest_load_relation_from_rows

utest_load_relation_from_rows(relation, rows)

Load test input rows into a physical relation for unit tests.

Default: not implemented. Engines that support fft utest should override.

Source code in src/fastflowtransform/executors/base.py
def utest_load_relation_from_rows(self, relation: str, rows: list[dict]) -> None:
    """
    Load test input rows into a physical relation for unit tests.

    Default: not implemented. Engines that support `fft utest` should override.
    """
    raise NotImplementedError(
        f"utest_load_relation_from_rows not implemented for engine '{self.engine_name}'."
    )

utest_read_relation

utest_read_relation(relation)

Read a physical relation into a pandas.DataFrame for unit-test assertions.

Default: not implemented. Engines that support fft utest should override.

Source code in src/fastflowtransform/executors/base.py
def utest_read_relation(self, relation: str) -> _PDDataFrame:
    """
    Read a physical relation into a pandas.DataFrame for unit-test assertions.

    Default: not implemented. Engines that support `fft utest` should override.
    """
    raise NotImplementedError(
        f"utest_read_relation not implemented for engine '{self.engine_name}'."
    )

utest_clean_target

utest_clean_target(relation)

Best-effort cleanup hook before executing a unit-test model:

  • Drop tables/views with the target name so view<->table flips cannot fail (DuckDB, Postgres, ...).
  • This runs only in fft utest, and we already enforce that utest profiles use isolated DBs/schemas.

Default: no-op.

Source code in src/fastflowtransform/executors/base.py
def utest_clean_target(self, relation: str) -> None:
    """
    Best-effort cleanup hook before executing a unit-test model:

    - Drop tables/views with the target name so view<->table flips
      cannot fail (DuckDB, Postgres, ...).
    - This runs *only* in `fft utest`, and we already enforce that
      utest profiles use isolated DBs/schemas.

    Default: no-op.
    """
    return

normalize_physical_type

normalize_physical_type(t)

Canonicalize a physical type string for comparisons (DQ + contracts).

Default: just strip + lower. Engines may override to account for dialect quirks in information_schema (e.g. Postgres timestamp variants, Snowflake VARCHAR(…) / NUMBER(…)).

Source code in src/fastflowtransform/executors/base.py
def normalize_physical_type(self, t: str | None) -> str:
    """
    Canonicalize a physical type string for comparisons (DQ + contracts).

    Default: just strip + lower.
    Engines may override to account for dialect quirks in information_schema
    (e.g. Postgres timestamp variants, Snowflake VARCHAR(…) / NUMBER(…)).
    """
    return (t or "").strip().lower()
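
A minimal sketch of the default behavior and of the kind of override the docstring hints at; the dialect mapping mentioned in the comment is an assumption, not a shipped implementation.

executor.normalize_physical_type("  TIMESTAMP ")   # -> "timestamp"
executor.normalize_physical_type(None)             # -> ""

# Hypothetical engine-specific override (illustrative only): collapse
# dialect variants such as "timestamp without time zone" to a single
# canonical "timestamp" before comparing physical types.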