fastflowtransform.executors.duckdb¶

DuckExecutor ¶

Bases: SqlIdentifierMixin, BaseExecutor[DataFrame]

Source code in src/fastflowtransform/executors/duckdb.py

class DuckExecutor(SqlIdentifierMixin, BaseExecutor[pd.DataFrame]):
    ENGINE_NAME: str = "duckdb"
    runtime_contracts: DuckRuntimeContracts
    runtime_query_stats: DuckQueryStatsRuntime
    runtime_budget: DuckBudgetRuntime
    snapshot_runtime: DuckSnapshotRuntime

    def __init__(
        self, db_path: str = ":memory:", schema: str | None = None, catalog: str | None = None
    ):
        """
        Open the DuckDB connection and set up catalog, schema and runtime helpers.

        Args:
            db_path: Database target passed to ``duckdb.connect``. For values other
                than ``":memory:"`` that do not contain ``"://"``, the parent
                directory is created on a best-effort basis.
            schema: Optional default schema. Blank or non-string values are treated
                as unset; otherwise the schema is created if missing and selected.
            catalog: Optional catalog alias. Only attempted when ``db_path`` is not
                ``":memory:"`` and contains no ``"://"``; if the override fails the
                detected catalog is used instead.
        """
        if db_path and db_path != ":memory:" and "://" not in db_path:
            # Best effort only: a failure here surfaces later from duckdb.connect().
            with suppress(Exception):
                Path(db_path).parent.mkdir(parents=True, exist_ok=True)
        self.db_path = db_path
        self.con = duckdb.connect(db_path)
        self.schema = schema.strip() if isinstance(schema, str) and schema.strip() else None
        catalog_override = catalog.strip() if isinstance(catalog, str) and catalog.strip() else None

        # If a catalog override is provided, connect in-memory and attach the file once
        # under the requested alias to avoid DuckDB's auto-attached filename catalog.
        self.catalog: str | None = None
        if catalog_override and db_path != ":memory:" and "://" not in db_path:
            connected = self._connect_with_catalog_override(catalog_override)
            if connected:
                self.catalog = catalog_override
            else:
                self.catalog = self._detect_catalog()
        else:
            self.catalog = self._detect_catalog()

        # Runtime helpers receive the executor itself; they are created only after
        # the connection and catalog have been settled above.
        self.runtime_query_stats = DuckQueryStatsRuntime(self)
        self.runtime_budget = DuckBudgetRuntime(self)
        self.runtime_contracts = DuckRuntimeContracts(self)
        self.snapshot_runtime = DuckSnapshotRuntime(self)

        if self.schema:
            safe_schema = _q_ident(self.schema)
            self._execute_basic(f"create schema if not exists {safe_schema}")
            # Double embedded single quotes so the name stays a single SQL string literal.
            schema_literal = self.schema.replace("'", "''")
            self._execute_basic(f"set schema '{schema_literal}'")

    def _execute_basic(self, sql: str, params: Any | None = None) -> duckdb.DuckDBPyConnection:
        """
        Minimal helper to execute a statement and return the DuckDB cursor.
        Centralises raw connection use for test + runtime helpers.
        """
        return self.con.execute(sql, params) if params is not None else self.con.execute(sql)

    def _execute_fetchall(self, sql: str, params: Any | None = None) -> list[Any]:
        """
        Helper for runtimes that need full result sets without exposing cursors.
        """
        res = self._execute_basic(sql, params)
        fetchall = getattr(res, "fetchall", None)
        return list(cast(Iterable[Any], fetchall())) if callable(fetchall) else []

    def execute_test_sql(self, stmt: Any) -> Any:
        """
        Run data-quality test SQL on the raw connection and wrap the result
        with ``make_fetchable``.

        ``stmt`` may be a SQL string, a ``(sql, params_dict)`` tuple, or an
        iterable of such items; for an iterable, the result of the last item
        is the one that gets wrapped.
        """
        pair_len = 2

        def _dispatch(item: Any) -> Any:
            is_param_pair = (
                isinstance(item, tuple)
                and len(item) == pair_len
                and isinstance(item[0], str)
                and isinstance(item[1], dict)
            )
            if is_param_pair:
                return self._execute_basic(item[0], item[1])
            if isinstance(item, str):
                return self._execute_basic(item)
            # Anything that is not a genuine container is stringified and executed.
            if isinstance(item, (bytes, bytearray)) or not isinstance(item, Iterable):
                return self._execute_basic(str(item))
            last = None
            for sub in item:
                last = _dispatch(sub)
            return last

        return make_fetchable(_dispatch(stmt))

    def compute_freshness_delay_minutes(self, table: str, ts_col: str) -> tuple[float | None, str]:
        """
        Return ``(delay_minutes, sql)``: the minutes between ``now()`` and
        ``max(ts_col)`` of ``table``, together with the SQL text used.
        The delay is ``None`` when the scalar query yields ``None``.
        """
        query = (
            "select date_part('epoch', cast(now() as timestamp) - "
            f"max({ts_col})) / 60.0 as delay_min from {table}"
        )
        minutes = _scalar(self, query)
        if minutes is None:
            return (None, query)
        return (float(minutes), query)

    def _execute_sql(self, sql: str, *args: Any, **kwargs: Any) -> duckdb.DuckDBPyConnection:
        """
        Central DuckDB SQL runner.

        All model-driven SQL in this executor should go through here.
        The cost guard may call _estimate_query_bytes(sql) before executing.
        This wrapper also records simple per-query stats for run_results.json.
        """

        def _exec() -> duckdb.DuckDBPyConnection:
            return self.con.execute(sql, *args, **kwargs)

        def _rows(result: Any) -> int | None:
            return self.runtime_query_stats.rowcount_from_result(result)

        return self.runtime_budget.run_sql(
            sql,
            exec_fn=_exec,
            stats_runtime=self.runtime_query_stats,
            rowcount_extractor=_rows,
        )

    def _connect_with_catalog_override(self, alias: str) -> bool:
        """
        Recreate the connection in-memory and attach the target DB under the
        requested catalog alias, so DuckDB does not auto-name it after the file.

        Args:
            alias: Catalog name to attach ``self.db_path`` under.

        Returns:
            ``True`` when the attach and catalog switch succeeded. ``False`` when
            any step failed; in that case ``self.con`` is replaced by a fresh
            direct connection to ``self.db_path``.
        """
        try:
            resolved = str(Path(self.db_path).resolve())
            # Double single quotes so a path/alias containing ' remains one SQL literal.
            path_literal = resolved.replace("'", "''")
            alias_literal = alias.replace("'", "''")
            # Close the connection being replaced explicitly instead of relying on
            # garbage collection to release its handle on the database file.
            with suppress(Exception):
                self.con.close()
            # New in-memory connection, then attach the file once under the alias.
            self.con = duckdb.connect()
            self._execute_basic(
                f"attach database '{path_literal}' as {_q_ident(alias)} (READ_ONLY FALSE)"
            )
            self._execute_basic(f"set catalog '{alias_literal}'")
            return True
        except Exception:
            # Drop whatever connection is current and reconnect directly to the
            # file; the caller falls back to catalog detection.
            with suppress(Exception):
                self.con.close()
            self.con = duckdb.connect(self.db_path)
            return False

    def _detect_catalog(self) -> str | None:
        rows = self._execute_basic("PRAGMA database_list").fetchall()
        if rows:
            return str(rows[0][1])
        return None

    def _apply_catalog_override(self, name: str) -> bool:
        alias = name.strip()
        if not alias:
            return False
        try:
            if self.db_path != ":memory:":
                resolved = str(Path(self.db_path).resolve())
                with suppress(Exception):
                    self._execute_basic(f"detach database {_q_ident(alias)}")
                self._execute_basic(
                    f"attach database '{resolved}' as {_q_ident(alias)} (READ_ONLY FALSE)"
                )
            self._execute_basic(f"set catalog '{alias}'")
            return True
        except Exception:
            return False

    def clone(self) -> DuckExecutor:
        """
        Build a fresh ``DuckExecutor`` with its own connection, using this
        executor's ``db_path``, ``schema`` and ``catalog``.

        If this executor carries contracts and/or project contracts, they are
        passed on to the new instance via ``configure_contracts``.
        """
        worker = DuckExecutor(self.db_path, schema=self.schema, catalog=self.catalog)

        parent_contracts = getattr(self, "_ff_contracts", None)
        parent_project_contracts = getattr(self, "_ff_project_contracts", None)
        if parent_contracts is None and parent_project_contracts is None:
            # Nothing configured on the parent; the clone stays unconfigured too.
            return worker

        worker.configure_contracts(parent_contracts or {}, parent_project_contracts)
        return worker

    def _exec_many(self, sql: str) -> None:
        """
        Execute multiple SQL statements separated by ';' on the same connection.
        DuckDB normally accepts one statement per execute(), so we split here.
        """
        for stmt in (part.strip() for part in sql.split(";")):
            if not stmt:
                continue
            self._execute_sql(stmt)

    # ---- Frame hooks ----
    def _quote_identifier(self, ident: str) -> str:
        """Quote ``ident`` by delegating to the module-level ``_q_ident`` helper."""
        quoted = _q_ident(ident)
        return quoted

    def _should_include_catalog(
        self, catalog: str | None, schema: str | None, *, explicit: bool
    ) -> bool:
        """
        DuckDB includes catalog only when explicitly provided or when it matches
        the schema (mirrors previous behaviour).
        """
        if explicit:
            return bool(catalog)
        return bool(catalog and schema and catalog.lower() == schema.lower())

    def _default_catalog_for_source(self, schema: str | None) -> str | None:
        """
        For sources, fall back to DuckDB's detected catalog when:
        - schema is set and matches the catalog, or
        - neither schema nor catalog was provided (keep old fallback)
        """
        cat = self._default_catalog()
        if not cat:
            return None
        if schema is None or cat.lower() == schema.lower():
            return cat
        return None

    def _qualified(self, relation: str, *, quoted: bool = True) -> str:
        """
        Return (catalog.)schema.relation if schema is set; otherwise just relation.
        When quoted=False, emit bare identifiers for APIs like con.table().
        """
        return self._format_identifier(relation, purpose="physical", quote=quoted)

    def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> pd.DataFrame:
        try:
            target = self._qualified(relation, quoted=False)
            return self.con.table(target).df()
        except CatalogException as e:
            existing = [
                r[0]
                for r in self._execute_basic(
                    "select table_name from information_schema.tables "
                    "where table_schema in ('main','temp')"
                ).fetchall()
            ]
            raise RuntimeError(
                f"Dependency table not found: '{relation}'\n"
                f"Deps: {list(deps)}\nExisting tables: {existing}\n"
                "Note: Use same File-DB/Connection for Seeding & Run."
            ) from e

    def _materialize_relation(self, relation: str, df: pd.DataFrame, node: Node) -> None:
        tmp = "_ff_py_out"
        try:
            self.con.register(tmp, df)
            target = self._qualified(relation)
            self._execute_sql(f'create or replace table {target} as select * from "{tmp}"')
        finally:
            try:
                self.con.unregister(tmp)
            except Exception:
                # housekeeping only; stats here are not important but harmless if recorded
                self._execute_basic(f'drop view if exists "{tmp}"')

    def _create_or_replace_view_from_table(
        self, view_name: str, backing_table: str, node: Node
    ) -> None:
        view_target = self._qualified(view_name)
        backing = self._qualified(backing_table)
        self._execute_sql(f"create or replace view {view_target} as select * from {backing}")

    def _frame_name(self) -> str:
        return "pandas"

    # ---- SQL hooks ----
    def _create_or_replace_view(self, target_sql: str, select_body: str, node: Node) -> None:
        self._execute_sql(f"create or replace view {target_sql} as {select_body}")

    def _create_or_replace_table(self, target_sql: str, select_body: str, node: Node) -> None:
        self._execute_sql(f"create or replace table {target_sql} as {select_body}")

    # ---- Meta hook ----
    def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None:
        """
        Record a built node: make sure the meta table exists, then upsert the
        node's row (name, relation, fingerprint) tagged with the "duckdb" engine.
        """
        ensure_meta_table(self)
        engine = "duckdb"
        upsert_meta(self, node.name, relation, fingerprint, engine)

    # ── Incremental API ────────────────────────────────────────────────────
    def exists_relation(self, relation: str) -> bool:
        where_tables: list[str] = ["lower(table_name) = lower(?)"]
        params: list[str] = [relation]
        if self.catalog:
            where_tables.append("lower(table_catalog) = lower(?)")
            params.append(self.catalog)
        if self.schema:
            where_tables.append("lower(table_schema) = lower(?)")
            params.append(self.schema)
        else:
            where_tables.append("table_schema in ('main','temp')")
        where = " AND ".join(where_tables)
        sql_tables = f"select 1 from information_schema.tables where {where} limit 1"
        if self._execute_basic(sql_tables, params).fetchone():
            return True
        sql_views = f"select 1 from information_schema.views where {where} limit 1"
        return bool(self._execute_basic(sql_views, params).fetchone())

    def create_table_as(self, relation: str, select_sql: str) -> None:
        # Use only the SELECT body and strip trailing semicolons for safety.
        body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
        self._execute_sql(f"create table {self._qualified(relation)} as {body}")

    def incremental_insert(self, relation: str, select_sql: str) -> None:
        # Ensure the inner SELECT is clean (no trailing semicolon; SELECT body only).
        body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
        self._execute_sql(f"insert into {self._qualified(relation)} {body}")

    def incremental_merge(self, relation: str, select_sql: str, unique_key: list[str]) -> None:
        """
        Fallback strategy for DuckDB:
        - DELETE collisions via DELETE ... USING (<select>) s
        - INSERT all rows via INSERT ... SELECT * FROM (<select>)
        """
        # 1) clean inner SELECT
        body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")

        # 2) predicate for DELETE
        keys_pred = " AND ".join([f"t.{k}=s.{k}" for k in unique_key]) or "FALSE"

        # 3) first: delete collisions
        delete_sql = f"delete from {self._qualified(relation)} t using ({body}) s where {keys_pred}"
        self._execute_sql(delete_sql)

        # 4) then: insert fresh rows
        insert_sql = f"insert into {self._qualified(relation)} select * from ({body}) src"
        self._execute_sql(insert_sql)

    def alter_table_sync_schema(
        self, relation: str, select_sql: str, *, mode: str = "append_new_columns"
    ) -> None:
        """
        Best-effort: add new columns with inferred type.
        """
        # Probe: empty projection from the SELECT (cleaned to avoid parser issues).
        body = self._first_select_body(select_sql).strip().rstrip(";\n\t ")
        probe = self._execute_basic(f"select * from ({body}) as q limit 0")
        cols = [c[0] for c in probe.description or []]
        existing = {
            r[0]
            for r in self._execute_basic(
                "select column_name from information_schema.columns "
                + "where lower(table_name)=lower(?)"
                + (" and lower(table_schema)=lower(?)" if self.schema else ""),
                ([relation, self.schema] if self.schema else [relation]),
            ).fetchall()
        }
        add = [c for c in cols if c not in existing]
        for c in add:
            col = _q_ident(c)
            target = self._qualified(relation)
            try:
                self._execute_basic(f"alter table {target} add column {col} varchar")
            except Exception:
                self._execute_basic(f"alter table {target} add column {col} varchar")

    def execute_hook_sql(self, sql: str) -> None:
        """
        Execute one or multiple SQL statements for pre/post/on_run hooks.

        Accepts a string that may contain ';'-separated statements.
        """
        self._exec_many(sql)

    # ---- Snapshot runtime delegation ----
    def run_snapshot_sql(self, node: Node, env: Environment) -> None:
        """
        Delegate snapshot materialization to the DuckDB snapshot runtime.
        """
        self.snapshot_runtime.run_snapshot_sql(node, env)

    def snapshot_prune(
        self,
        relation: str,
        unique_key: list[str],
        keep_last: int,
        *,
        dry_run: bool = False,
    ) -> None:
        self.snapshot_runtime.snapshot_prune(
            relation,
            unique_key,
            keep_last,
            dry_run=dry_run,
        )

        # ---- Unit-test helpers -------------------------------------------------

    def utest_load_relation_from_rows(self, relation: str, rows: list[dict]) -> None:
        """
        Load rows into a DuckDB table for unit tests, fully qualified to
        this executor's schema/catalog.
        """
        df = pd.DataFrame(rows)
        tmp = f"_ff_utest_tmp_{uuid.uuid4().hex[:12]}"
        self.con.register(tmp, df)
        try:
            target = self._qualified(relation)
            self._execute_basic(f"create or replace table {target} as select * from {tmp}")
        finally:
            with suppress(Exception):
                self.con.unregister(tmp)
            # Fallback for older DuckDB where unregister might not exist
            with suppress(Exception):
                self._execute_basic(f'drop view if exists "{tmp}"')

    def utest_read_relation(self, relation: str) -> pd.DataFrame:
        """
        Read a relation as a DataFrame for unit-test assertions.
        """
        target = self._qualified(relation, quoted=False)
        return self.con.table(target).df()

    def utest_clean_target(self, relation: str) -> None:
        """
        Drop any table/view with the given name in this schema/catalog.
        Safe because utest uses its own DB/path.
        """
        target = self._qualified(relation)
        # best-effort; ignore failures
        with suppress(Exception):
            self._execute_basic(f"drop view if exists {target}")
        with suppress(Exception):
            self._execute_basic(f"drop table if exists {target}")

    def collect_docs_columns(self) -> dict[str, list[ColumnInfo]]:
        """
        Best-effort column metadata for docs (schema-aware, supports catalog).

        Returns:
            Mapping of table name to its columns in ordinal order. An empty
            dict is returned when the metadata query fails.
        """
        where: list[str] = []
        params: list[str] = []

        if self.catalog:
            where.append("lower(table_catalog) = lower(?)")
            params.append(self.catalog)
        if self.schema:
            where.append("lower(table_schema) = lower(?)")
            params.append(self.schema)
        else:
            where.append("table_schema in ('main','temp')")

        # `where` always holds at least the schema condition added above.
        where_sql = " AND ".join(where)
        sql = f"""
        select table_name, column_name, data_type, is_nullable
        from information_schema.columns
        where {where_sql}
        order by table_schema, table_name, ordinal_position
        """

        try:
            rows = self._execute_basic(sql, params or None).fetchall()
        except Exception:
            return {}

        def _is_nullable(flag: Any) -> bool:
            # Accept a real boolean as well as the textual YES/NO form in any casing.
            if isinstance(flag, bool):
                return flag
            return str(flag).strip().upper() == "YES"

        out: dict[str, list[ColumnInfo]] = {}
        for table, col, dtype, nullable in rows:
            out.setdefault(table, []).append(ColumnInfo(col, str(dtype), _is_nullable(nullable)))
        return out

    def _introspect_columns_metadata(
        self,
        table: str,
        column: str | None = None,
    ) -> list[tuple[str, str]]:
        """
        Internal helper: return [(column_name, data_type), ...] for a DuckDB table.

        - Uses _normalize_table_identifier / _normalize_column_identifier
        - Works with or without schema qualification
        - Optionally restricts to a single column
        """
        schema, table_name = self._normalize_table_identifier(table)

        table_lower = table_name.lower()
        params: list[str] = [table_lower]

        where_clauses: list[str] = ["lower(table_name) = lower(?)"]

        if schema:
            where_clauses.append("lower(table_schema) = lower(?)")
            params.append(schema.lower())

        if column is not None:
            column_lower = self._normalize_column_identifier(column).lower()
            where_clauses.append("lower(column_name) = lower(?)")
            params.append(column_lower)

        where_sql = " AND ".join(where_clauses)

        sql = (
            "select column_name, data_type "
            "from information_schema.columns "
            f"where {where_sql} "
            "order by table_schema, ordinal_position"
        )

        rows = self._execute_basic(sql, params).fetchall()

        # Normalize to plain strings
        return [(str(name), str(dtype)) for (name, dtype) in rows]

    def introspect_column_physical_type(self, table: str, column: str) -> str | None:
        """
        DuckDB: read `data_type` from information_schema.columns for a single column.
        """
        rows = self._introspect_columns_metadata(table, column=column)
        # rows: [(column_name, data_type), ...]
        return rows[0][1] if rows else None

    def introspect_table_physical_schema(self, table: str) -> dict[str, str]:
        """
        DuckDB: return {column_name: data_type} for all columns of `table`.
        """
        rows = self._introspect_columns_metadata(table, column=None)
        return {name: dtype for (name, dtype) in rows}

    def load_seed(
        self, table: str, df: pd.DataFrame, schema: str | None = None
    ) -> tuple[bool, str, bool]:
        target_schema = schema or self.schema
        created_schema = False

        # Qualify identifier with optional schema/catalog
        qualified = self._qualify_identifier(table, schema=target_schema, catalog=self.catalog)

        if target_schema and "." not in table:
            safe_schema = _q_ident(target_schema)
            self._execute_sql(f"create schema if not exists {safe_schema}")
            created_schema = True

        tmp = f"_ff_seed_{uuid.uuid4().hex[:8]}"
        self.con.register(tmp, df)
        try:
            self._execute_sql(f'create or replace table {qualified} as select * from "{tmp}"')
        finally:
            with suppress(Exception):
                self.con.unregister(tmp)
            with suppress(Exception):
                self._execute_basic(f'drop view if exists "{tmp}"')

        return True, qualified, created_schema

execute_test_sql ¶

execute_test_sql(stmt)

Execute lightweight SQL for DQ tests using the underlying DuckDB connection.

Source code in src/fastflowtransform/executors/duckdb.py

def execute_test_sql(self, stmt: Any) -> Any:
    """
    Run data-quality test SQL on the raw connection and wrap the result
    with ``make_fetchable``.

    ``stmt`` may be a SQL string, a ``(sql, params_dict)`` tuple, or an
    iterable of such items; for an iterable, the result of the last item
    is the one that gets wrapped.
    """
    pair_len = 2

    def _dispatch(item: Any) -> Any:
        is_param_pair = (
            isinstance(item, tuple)
            and len(item) == pair_len
            and isinstance(item[0], str)
            and isinstance(item[1], dict)
        )
        if is_param_pair:
            return self._execute_basic(item[0], item[1])
        if isinstance(item, str):
            return self._execute_basic(item)
        # Anything that is not a genuine container is stringified and executed.
        if isinstance(item, (bytes, bytearray)) or not isinstance(item, Iterable):
            return self._execute_basic(str(item))
        last = None
        for sub in item:
            last = _dispatch(sub)
        return last

    return make_fetchable(_dispatch(stmt))

clone ¶

clone()

Creates a new executor instance with its own connection, intended for use by a worker thread. Copies the runtime-contract configuration from the parent.

Source code in src/fastflowtransform/executors/duckdb.py

def clone(self) -> DuckExecutor:
    """
    Build a fresh ``DuckExecutor`` with its own connection, using this
    executor's ``db_path``, ``schema`` and ``catalog``.

    If this executor carries contracts and/or project contracts, they are
    passed on to the new instance via ``configure_contracts``.
    """
    worker = DuckExecutor(self.db_path, schema=self.schema, catalog=self.catalog)

    parent_contracts = getattr(self, "_ff_contracts", None)
    parent_project_contracts = getattr(self, "_ff_project_contracts", None)
    if parent_contracts is None and parent_project_contracts is None:
        # Nothing configured on the parent; the clone stays unconfigured too.
        return worker

    worker.configure_contracts(parent_contracts or {}, parent_project_contracts)
    return worker

on_node_built ¶

on_node_built(node, relation, fingerprint)

After successful materialization, ensure the meta table exists and upsert the row.

Source code in src/fastflowtransform/executors/duckdb.py

def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None:
    """
    Record a built node: make sure the meta table exists, then upsert the
    node's row (name, relation, fingerprint) tagged with the "duckdb" engine.
    """
    ensure_meta_table(self)
    engine = "duckdb"
    upsert_meta(self, node.name, relation, fingerprint, engine)

incremental_merge ¶

incremental_merge(relation, select_sql, unique_key)

Fallback strategy for DuckDB: first DELETE colliding rows via DELETE ... USING (<select>) s, then INSERT all rows via INSERT ... SELECT * FROM (<select>).

Source code in src/fastflowtransform/executors/duckdb.py

def incremental_merge(self, relation: str, select_sql: str, unique_key: list[str]) -> None:
    """
    Merge emulation for DuckDB in two statements:

    1. delete target rows whose ``unique_key`` columns equal those of a
       row produced by the select,
    2. insert every row produced by the select.

    With an empty ``unique_key`` the delete predicate is ``FALSE``.
    """
    source_sql = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
    target = self._qualified(relation)

    key_conditions = [f"t.{col}=s.{col}" for col in unique_key]
    match_pred = " AND ".join(key_conditions) if key_conditions else "FALSE"

    # Remove colliding rows first so the insert below cannot duplicate keys.
    self._execute_sql(f"delete from {target} t using ({source_sql}) s where {match_pred}")
    self._execute_sql(f"insert into {target} select * from ({source_sql}) src")

alter_table_sync_schema ¶

alter_table_sync_schema(relation, select_sql, *, mode='append_new_columns')

Best-effort: add new columns with inferred type.

Source code in src/fastflowtransform/executors/duckdb.py

def alter_table_sync_schema(
    self, relation: str, select_sql: str, *, mode: str = "append_new_columns"
) -> None:
    """
    Best-effort schema sync: add columns that the select produces but the
    target table lacks.

    New columns are always added as VARCHAR. Column names are compared
    case-insensitively so a column differing only in case is not re-added.

    Args:
        relation: Target table name.
        select_sql: Model SQL; only its first select body is probed.
        mode: Accepted for interface compatibility; not consulted here.
    """
    # Probe: empty projection from the SELECT (cleaned to avoid parser issues).
    body = self._first_select_body(select_sql).strip().rstrip(";\n\t ")
    probe = self._execute_basic(f"select * from ({body}) as q limit 0")
    cols = [c[0] for c in probe.description or []]

    existing_sql = (
        "select column_name from information_schema.columns "
        + "where lower(table_name)=lower(?)"
        + (" and lower(table_schema)=lower(?)" if self.schema else "")
    )
    existing_params = [relation, self.schema] if self.schema else [relation]
    existing = {
        str(r[0]).lower() for r in self._execute_basic(existing_sql, existing_params).fetchall()
    }

    missing = [c for c in cols if str(c).lower() not in existing]
    if not missing:
        return
    target = self._qualified(relation)
    for c in missing:
        self._execute_basic(f"alter table {target} add column {_q_ident(c)} varchar")

execute_hook_sql ¶

execute_hook_sql(sql)

Execute one or multiple SQL statements for pre/post/on_run hooks.

Accepts a string that may contain ';'-separated statements.

Source code in src/fastflowtransform/executors/duckdb.py

def execute_hook_sql(self, sql: str) -> None:
    """
    Run the SQL of a pre/post/on_run hook.

    ``sql`` may hold several ';'-separated statements; splitting and
    execution are delegated to ``_exec_many``.
    """
    hook_sql = sql
    self._exec_many(hook_sql)

run_snapshot_sql ¶

run_snapshot_sql(node, env)

Delegate snapshot materialization to the DuckDB snapshot runtime.

Source code in src/fastflowtransform/executors/duckdb.py

def run_snapshot_sql(self, node: Node, env: Environment) -> None:
    """Hand snapshot materialization of ``node`` over to ``snapshot_runtime``."""
    runtime = self.snapshot_runtime
    runtime.run_snapshot_sql(node, env)

utest_load_relation_from_rows ¶

utest_load_relation_from_rows(relation, rows)

Load rows into a DuckDB table for unit tests, fully qualified to this executor's schema/catalog.

Source code in src/fastflowtransform/executors/duckdb.py

def utest_load_relation_from_rows(self, relation: str, rows: list[dict]) -> None:
    """
    Unit-test helper: build a DataFrame from ``rows`` and store it as the
    table ``relation`` (create or replace), qualified via ``_qualified``.

    The frame is registered under a random temporary name that is removed
    again afterwards on a best-effort basis.
    """
    frame = pd.DataFrame(rows)
    view_name = f"_ff_utest_tmp_{uuid.uuid4().hex[:12]}"
    self.con.register(view_name, frame)
    try:
        qualified = self._qualified(relation)
        ddl = f"create or replace table {qualified} as select * from {view_name}"
        self._execute_basic(ddl)
    finally:
        # Both cleanup steps always run; each one ignores its own failure.
        cleanup_steps = (
            lambda: self.con.unregister(view_name),
            lambda: self._execute_basic(f'drop view if exists "{view_name}"'),
        )
        for step in cleanup_steps:
            with suppress(Exception):
                step()

utest_read_relation ¶

utest_read_relation(relation)

Read a relation as a DataFrame for unit-test assertions.

Source code in src/fastflowtransform/executors/duckdb.py

def utest_read_relation(self, relation: str) -> pd.DataFrame:
    """Unit-test helper: fetch ``relation`` from the connection as a DataFrame."""
    unquoted = self._qualified(relation, quoted=False)
    table = self.con.table(unquoted)
    return table.df()

utest_clean_target ¶

utest_clean_target(relation)

Drop any table/view with the given name in this schema/catalog. Safe because utest uses its own DB/path.

Source code in src/fastflowtransform/executors/duckdb.py

def utest_clean_target(self, relation: str) -> None:
    """
    Unit-test helper: drop the view and then the table named ``relation``
    if they exist. Failures of either drop are ignored.
    """
    qualified = self._qualified(relation)
    for kind in ("view", "table"):
        with suppress(Exception):
            self._execute_basic(f"drop {kind} if exists {qualified}")

collect_docs_columns ¶

collect_docs_columns()

Best-effort column metadata for docs (schema-aware, supports catalog).

Source code in src/fastflowtransform/executors/duckdb.py

def collect_docs_columns(self) -> dict[str, list[ColumnInfo]]:
    """
    Best-effort column metadata for docs (schema-aware, supports catalog).

    Returns:
        Mapping of table name to its columns in ordinal order. An empty
        dict is returned when the metadata query fails.
    """
    where: list[str] = []
    params: list[str] = []

    if self.catalog:
        where.append("lower(table_catalog) = lower(?)")
        params.append(self.catalog)
    if self.schema:
        where.append("lower(table_schema) = lower(?)")
        params.append(self.schema)
    else:
        where.append("table_schema in ('main','temp')")

    # `where` always holds at least the schema condition added above.
    where_sql = " AND ".join(where)
    sql = f"""
    select table_name, column_name, data_type, is_nullable
    from information_schema.columns
    where {where_sql}
    order by table_schema, table_name, ordinal_position
    """

    try:
        rows = self._execute_basic(sql, params or None).fetchall()
    except Exception:
        return {}

    def _is_nullable(flag: Any) -> bool:
        # Accept a real boolean as well as the textual YES/NO form in any casing.
        if isinstance(flag, bool):
            return flag
        return str(flag).strip().upper() == "YES"

    out: dict[str, list[ColumnInfo]] = {}
    for table, col, dtype, nullable in rows:
        out.setdefault(table, []).append(ColumnInfo(col, str(dtype), _is_nullable(nullable)))
    return out

introspect_column_physical_type ¶

introspect_column_physical_type(table, column)

DuckDB: read data_type from information_schema.columns for a single column.

Source code in src/fastflowtransform/executors/duckdb.py

def introspect_column_physical_type(self, table: str, column: str) -> str | None:
    """
    Return the ``data_type`` of ``column`` in ``table``, taken from the
    first matching metadata row, or ``None`` when nothing matches.
    """
    matches = self._introspect_columns_metadata(table, column=column)
    if not matches:
        return None
    first_match = matches[0]
    return first_match[1]

introspect_table_physical_schema ¶

introspect_table_physical_schema(table)

DuckDB: return {column_name: data_type} for all columns of table.

Source code in src/fastflowtransform/executors/duckdb.py

def introspect_table_physical_schema(self, table: str) -> dict[str, str]:
    """Return a ``{column_name: data_type}`` mapping for every column of ``table``."""
    pairs = self._introspect_columns_metadata(table, column=None)
    return dict(pairs)

configure_contracts ¶

configure_contracts(contracts, project_contracts)

Inject parsed contracts into this executor instance. The run engine should call this once at startup.

Source code in src/fastflowtransform/executors/base.py

def configure_contracts(
    self,
    contracts: Mapping[str, ContractsFileModel] | None,
    project_contracts: ProjectContractsModel | None,
) -> None:
    """
    Store the parsed contracts on this executor.

    `contracts` is kept under `_ff_contracts` (a missing or empty mapping is
    replaced by a fresh empty dict) and `project_contracts` is kept as-is
    under `_ff_project_contracts`. Intended to be called once by the run
    engine at startup.
    """
    # Normalize None / empty mapping so later lookups can call .get() safely.
    self._ff_contracts = contracts if contracts else {}
    self._ff_project_contracts = project_contracts

run_sql ¶

run_sql(node, env)

Orchestrate SQL models

1) Render Jinja (ref/source/this) and strip leading {{ config(...) }}. 2) If the SQL is full DDL (CREATE …), execute it verbatim (passthrough). 3) Otherwise, normalize to CREATE OR REPLACE {TABLE|VIEW} AS <body>. The body is CTE-aware (keeps WITH … SELECT … intact).

On failure, raise ModelExecutionError with a helpful snippet.

Source code in src/fastflowtransform/executors/base.py

def run_sql(self, node: Node, env: Environment) -> None:
    """
    Execute a SQL model end to end.

    Flow:
      * Incremental models are handed to the incremental engine and its
        result is returned.
      * Snapshot models are rejected with ModelExecutionError.
      * The model is rendered (ref/source resolvers wired in) and the
        leading {{ config(...) }} is stripped.
      * Ephemeral models stop here; nothing is materialized.
      * SQL that looks like direct DDL (CREATE …) is executed verbatim; if
        the engine raises NotImplementedError for that, the normalized
        path below is used instead.
      * Otherwise the selectable body (CTE-aware) is materialized under the
        model's relation, through the runtime contracts for table
        materializations when such a runtime is present.

    Failures during execution are re-raised as ModelExecutionError carrying
    a SQL snippet.
    """
    node_meta = getattr(node, "meta", {}) or {}

    # Incremental models: render, schema sync, merge/insert all live in the
    # incremental engine.
    if self._meta_is_incremental(node_meta):
        return _ff_incremental.run_or_dispatch(self, node, env)

    # Snapshot models are only runnable through `fft snapshot run`.
    if self._meta_is_snapshot(node_meta):
        raise ModelExecutionError(
            node_name=node.name,
            relation=relation_for(node.name),
            message=(
                "Snapshot models cannot be executed via 'fft run'. "
                "Use 'fft snapshot run' instead."
            ),
            sql_snippet="",
        )

    def _ref(name):
        # ref() needs the Jinja environment, so bind it here.
        return self._resolve_ref(name, env)

    rendered = self.render_sql(
        node,
        env,
        ref_resolver=_ref,
        source_resolver=self._resolve_source,
    )
    sql = self._strip_leading_config(rendered).strip()

    materialization = (node.meta or {}).get("materialized", "table")
    if materialization == "ephemeral":
        return None

    # Path 1: direct DDL passthrough (CREATE [OR REPLACE] {TABLE|VIEW} …)
    if self._looks_like_direct_ddl(sql):
        try:
            self._execute_sql_direct(sql, node)
        except NotImplementedError:
            # Engine has no direct-DDL support; continue with the normalized path.
            pass
        except Exception as exc:
            raise ModelExecutionError(
                node_name=node.name,
                relation=relation_for(node.name),
                message=str(exc),
                sql_snippet=sql,
            ) from exc
        else:
            return None

    # Path 2: normalized materialization (body keeps WITH … SELECT … intact)
    select_body = self._selectable_body(sql).rstrip(" ;\n\t")
    target_relation = self._format_relation_for_ref(node.name)

    # One central SQL preview log line, shared by all engines.
    echo_debug(
        f"=== MATERIALIZE ===\n"
        f"-- model: {node.name}\n"
        f"-- materialized: {materialization}\n"
        f"-- target: {target_relation}\n"
        f"{select_body}\n"
    )

    try:
        contracts_runtime = getattr(self, "runtime_contracts", None)
        # Contracts are only applied to TABLE materializations for now.
        if contracts_runtime is None or materialization != "table":
            # Plain materialization without contract enforcement.
            self._apply_sql_materialization(
                node, target_relation, select_body, materialization
            )
        else:
            known_contracts = getattr(self, "_ff_contracts", {}) or {}
            project_level = getattr(self, "_ff_project_contracts", None)

            # Look the contract up by logical relation name first, then by
            # the raw node name.
            logical_name = relation_for(node.name)
            contract = known_contracts.get(logical_name) or known_contracts.get(node.name)

            ctx = contracts_runtime.build_context(
                node=node,
                relation=logical_name,
                physical_table=target_relation,
                contract=contract,
                project_contracts=project_level,
                is_incremental=self._meta_is_incremental(node_meta),
            )
            # Engine-specific enforcement (verify/cast/off)
            contracts_runtime.apply_sql_contracts(ctx=ctx, select_body=select_body)
    except Exception as exc:
        failure_snippet = (
            f"-- materialized={materialization}\n-- target={target_relation}\n{select_body}"
        )
        raise ModelExecutionError(
            node_name=node.name,
            relation=relation_for(node.name),
            message=str(exc),
            sql_snippet=failure_snippet,
        ) from exc

configure_query_budget_limit ¶

configure_query_budget_limit(limit)

Inject a configured per-query byte limit (e.g. from budgets.yml).

Source code in src/fastflowtransform/executors/base.py

def configure_query_budget_limit(self, limit: int | None) -> None:
    """
    Inject a configured per-query byte limit (e.g. from budgets.yml).
    """
    if limit is None:
        self._ff_configured_query_limit = None
        return
    try:
        iv = int(limit)
    except Exception:
        self._ff_configured_query_limit = None
        return
    self._ff_configured_query_limit = iv if iv > 0 else None

reset_node_stats ¶

reset_node_stats()

Reset per-node statistics buffer.

The run engine calls this before executing a model so that all stats recorded via _record_query_stats(...) belong to that node.

Source code in src/fastflowtransform/executors/base.py

def reset_node_stats(self) -> None:
    """
    Start a fresh per-node statistics buffer.

    Called by the run engine before a model executes, so that every stat
    recorded via `_record_query_stats(...)` afterwards is attributed to
    that node.
    """
    # Rebind to a new empty list rather than mutating the previous buffer.
    fresh_buffer = []
    self._ff_query_stats_buffer = fresh_buffer

get_node_stats ¶

get_node_stats()

Aggregate buffered QueryStats into a simple dict:

{
  "bytes_scanned": <sum>,
  "rows": <sum>,
  "query_duration_ms": <sum>,
}

Called by the run engine after a node finishes.

Source code in src/fastflowtransform/executors/base.py

def get_node_stats(self) -> dict[str, int]:
    """
    Drain the buffered QueryStats and sum them into a plain dict:

        {
          "bytes_scanned": <sum>,
          "rows": <sum>,
          "query_duration_ms": <sum>,
        }

    Fields that are None on an individual entry are skipped. When nothing
    was buffered, an empty dict is returned instead.

    Called by the run engine after a node finishes.
    """
    drained = self._drain_query_stats()
    if not drained:
        return {}

    # (output key, attribute on each stats entry), in output order.
    field_map = (
        ("bytes_scanned", "bytes_processed"),
        ("rows", "rows"),
        ("query_duration_ms", "duration_ms"),
    )
    totals = {out_key: 0 for out_key, _ in field_map}

    for entry in drained:
        for out_key, attr_name in field_map:
            value = getattr(entry, attr_name)
            if value is not None:
                totals[out_key] += int(value)

    return totals

run_python ¶

run_python(node)

Execute the Python model for a given node and materialize its result.

Source code in src/fastflowtransform/executors/base.py

def run_python(self, node: Node) -> None:
    """
    Run the registered Python model for `node` and materialize what it returns.

    Steps:
      * look up the model function and its dependencies in REGISTRY;
      * reset the per-node HTTP context;
      * build the inputs and, when the node has dependencies, validate the
        required columns against them;
      * call the model function;
      * if a contracts runtime is present and a contract (or project-level
        contracts) applies, build a context, optionally coerce the frame,
        and let the runtime materialize table models itself;
      * otherwise materialize as incremental / view / relation;
      * run post-materialization contract verification when a context was
        built, then snapshot the HTTP context.
    """
    model_func = REGISTRY.py_funcs[node.name]
    upstream = REGISTRY.nodes[node.name].deps or []

    self._reset_http_ctx(node)

    call_args, named_inputs = self._build_python_inputs(node, upstream)
    required_columns = REGISTRY.py_requires.get(node.name, {})
    if upstream:
        # The required-columns check runs against the name -> input mapping.
        self._validate_required(node.name, named_inputs, required_columns)

    result = self._execute_python_func(model_func, call_args, node)

    target = relation_for(node.name)
    node_meta = getattr(node, "meta", {}) or {}
    strategy = self._resolve_materialization_strategy(node_meta)

    # ---------- Runtime contracts for Python models ----------
    contracts_runtime = getattr(self, "runtime_contracts", None)
    contract_ctx = None
    runtime_materialized = False

    if contracts_runtime is not None:
        known_contracts = getattr(self, "_ff_contracts", {}) or {}
        project_level = getattr(self, "_ff_project_contracts", None)

        # Contracts are keyed by the logical relation first, then the node name.
        contract = known_contracts.get(target) or known_contracts.get(node.name)

        if not (contract is None and project_level is None):
            physical_table = self._format_relation_for_ref(node.name)
            contract_ctx = contracts_runtime.build_context(
                node=node,
                relation=target,
                physical_table=physical_table,
                contract=contract,
                project_contracts=project_level,
                is_incremental=(strategy == "incremental"),
            )

            # Pre-coercion hook is optional on the runtime.
            if hasattr(contracts_runtime, "coerce_frame_schema"):
                result = contracts_runtime.coerce_frame_schema(result, contract_ctx)

            # The runtime may take over materialization, for table models only.
            if strategy == "table" and hasattr(contracts_runtime, "materialize_python"):
                runtime_materialized = bool(
                    contracts_runtime.materialize_python(ctx=contract_ctx, df=result)
                )

    # ---------- Materialization ----------
    if not runtime_materialized:
        if strategy == "incremental":
            self._materialize_incremental(target, result, node, node_meta)
        elif strategy == "view":
            self._materialize_view(target, result, node)
        else:
            self._materialize_relation(target, result, node)

    if contract_ctx is not None and contracts_runtime is not None:
        contracts_runtime.verify_after_materialization(ctx=contract_ctx)

    self._snapshot_http_ctx(node)

normalize_physical_type ¶

normalize_physical_type(t)

Canonicalize a physical type string for comparisons (DQ + contracts).

Default: just strip + lower. Engines may override to account for dialect quirks in information_schema (e.g. Postgres timestamp variants, Snowflake VARCHAR(…) / NUMBER(…)).

Source code in src/fastflowtransform/executors/base.py

def normalize_physical_type(self, t: str | None) -> str:
    """
    Canonicalize a physical type string for comparisons (DQ + contracts).

    Default: just strip + lower.
    Engines may override to account for dialect quirks in information_schema
    (e.g. Postgres timestamp variants, Snowflake VARCHAR(…) / NUMBER(…)).
    """
    return (t or "").strip().lower()