fastflowtransform.contracts.runtime.databricks_spark¶

DatabricksSparkRuntimeContracts ¶

Bases: BaseRuntimeContracts

Runtime schema contracts for Spark / DatabricksSparkExecutor.

- verify: write the output table, then compare the contract against the actual Spark schema
- cast: apply Spark casts before writing, then verify

Source code in src/fastflowtransform/contracts/runtime/databricks_spark.py

class DatabricksSparkRuntimeContracts(BaseRuntimeContracts):
    """
    Runtime schema contracts for Spark / DatabricksSparkExecutor.

    - verify: write output table, then compare contract vs actual Spark schema
    - cast: apply Spark casts before writing, then verify

    In cast mode a resolvable physical schema is mandatory: both the SQL
    path (apply_sql_contracts) and the Python path (materialize_python)
    raise RuntimeError when none can be resolved.
    """

    def __init__(self, executor: ContractExecutor):
        super().__init__(executor)

    # --- helpers ---------------------------------------------------------

    def _save_df_as_table(self, *, ctx: RuntimeContractContext, df: Any) -> None:
        """
        Delegate to DatabricksSparkExecutor._save_df_as_table (format handler aware).

        Args:
            ctx: runtime context; ``ctx.physical_table`` is the write target and
                ``ctx.node`` is forwarded to the executor's storage lookup.
            df: the DataFrame to persist.

        Raises:
            RuntimeError: if the executor exposes no callable
                ``_save_df_as_table``.
        """
        save = getattr(self.executor, "_save_df_as_table", None)
        if not callable(save):
            raise RuntimeError(
                "[contracts] Spark runtime contracts require executor._save_df_as_table(...)"
            )

        # Preserve existing storage behavior if available
        storage_meta = {}
        storage_fn = getattr(self.executor, "_storage_meta", None)
        if callable(storage_fn):
            try:
                storage_meta = storage_fn(ctx.node, ctx.physical_table)
            except Exception:
                # Deliberately best-effort: a failing storage lookup must not
                # block the write, so fall back to "no storage overrides".
                storage_meta = {}

        save(ctx.physical_table, df, storage=storage_meta)

    def _verify(
        self,
        *,
        table: str,
        expected: Mapping[str, str],
        cfg: RuntimeContractConfig,
    ) -> None:
        """
        Compare the expected schema against the table's introspected schema.

        Args:
            table: physical table identifier passed to
                ``executor.introspect_table_physical_schema``.
            expected: mapping of column name -> expected type; an empty
                mapping makes this a no-op.
            cfg: runtime config; ``cfg.allow_extra_columns`` controls whether
                columns outside the contract are reported.

        Raises:
            RuntimeError: listing every missing column, type mismatch and
                (when extras are not allowed) unexpected column.
        """
        if not expected:
            return

        # NOTE(review): lookups below use lower-cased names, so this presumably
        # relies on the executor returning lower-cased keys - confirm.
        actual = self.executor.introspect_table_physical_schema(table)
        exp_lower = {k.lower(): v for k, v in expected.items()}

        problems: list[str] = []

        for col, expected_type in expected.items():
            key = col.lower()
            if key not in actual:
                problems.append(f"- missing column {col!r}")
                continue
            got = actual[key]
            if not _types_match(expected_type, got):
                problems.append(f"- column {col!r}: expected {expected_type!r}, got {got!r}")

        if not cfg.allow_extra_columns:
            extras = [c for c in actual if c not in exp_lower]
            if extras:
                problems.append(f"- extra columns present: {sorted(extras)}")

        if problems:
            raise RuntimeError(
                f"[contracts] Spark schema enforcement failed for {table}:\n" + "\n".join(problems)
            )

    def _cast_df(
        self,
        *,
        df: Any,
        expected: Mapping[str, str],
        allow_extra: bool,
    ) -> Any:
        """
        Return a projected DataFrame:
          - expected cols casted to expected Spark SQL types
          - optionally keep extra cols

        Expected columns come first, in contract order, under the name they
        already have in ``df``; extra columns (if kept) follow in their
        original order. Column matching is case-insensitive.

        Args:
            df: source DataFrame.
            expected: mapping of column name -> Spark SQL type string; when
                empty, ``df`` is returned unchanged.
            allow_extra: keep columns that are not part of ``expected``.

        Raises:
            RuntimeError: if an expected column is absent from ``df``.
        """
        # Nothing to cast: return before touching the Spark import helper.
        if not expected:
            return df

        # Use your lazy import helper to avoid hard pyspark deps at import time
        from fastflowtransform.executors._spark_imports import get_spark_functions  # noqa: PLC0415

        F = get_spark_functions()

        cols = list(getattr(df, "columns", []) or [])
        col_map = {c.lower(): c for c in cols}  # actual name by lower key

        exp_lower = {k.lower(): v for k, v in expected.items()}

        # Ensure expected columns exist
        missing = [c for c in exp_lower if c not in col_map]
        if missing:
            raise RuntimeError(f"[contracts] missing expected columns: {sorted(missing)}")

        projections: list[Any] = [
            F.col(col_map[low_name]).cast(str(typ)).alias(col_map[low_name])
            for low_name, typ in exp_lower.items()
        ]

        if allow_extra:
            projections.extend(F.col(c) for c in cols if c.lower() not in exp_lower)

        return df.select(*projections)

    # --- BaseRuntimeContracts hooks -------------------------------------

    def apply_sql_contracts(
        self,
        *,
        ctx: RuntimeContractContext,
        select_body: str,
    ) -> None:
        """
        Materialize a SQL model and enforce its contract according to the mode.

        - off / unknown mode: write the result as-is.
        - verify: write, then verify when a schema could be resolved.
        - cast: cast to the expected schema, write, then verify.

        Args:
            ctx: runtime context (mode, contract, target table).
            select_body: SELECT statement executed through the executor.

        Raises:
            RuntimeError: in cast mode when no physical schema could be
                resolved, or when verification fails.
        """
        expected = expected_physical_schema(executor=self.executor, contract=ctx.contract)
        mode = ctx.config.mode

        # Must be checked before any "no schema -> just write" shortcut,
        # otherwise cast mode would silently write uncast data. This mirrors
        # the check in materialize_python.
        if mode == "cast" and not expected:
            raise RuntimeError(
                f"[contracts] cast mode enabled for {ctx.relation!r} "
                "but no physical schema could be resolved."
            )

        # Spark executor doesn't do CTAS SQL; it materializes via DF + _save_df_as_table
        df = self.executor._execute_sql(select_body)

        if mode == "cast":
            df = self._cast_df(
                df=df, expected=expected, allow_extra=ctx.config.allow_extra_columns
            )

        self._save_df_as_table(ctx=ctx, df=df)

        # Only verify/cast enforce anything; "off" and unknown modes stop here.
        if expected and mode in {"verify", "cast"}:
            self._verify(table=ctx.physical_table, expected=expected, cfg=ctx.config)

    def materialize_python(
        self,
        *,
        ctx: RuntimeContractContext,
        df: Any,
    ) -> bool:
        """
        Spark Python models return a Spark DataFrame. Enforce contracts here
        so we can CAST before writing.

        Args:
            ctx: runtime context (mode, contract, target table).
            df: object returned by the Python model.

        Returns:
            True when this method wrote the table itself (cast / verify with
            a Spark-like frame); False when nothing was written here (mode
            off, unknown mode, or ``df`` does not look like a Spark DataFrame).

        Raises:
            RuntimeError: in cast mode when no physical schema could be
                resolved, or when verification fails.
        """
        mode = ctx.config.mode
        if mode == "off":
            return False

        expected = expected_physical_schema(executor=self.executor, contract=ctx.contract)

        if mode == "cast" and not expected:
            raise RuntimeError(
                f"[contracts] cast mode enabled for {ctx.relation!r} "
                "but no physical schema could be resolved."
            )

        # basic Spark DF shape check (avoid importing pyspark types)
        if not all(hasattr(df, attr) for attr in ("schema", "columns", "select")):
            return False

        if mode == "cast":
            df2 = self._cast_df(
                df=df, expected=expected, allow_extra=ctx.config.allow_extra_columns
            )
            self._save_df_as_table(ctx=ctx, df=df2)
            self._verify(table=ctx.physical_table, expected=expected, cfg=ctx.config)
            return True

        if mode == "verify":
            self._save_df_as_table(ctx=ctx, df=df)
            if expected:
                self._verify(table=ctx.physical_table, expected=expected, cfg=ctx.config)
            return True

        return False

    def verify_after_materialization(self, *, ctx: RuntimeContractContext) -> None:
        """
        Verify an already materialized table against its contract.

        No-op when no schema could be resolved or when the mode is neither
        "verify" nor "cast".

        Raises:
            RuntimeError: when the table's schema violates the contract.
        """
        expected = expected_physical_schema(executor=self.executor, contract=ctx.contract)
        if not expected:
            return
        if ctx.config.mode not in {"verify", "cast"}:
            return
        self._verify(table=ctx.physical_table, expected=expected, cfg=ctx.config)

materialize_python ¶

materialize_python(*, ctx, df)

Spark Python models return a Spark DataFrame. Enforce contracts here so we can CAST before writing.

Source code in src/fastflowtransform/contracts/runtime/databricks_spark.py

def materialize_python(
    self,
    *,
    ctx: RuntimeContractContext,
    df: Any,
) -> bool:
    """
    Enforce the contract on the frame returned by a Spark Python model,
    writing the table here so casts can be applied before the write.

    Returns True when the table was written by this method and False when
    nothing was written (mode "off", an unrecognized mode, or an object
    that lacks the basic Spark DataFrame attributes).

    Raises RuntimeError in cast mode when no physical schema could be
    resolved.
    """
    mode = ctx.config.mode
    if mode == "off":
        return False

    expected = expected_physical_schema(executor=self.executor, contract=ctx.contract)
    if not expected and mode == "cast":
        raise RuntimeError(
            f"[contracts] cast mode enabled for {ctx.relation!r} "
            "but no physical schema could be resolved."
        )

    # Duck-type the frame instead of importing pyspark types.
    spark_like = all(hasattr(df, attr) for attr in ("schema", "columns", "select"))
    if not spark_like or mode not in ("cast", "verify"):
        return False

    frame = df
    if mode == "cast":
        frame = self._cast_df(
            df=df, expected=expected, allow_extra=ctx.config.allow_extra_columns
        )

    self._save_df_as_table(ctx=ctx, df=frame)

    # In cast mode `expected` is guaranteed non-empty by the check above.
    if expected:
        self._verify(table=ctx.physical_table, expected=expected, cfg=ctx.config)
    return True

build_context ¶

build_context(*, node, relation, physical_table, contract, project_contracts, is_incremental=False)

Build a RuntimeContractContext with the correct RuntimeContractConfig.

The caller (run-engine) decides which contract applies and passes:

- node: the fft Node being built
- relation: logical name (typically node.name)
- physical_table: fully-qualified identifier used in SQL
- contract: per-table ContractsFileModel, or None
- project_contracts: parsed project-level contracts.yml, or None

Source code in src/fastflowtransform/contracts/runtime/base.py

def build_context(
    self,
    *,
    node: Node,
    relation: str,
    physical_table: str,
    contract: ContractsFileModel | None,
    project_contracts: ProjectContractsModel | None,
    is_incremental: bool = False,
) -> RuntimeContractContext:
    """
    Assemble the RuntimeContractContext for one node, including its
    resolved RuntimeContractConfig.

    Arguments supplied by the run-engine:
      - node:              the fft Node being built
      - relation:          logical name (typically node.name)
      - physical_table:    fully-qualified identifier used in SQL
      - contract:          per-table ContractsFileModel, or None
      - project_contracts: parsed project-level contracts.yml, or None
      - is_incremental:    stored unchanged on the returned context
    """
    # Config lookup key: the table name declared on the contract when a
    # contract exists, otherwise the logical relation name.
    if contract is None:
        lookup_name = relation
    else:
        lookup_name = contract.table

    resolved_cfg = resolve_runtime_contract_config(
        table_name=lookup_name,
        contract=contract,
        project_contracts=project_contracts,
    )

    return RuntimeContractContext(
        node=node,
        relation=relation,
        physical_table=physical_table,
        contract=contract,
        project_contracts=project_contracts,
        config=resolved_cfg,
        is_incremental=is_incremental,
    )

coerce_frame_schema ¶

coerce_frame_schema(df, ctx)

Optional hook for Python models: given a DataFrame-like object and the RuntimeContractContext, return a new frame whose column types have been coerced to match the expected physical schema (where reasonable).

Default implementation is a no-op. Engine-specific subclasses may override this (e.g. DuckDB + pandas).

Source code in src/fastflowtransform/contracts/runtime/base.py

def coerce_frame_schema(self, df: Any, ctx: RuntimeContractContext) -> Any:
    """
    Hook that lets an engine coerce a Python model's frame to the expected
    physical schema before it is written.

    This base implementation performs no coercion and hands back the very
    same object it received. Engine-specific subclasses may override it
    (e.g. DuckDB + pandas) to return a frame with adjusted column types.
    """
    return df