fastflowtransform.executors.databricks_spark¶

DatabricksSparkExecutor ¶

Bases: BaseExecutor[SDF]

Spark/Databricks executor without pandas: Python models operate on Spark DataFrames.

Source code in src/fastflowtransform/executors/databricks_spark.py

class DatabricksSparkExecutor(BaseExecutor[SDF]):
    """Spark/Databricks executor without pandas: Python models operate on Spark DataFrames."""

    ENGINE_NAME: str = "databricks_spark"
    runtime_contracts: DatabricksSparkRuntimeContracts
    runtime_query_stats: DatabricksSparkQueryStatsRuntime
    runtime_budget: DatabricksSparkBudgetRuntime
    snapshot_runtime: DatabricksSparkSnapshotRuntime

    def __init__(
        self,
        master: str = "local[*]",
        app_name: str = "fastflowtransform",
        *,
        extra_conf: dict[str, Any] | None = None,
        warehouse_dir: str | None = None,
        use_hive_metastore: bool = False,
        catalog: str | None = None,
        database: str | None = None,
        table_format: str | None = "parquet",
        table_options: dict[str, Any] | None = None,
        spark: SparkSession | None = None,
    ):
        """
        Create the executor, building a SparkSession unless one is supplied.

        Args:
            master: Spark master URL passed to the session builder.
            app_name: Spark application name passed to the session builder.
            extra_conf: Additional Spark config entries; entries whose value is
                None are skipped, everything else is stringified.
            warehouse_dir: Optional warehouse directory. Relative paths are
                resolved against the current working directory and the
                directory is created if missing.
            use_hive_metastore: Enable Hive support on the builder.
            catalog: Optional value for `spark.sql.catalog.spark_catalog`.
            database: Optional database; created if missing and selected as the
                current database (failures while selecting it are ignored).
            table_format: Default table format for managed tables. "delta"
                triggers the Delta-specific builder configuration.
            table_options: Default write options (keys and values stringified).
            spark: Pre-built session. When given it is used as-is and the
                builder configuration assembled here is not applied to it.

        Raises:
            RuntimeError: If Delta is requested for a self-built session but
                delta-spark is not installed or the Delta classes are missing.
        """
        extra_conf = dict(extra_conf or {})
        self._user_spark = spark

        builder = SparkSession.builder.master(master).appName(app_name)
        catalog_key = "spark.sql.catalog.spark_catalog"
        ext_key = "spark.sql.extensions"

        # Warehouse directory
        warehouse_path: Path | None = None
        if warehouse_dir:
            warehouse_path = Path(warehouse_dir).expanduser()
            if not warehouse_path.is_absolute():
                warehouse_path = Path.cwd() / warehouse_path
            warehouse_path.mkdir(parents=True, exist_ok=True)
            builder = builder.config("spark.sql.warehouse.dir", str(warehouse_path))

        catalog_value = _as_nonempty_str(catalog)
        if catalog_value:
            builder = builder.config(catalog_key, catalog_value)

        # Extra config (None values are skipped rather than sent as "None")
        for key, value in extra_conf.items():
            if value is not None:
                builder = builder.config(str(key), str(value))

        if use_hive_metastore:
            builder = builder.config("spark.sql.catalogImplementation", "hive")
            builder = builder.enableHiveSupport()

        fmt_requested = (table_format or "").strip().lower()
        wants_delta = fmt_requested == "delta"

        # Apply Delta configuration last, after all Spark configs are set.
        # Only relevant when we build the session ourselves.
        if wants_delta and self._user_spark is None:
            if configure_spark_with_delta_pip is None:
                raise RuntimeError(
                    "Delta table_format requested for DatabricksSparkExecutor, "
                    "but 'delta-spark' is not installed. "
                    "Install it with: pip install delta-spark"
                )
            builder = configure_spark_with_delta_pip(builder)

            # Make sure the Delta extension is part of spark.sql.extensions
            # without dropping extensions the caller configured.
            ext_value = _as_nonempty_str(extra_conf.get(ext_key))
            merged_ext, changed = _ensure_csv_token(ext_value, _DELTA_EXTENSION)
            if changed or ext_value is None:
                builder = builder.config(ext_key, merged_ext)

            # Respect an explicit catalog choice; otherwise use the Delta catalog.
            extra_catalog = _as_nonempty_str(extra_conf.get(catalog_key))
            catalog_overridden = bool(catalog_value) or bool(extra_catalog)
            if not catalog_overridden:
                builder = builder.config(catalog_key, _DELTA_CATALOG)

        self.spark = self._user_spark or builder.getOrCreate()
        self._registered_path_sources: dict[str, dict[str, Any]] = {}
        self.warehouse_dir = warehouse_path
        self.catalog = catalog
        self.database = database
        self.schema = database
        self.runtime_query_stats = DatabricksSparkQueryStatsRuntime(self)
        self.runtime_budget = DatabricksSparkBudgetRuntime(self)

        if database:
            # Quote through the shared helper so embedded backticks are escaped.
            self._execute_sql_basic(f"CREATE DATABASE IF NOT EXISTS {self._q_ident(database)}")
            with suppress(Exception):
                self.spark.catalog.setCurrentDatabase(database)

        self.spark_table_format: str | None = fmt_requested or None
        self.spark_table_options = {str(k): str(v) for k, v in (table_options or {}).items()}

        # ---- Delta availability check ----
        self._delta_ok = _has_delta(self.spark)

        # Log capabilities whenever Delta is requested or detected
        if wants_delta or self._delta_ok:
            _log_delta_capabilities(
                self.spark,
                wants_delta=wants_delta,
                delta_ok=self._delta_ok,
                user_spark=self._user_spark,
                table_format=self.spark_table_format,
            )

        # A user-supplied session is trusted; only self-built sessions fail hard.
        if wants_delta and not self._delta_ok and self._user_spark is None:
            raise RuntimeError(
                "Delta table_format requested, but the Delta Lake classes are not available. "
                "Install delta-spark or provide a SparkSession already configured for Delta."
            )

        # Unified format handler for managed tables (Delta, Iceberg, generic Parquet/ORC/etc.)
        self._format_handler: SparkFormatHandler = get_spark_format_handler(
            self.spark_table_format,
            self.spark,
            table_options=self.spark_table_options,
            sql_runner=self._execute_sql,
        )

        self.runtime_contracts = DatabricksSparkRuntimeContracts(self)
        self.snapshot_runtime = DatabricksSparkSnapshotRuntime(self)

    def execute_test_sql(self, stmt: Any) -> Any:
        """
        Run data-quality test SQL on Spark and wrap the rows as a fetchable result.

        `stmt` may be a single SQL string, a (possibly nested) iterable of
        statements, or any other object, which is stringified. For an iterable,
        every item is executed in order and only the last item's rows are kept.
        """

        def _collect(sql_text: str) -> Any:
            return rows_to_tuples(self.spark.sql(sql_text).collect())

        def _run_one(s: Any) -> Any:
            if isinstance(s, str):
                return _collect(s)
            is_sequence = isinstance(s, Iterable) and not isinstance(s, (bytes, bytearray, str))
            if not is_sequence:
                return _collect(str(s))
            last = None
            for item in s:
                last = _run_one(item)
            return last

        return make_fetchable(_run_one(stmt))

    def compute_freshness_delay_minutes(self, table: str, ts_col: str) -> tuple[float | None, str]:
        sql = (
            f"select (unix_timestamp(current_timestamp()) - unix_timestamp(max({ts_col}))) / 60.0 "
            f"as delay_min from {table}"
        )
        res = self.execute_test_sql(sql)
        row = getattr(res, "fetchone", lambda: None)()
        val = row[0] if row else None
        return (float(val) if val is not None else None, sql)

    def _execute_sql_basic(self, sql: str) -> SDF:
        return self.spark.sql(sql)

    def _execute_sql(self, sql: str) -> SDF:
        """
        Central Spark SQL runner.

        - Guarded by FF_SPK_MAX_BYTES via the cost guard.
        - Returns a Spark DataFrame (same as spark.sql).
        - Records best-effort query stats for run_results.json.
        """

        def _exec() -> SDF:
            return self.spark.sql(sql)

        return self.runtime_budget.run_sql(
            sql,
            exec_fn=_exec,
            stats_runtime=self.runtime_query_stats,
            stats_adapter=self.runtime_budget.spark_stats_adapter(sql),
        )

    # ---------- Frame hooks (required) ----------
    def _quote_identifier(self, ident: str) -> str:
        return self._format_handler.qualify_identifier(ident, database=self.database)

    def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> SDF:
        # relation may optionally be "db.table" (via source()/ref())
        physical = self._format_handler.qualify_identifier(relation, database=self.database)
        return self.spark.table(physical)

    def _materialize_relation(self, relation: str, df: SDF, node: Node) -> None:
        if not self._is_frame(df):
            raise TypeError("Spark model must return a Spark DataFrame")
        storage_meta = self._storage_meta(node, relation)
        # Delegate managed/unmanaged handling to _save_df_as_table so Iceberg
        # (or other handlers) can consistently enforce managed tables.
        start = perf_counter()
        self._save_df_as_table(relation, df, storage=storage_meta)
        duration_ms = int((perf_counter() - start) * 1000)
        self._record_spark_dataframe_stats(df, duration_ms)

    def _create_view_over_table(self, view_name: str, backing_table: str, node: Node) -> None:
        """Compatibility hook: create a simple SELECT * view over an existing table."""
        view_sql = self._sql_identifier(view_name)
        backing_sql = self._sql_identifier(backing_table)
        self._execute_sql_basic(f"CREATE OR REPLACE VIEW {view_sql} AS SELECT * FROM {backing_sql}")

    def _validate_required(
        self, node_name: str, inputs: Any, requires: dict[str, set[str]]
    ) -> None:
        if not requires:
            return

        def cols(df: SDF) -> set[str]:
            return set(df.schema.fieldNames())

        errors: list[str] = []
        # Single dependency: requires typically contains exactly one entry (ignore the key)
        if isinstance(inputs, SDF):
            need = next(iter(requires.values()), set())
            missing = need - cols(inputs)
            if missing:
                errors.append(f"- missing columns: {sorted(missing)} | have={sorted(cols(inputs))}")
        else:
            # Multiple dependencies: keys in requires = physical relations (relation_for(dep))
            for rel, need in requires.items():
                if rel not in inputs:
                    errors.append(f"- missing dependency key '{rel}'")
                    continue
                missing = need - cols(inputs[rel])
                if missing:
                    errors.append(
                        f"- [{rel}] missing: {sorted(missing)} | have={sorted(cols(inputs[rel]))}"
                    )

        if errors:
            raise ValueError(
                "Required columns check failed for Spark model "
                f"'{node_name}'.\n" + "\n".join(errors)
            )

    def _columns_of(self, frame: SDF) -> list[str]:  # pragma: no cover
        return frame.schema.fieldNames()

    def _is_frame(self, obj: Any) -> bool:  # pragma: no cover
        """Tell whether `obj` is a Spark DataFrame (an `SDF` instance)."""
        is_spark_frame = isinstance(obj, SDF)
        return is_spark_frame

    def _frame_name(self) -> str:  # pragma: no cover
        return "Spark"

    # ---- Helpers ----
    @staticmethod
    def _q_ident(value: str | None) -> str:
        if value is None:
            return ""
        return f"`{value.replace('`', '``')}`"

    def _storage_meta(self, node: Node | None, relation: str) -> dict[str, Any]:
        """
        Resolve the storage overrides that apply to `relation`.

        Lookup order:
          1. the given node (its `meta["storage"]`, then the storage registry
             entry for its name);
          2. any REGISTRY node whose `relation_for(name)` equals `relation`
             once quotes are stripped;
          3. the storage registry entry keyed by the stripped relation name.
        """
        target = self._strip_quotes(relation)

        def _configured(candidate: Any) -> Any:
            # Inline node meta wins over the registry entry for the node's name.
            inline = dict((candidate.meta or {}).get("storage") or {})
            if inline:
                return inline
            return storage.get_model_storage(candidate.name)

        # 1) Direct node meta / storage config
        if node is not None:
            found = _configured(node)
            if found:
                return found

        # 2) Search REGISTRY nodes by relation_for(name)
        registry_nodes = getattr(REGISTRY, "nodes", {})
        for cand in registry_nodes.values():
            try:
                if self._strip_quotes(relation_for(cand.name)) != target:
                    continue
                found = _configured(cand)
                if found:
                    return found
            except Exception:
                # A candidate that cannot be resolved is skipped.
                continue

        # 3) Direct storage override by relation name
        return storage.get_model_storage(target)

    def _write_to_storage_path(self, relation: str, df: SDF, storage_meta: dict[str, Any]) -> None:
        """
        Write `df` through `storage.spark_write_to_path` using `storage_meta`.

        The executor's table format and options are passed as defaults. If the
        metadata carries a `path`, the catalog is asked to refresh it afterwards;
        refresh failures are ignored.
        """
        identifier = ".".join(self._identifier_parts(relation))

        storage.spark_write_to_path(
            self.spark,
            identifier,
            df,
            storage=storage_meta,
            default_format=self.spark_table_format,
            default_options=self.spark_table_options,
        )

        written_path = storage_meta.get("path")
        if not written_path:
            return
        with suppress(Exception):
            self.spark.catalog.refreshByPath(written_path)

    def _record_spark_dataframe_stats(self, df: SDF, duration_ms: int) -> None:
        self.runtime_query_stats.record_dataframe(df, duration_ms)

    # ---- SQL hooks ----
    def _format_relation_for_ref(self, name: str) -> str:
        """
        Format a ref(...) relation for use in SQL.

        The logical relation from `relation_for(name)` is rendered by
        `_sql_identifier`. Any format-specific qualification (for example an
        Iceberg catalog prefix) is left to the format handler behind it.
        """
        logical = relation_for(name)
        return self._sql_identifier(logical)

    def _this_identifier(self, node: Node) -> str:
        """Return the SQL identifier of the relation that `node` itself builds."""
        logical = relation_for(node.name)
        return self._sql_identifier(logical)

    def _format_source_reference(
        self, cfg: dict[str, Any], source_name: str, table_name: str
    ) -> str:
        location = cfg.get("location")
        identifier = cfg.get("identifier")

        if location:
            alias = identifier or f"__ff_src_{source_name}_{table_name}"
            fmt_src = cfg.get("format")
            if not fmt_src:
                raise KeyError(
                    f"Source {source_name}.{table_name} requires 'format' when using a location"
                )

            options = dict(cfg.get("options") or {})
            descriptor = {
                "location": location,
                "format": fmt_src,
                "options": options,
            }
            existing = self._registered_path_sources.get(alias)
            if existing != descriptor:
                reader = self.spark.read.format(fmt_src)
                if options:
                    reader = reader.options(**options)
                df = reader.load(location)
                df.createOrReplaceTempView(alias)
                self._registered_path_sources[alias] = descriptor
            return self._q_ident(alias)

        if not identifier:
            raise KeyError(f"Source {source_name}.{table_name} missing identifier")
        catalog = cfg.get("catalog")
        schema = cfg.get("schema") or cfg.get("database")
        if catalog or schema:
            logical = ".".join([p for p in (catalog, schema, identifier) if p])
            return self._sql_identifier(logical)

        fallback_db = self.database or self.spark.catalog.currentDatabase()
        return self._sql_identifier(str(identifier), database=fallback_db)

    def _format_test_table(self, table: str | None) -> str | None:
        """
        Format a test table reference.

        The base-class result is passed through the format handler when it is a
        string; any other result (such as None) is returned unchanged.
        """
        formatted = super()._format_test_table(table)
        if isinstance(formatted, str):
            return self._format_handler.format_test_table(formatted, database=self.database)
        return formatted

    # ---- Spark table helpers ----
    @staticmethod
    def _strip_quotes(identifier: str) -> str:
        return identifier.replace("`", "").replace('"', "")

    def _identifier_parts(self, identifier: str) -> list[str]:
        cleaned = self._strip_quotes(identifier)
        return [part for part in cleaned.split(".") if part]

    def _physical_identifier(self, identifier: str, *, database: str | None = None) -> str:
        db = database if database is not None else self.database
        return self._format_handler.qualify_identifier(identifier, database=db)

    def _sql_identifier(self, identifier: str, *, database: str | None = None) -> str:
        db = database if database is not None else self.database
        return self._format_handler.format_identifier_for_sql(identifier, database=db)

    def _warehouse_base(self) -> Path | None:
        try:
            conf_val = self.spark.conf.get("spark.sql.warehouse.dir", "spark-warehouse")
        except Exception:
            conf_val = "spark-warehouse"

        if not isinstance(conf_val, str):
            conf_val = str(conf_val)
        parsed = urlparse(conf_val)
        scheme = (parsed.scheme or "").lower()

        if scheme and scheme != "file":
            return None

        if scheme == "file":
            if parsed.netloc and parsed.netloc not in {"", "localhost"}:
                return None
            raw_path = unquote(parsed.path or "")
            if not raw_path:
                return None
            base = Path(raw_path)
        else:
            base = Path(conf_val)

        if not base.is_absolute():
            base = Path.cwd() / base
        return base

    def _table_location(self, parts: list[str]) -> Path | None:
        base = self._warehouse_base()
        if base is None or not parts:
            return None

        filtered = [p for p in parts if p]
        if not filtered:
            return None

        catalog_cutoff = 3
        if len(filtered) >= catalog_cutoff and filtered[0].lower() in {"spark_catalog", "spark"}:
            filtered = filtered[1:]

        table = filtered[-1]
        schema_cutoff = 2
        schema = filtered[-2] if len(filtered) >= schema_cutoff else None

        location = base
        if schema:
            location = location / f"{schema}.db"
        return location / table

    def _save_df_as_table(
        self, identifier: str, df: SDF, *, storage: dict[str, Any] | None = None
    ) -> None:
        """
        Save a DataFrame as a (managed or unmanaged) table.

        - If storage["path"] is set -> unmanaged/path-based via storage.spark_write_to_path.
        - Otherwise -> managed table via the configured format handler
          (Delta, Parquet, future Iceberg, ...).
        """
        parts = self._identifier_parts(identifier)
        if not parts:
            raise ValueError(f"Invalid Spark table identifier: {identifier}")

        storage_meta = dict(storage or self._storage_meta(None, identifier) or {})

        path_override = storage_meta.get("path")
        if path_override and not self._format_handler.allows_unmanaged_paths():
            echo_debug(
                f"Ignoring storage.path override for table '{identifier}' because "
                f"format '{self._format_handler.table_format or 'default'}' "
                "requires managed tables."
            )
            path_override = None

        if path_override:
            self._write_to_storage_path(identifier, df, storage_meta)
            return

        table_name = ".".join(parts)
        # Managed tables: delegate to the format handler (Delta, Parquet, Iceberg, ...)
        self._format_handler.save_df_as_table(table_name, df)

        with suppress(Exception):
            self._execute_sql_basic(
                f"ANALYZE TABLE {self._sql_identifier(table_name)} COMPUTE STATISTICS"
            )

    def _create_or_replace_view(self, target_sql: str, select_body: str, node: Node) -> None:
        self._execute_sql_basic(f"CREATE OR REPLACE VIEW {target_sql} AS {select_body}")

    def _create_or_replace_table(self, target_sql: str, select_body: str, node: Node) -> None:
        preview = f"-- target={target_sql}\n{select_body}"
        try:
            df = self._execute_sql(select_body)
            storage_meta = self._storage_meta(node, target_sql)
            self._save_df_as_table(target_sql, df, storage=storage_meta)
        except Exception as exc:
            raise ModelExecutionError(node.name, target_sql, str(exc), sql_snippet=preview) from exc

    def _create_or_replace_view_from_table(
        self, view_name: str, backing_table: str, node: Node
    ) -> None:
        view_sql = self._sql_identifier(view_name)
        backing_sql = self._sql_identifier(backing_table)
        self._execute_sql_basic(f"CREATE OR REPLACE VIEW {view_sql} AS SELECT * FROM {backing_sql}")

    # ---- Meta hook ----
    def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None:
        """
        After successful materialization, upsert the node's row in _ff_meta.

        NOTE(review): described as best-effort, but no errors are caught here;
        any tolerance would have to live in the helpers - confirm.
        """
        engine_name = "databricks_spark"
        ensure_meta_table(self)
        upsert_meta(self, node.name, relation, fingerprint, engine_name)

    # ── Incremental API ─────────────────────────────────────────
    def exists_relation(self, relation: str) -> bool:
        """Check whether a table/view exists (optionally qualified with database)."""
        return self._format_handler.relation_exists(relation, database=self.database)

    def create_table_as(self, relation: str, select_sql: str) -> None:
        """CREATE TABLE AS with cleaned SELECT body."""
        body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
        df = self._execute_sql(body)
        self._save_df_as_table(relation, df)

    def full_refresh_table(self, relation: str, select_sql: str) -> None:
        """
        Engine-specific full refresh for incremental fallbacks.
        Important: NO 'REPLACE TABLE' SQL, but DataFrame path + saveAsTable instead.
        """
        body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
        # Delegate to format handler via _save_df_as_table for managed, or storage for unmanaged
        df = self._execute_sql(body)
        self._save_df_as_table(relation, df)

    def incremental_insert(self, relation: str, select_sql: str) -> None:
        """INSERT INTO with cleaned SELECT body (format-aware via handler)."""
        body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
        self._format_handler.incremental_insert(relation, body)

    def incremental_merge(self, relation: str, select_sql: str, unique_key: list[str]) -> None:
        """
        Merge the rows produced by `select_sql` into `relation`.

        The format handler's native merge is tried first. If it raises
        NotImplementedError, a generic fallback rewrites the whole table:
        existing rows whose key appears in the incoming data are dropped, the
        incoming rows are appended, and the result overwrites the table. With
        an empty `unique_key` the fallback appends and de-duplicates instead.

        Raises:
            ModelExecutionError: In the fallback, when a key column is missing
                from the incoming rows.
        """
        body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")

        # First: let the current format handler try to do a native merge.
        # - DeltaFormatHandler -> DeltaTable.merge()
        # - IcebergFormatHandler -> Spark SQL MERGE INTO
        try:
            self._format_handler.incremental_merge(relation, body, unique_key)
            return
        except NotImplementedError:
            # Format handler doesn't support MERGE → fall back to generic Spark strategy.
            pass

        # Fallback for formats without native merge:
        # overwrite = (existing minus keys being updated) UNION (new rows)
        # Every frame realized below is tracked here so it can be released in `finally`.
        materialized: list[SDF] = []

        def _materialize(df: SDF) -> SDF:
            """
            Ensure the frame is realized independently of the source table so an
            overwrite doesn't conflict with the read path.
            """
            try:
                cp = df.localCheckpoint(eager=True)
                materialized.append(cp)
                return cp
            except Exception:
                # Checkpointing failed: fall back to cache() and force it with count().
                cached = df.cache()
                cached.count()
                materialized.append(cached)
                return cached

        try:
            physical = self._physical_identifier(relation)
            existing = _materialize(self.spark.table(physical))
            # Note: the incoming SELECT runs via spark.sql directly, not via _execute_sql.
            incoming = _materialize(self.spark.sql(body))

            if unique_key:
                # ensure key columns exist on incoming
                missing = [k for k in unique_key if k not in incoming.columns]
                if missing:
                    raise ModelExecutionError(
                        node_name="__python_incremental__",
                        relation=relation,
                        message=(
                            "incremental_merge fallback: missing key columns on incoming: "
                            f"{missing}"
                        ),
                    )
                key_df = incoming.select(*unique_key).dropDuplicates()
                # left_anti: keep only rows whose keys are NOT in incoming
                kept = existing.join(key_df, on=unique_key, how="left_anti")
                merged = kept.unionByName(incoming, allowMissingColumns=True)
            else:
                # No keys → append & deduplicate
                merged = existing.unionByName(incoming, allowMissingColumns=True).dropDuplicates()

            # Realize the merged result before overwriting the table it was read from.
            merged = _materialize(merged)
            # Full overwrite with merged result
            self._save_df_as_table(relation, merged)
        finally:
            # Release every realized frame; failures while releasing are ignored.
            for handle in materialized:
                with suppress(Exception):
                    handle.unpersist()

    def alter_table_sync_schema(
        self, relation: str, select_sql: str, *, mode: str = "append_new_columns"
    ) -> None:
        """
        Best-effort additive schema sync:
          - infer select schema via LIMIT 0
          - add missing columns as STRING (safe default)
        """
        if mode not in {"append_new_columns", "sync_all_columns"}:
            return
        # Target schema
        try:
            physical = self._physical_identifier(relation)
            target_df = self.spark.table(physical)
        except Exception:
            return
        existing = {f.name for f in target_df.schema.fields}
        # Output schema from the SELECT
        body = self._first_select_body(select_sql).strip().rstrip(";\n\t ")
        probe = self._execute_sql_basic(f"SELECT * FROM ({body}) q LIMIT 0")
        to_add = [f for f in probe.schema.fields if f.name not in existing]
        if not to_add:
            return

        def _spark_sql_type(dt: DataType) -> str:
            """Simple, portable mapping for Spark SQL types."""
            return (
                getattr(dt, "simpleString", lambda: "string")().upper()
                if hasattr(dt, "simpleString")
                else "STRING"
            )

        cols_sql = ", ".join([f"`{f.name}` {_spark_sql_type(f.dataType)}" for f in to_add])
        table_sql = self._sql_identifier(relation)
        self._execute_sql_basic(f"ALTER TABLE {table_sql} ADD COLUMNS ({cols_sql})")

    # ── Snapshot runtime delegation ──────────────────────────────────────
    def run_snapshot_sql(self, node: Node, env: Environment) -> None:
        self.snapshot_runtime.run_snapshot_sql(node, env)

    def snapshot_prune(
        self,
        relation: str,
        unique_key: list[str],
        keep_last: int,
        *,
        dry_run: bool = False,
    ) -> None:
        self.snapshot_runtime.snapshot_prune(
            relation,
            unique_key,
            keep_last,
            dry_run=dry_run,
        )

    def execute_hook_sql(self, sql: str) -> None:
        """
        Entry point for hook SQL.

        Accepts a string that may contain multiple ';'-separated statements.
        `_RunEngine._execute_hook_sql` has already normalized away semicolons
        in full-line comments, so naive splitting by ';' is acceptable here.
        """
        for stmt in (part.strip() for part in sql.split(";")):
            if not stmt:
                continue
            # Reuse your existing single-statement executor
            self._execute_sql(stmt)

    def load_seed(
        self, table: str, df: pd.DataFrame, schema: str | None = None
    ) -> tuple[bool, str, bool]:
        """
        Load a pandas seed frame into a Spark table.

        When `schema` is given and `table` is unqualified, the database is
        created if missing and prefixed to the table name. The frame is written
        to the seed's configured storage path when one exists and the format
        handler allows unmanaged paths; otherwise as a managed table.

        Returns:
            `(True, target_identifier, created_schema)`, where `created_schema`
            reports whether the CREATE DATABASE statement was issued.

        Raises:
            ValueError: If `table` contains no usable name part.
            RuntimeError: If writing the seed fails.
        """
        parts = self._identifier_parts(self._strip_quotes(table))

        created_schema = False
        if schema and len(parts) == 1:
            schema_name = self._strip_quotes(schema)
            if schema_name:
                # Ensure database exists when a separate schema is provided.
                self._execute_sql_basic(
                    f"CREATE DATABASE IF NOT EXISTS {self._q_ident(schema_name)}"
                )
                created_schema = True
                parts = [schema_name, parts[0]]

        if not parts:
            raise ValueError(f"Invalid Spark table identifier: {table}")

        target_identifier = ".".join(parts)
        target_sql = self._sql_identifier(target_identifier)
        handler = getattr(self, "_format_handler", None)

        storage_meta = storage.get_seed_storage(target_identifier)

        sdf = self.spark.createDataFrame(df)

        # Without a handler (or without the capability) unmanaged paths are allowed.
        path_allowed = bool(getattr(handler, "allows_unmanaged_paths", lambda: True)())
        write_to_path = bool(storage_meta.get("path") and path_allowed)

        try:
            if write_to_path:
                self._write_to_storage_path(target_identifier, sdf, storage_meta)
            else:
                # An explicit empty path keeps _save_df_as_table from looking up
                # model storage overrides for the seed.
                self._save_df_as_table(target_identifier, sdf, storage={"path": None})
        except Exception as exc:  # pragma: no cover
            raise RuntimeError(f"Spark seed load failed for {target_sql}: {exc}") from exc

        return True, target_identifier, created_schema

    # ---- Unit-test helpers -------------------------------------------------

    def utest_load_relation_from_rows(self, relation: str, rows: list[dict]) -> None:
        """
        Load rows into a Spark table for unit tests (replace if exists).

        We go via pandas → Spark so schema is inferred from the Python
        data, then delegate to the same table-writing pipeline as the
        normal engine (_save_df_as_table), so table_format / storage
        options / catalogs are all respected.
        """
        pdf = pd.DataFrame(rows)
        # Spark can infer schema from the pandas DataFrame, even for empty
        # frames (it will just create an empty table with no rows).
        sdf = self.spark.createDataFrame(pdf)
        # Use the same path as normal model materialization so that
        # Delta/Iceberg/etc. are handled consistently.
        self._save_df_as_table(relation, sdf)

    def utest_read_relation(self, relation: str) -> pd.DataFrame:
        """
        Read a relation as a pandas DataFrame for unit-test assertions.

        The utest framework always compares on pandas, so we convert from
        Spark DataFrame here.
        """
        physical = self._physical_identifier(relation)
        sdf = self.spark.table(physical)
        return sdf.toPandas()

    def utest_clean_target(self, relation: str) -> None:
        """
        For unit tests: drop any view or table with this name.

        We:
          - try DROP VIEW IF EXISTS ...
          - try DROP TABLE IF EXISTS ...
        and ignore type-mismatch errors, so it doesn't matter whether a
        table or a view currently exists under that name.
        """
        ident = self._sql_identifier(relation)

        # Drop view first; ignore errors if it's actually a table or missing.
        with suppress(Exception):
            self._execute_sql_basic(f"DROP VIEW IF EXISTS {ident}")

        # Then drop table; ignore errors if it's actually a view or missing.
        with suppress(Exception):
            self._execute_sql_basic(f"DROP TABLE IF EXISTS {ident}")

    def collect_docs_columns(self) -> dict[str, list[ColumnInfo]]:
        """
        Collect column metadata via Spark catalog for docs rendering.
        """
        try:
            tables = list(self.spark.catalog.listTables())
        except Exception:
            return {}

        out: dict[str, list[ColumnInfo]] = {}
        seen: set[tuple[str | None, str]] = set()

        def _list_columns(table_name: str, database: str | None) -> list[Any]:
            ident = table_name if not database else f"{database}.{table_name}"
            try:
                return list(self.spark.catalog.listColumns(ident))
            except TypeError:
                return list(self.spark.catalog.listColumns(table_name, database))

        for tbl in tables:
            database = getattr(tbl, "database", None)
            raw_name = getattr(tbl, "name", None)
            if not raw_name:
                continue
            table_name = str(raw_name)
            key = (database, table_name)
            if key in seen:
                continue
            seen.add(key)
            try:
                cols = _list_columns(table_name, database)
            except Exception:
                continue
            if not cols:
                continue

            keys: set[str] = {table_name}
            catalog = getattr(tbl, "catalog", None)
            if database:
                keys.add(f"{database}.{table_name}")
            if database and catalog:
                keys.add(f"{catalog}.{database}.{table_name}")
            for c in cols:
                nullable = bool(getattr(c, "nullable", False))
                dtype = str(getattr(c, "dataType", ""))
                col_name = getattr(c, "name", None)
                if not col_name:
                    continue
                info = ColumnInfo(str(col_name), dtype, nullable)
                for k in keys:
                    out.setdefault(k, []).append(info)

        return out

    def _introspect_columns_metadata(
        self,
        table: str,
        column: str | None = None,
    ) -> list[tuple[str, str]]:
        """
        Internal helper: return [(column_name, spark_sql_type), ...] for a Spark table.

        - Uses Spark's DataFrame schema (no information_schema dependency).
        - Works with db.table identifiers via _physical_identifier().
        - Optionally restricts to a single column (case-insensitive).
        """
        physical = self._physical_identifier(table)
        df = self.spark.table(physical)

        want = column.lower() if column is not None else None

        out: list[tuple[str, str]] = []
        for field in df.schema.fields:
            name = field.name
            if want is not None and name.lower() != want:
                continue

            dt = field.dataType
            try:
                # e.g. "bigint", "string", "timestamp", "decimal(10,2)", "array<string>", ...
                typ = dt.simpleString()
            except Exception:
                typ = str(dt)

            # Keep consistent with your existing introspect_column_physical_type()
            out.append((str(name), str(typ).upper()))

        return out

    def introspect_column_physical_type(self, table: str, column: str) -> str | None:
        """
        Spark: return Spark SQL type (simpleString) for one column, uppercased.
        """
        rows = self._introspect_columns_metadata(table, column=column)
        return rows[0][1] if rows else None

    def introspect_table_physical_schema(self, table: str) -> dict[str, str]:
        """
        Spark: return {lower(column_name): spark_sql_type} for all columns of `table`.
        """
        rows = self._introspect_columns_metadata(table, column=None)
        # Lower keys to match runtime verifier behavior (case-insensitive compare)
        return {name.lower(): typ for (name, typ) in rows}

execute_test_sql ¶

execute_test_sql(stmt)

Execute lightweight SQL for DQ tests via Spark and return fetchable rows.

Source code in src/fastflowtransform/executors/databricks_spark.py

def execute_test_sql(self, stmt: Any) -> Any:
    """
    Run data-quality test SQL through Spark and wrap the result as fetchable.

    ``stmt`` may be a single SQL string, a (possibly nested) iterable of
    statements, or any other object, which is stringified before execution.
    For an iterable, every item is executed in order and only the rows of the
    last one are kept (``None`` when the iterable is empty).
    """

    def _collect(sql_text: str) -> Any:
        # Collect the full result and convert the rows to plain tuples.
        return rows_to_tuples(self.spark.sql(sql_text).collect())

    def _dispatch(candidate: Any) -> Any:
        if isinstance(candidate, str):
            return _collect(candidate)

        # bytes / bytearray are iterable but must be treated as a single statement.
        is_statement_list = isinstance(candidate, Iterable) and not isinstance(
            candidate, (bytes, bytearray, str)
        )
        if not is_statement_list:
            return _collect(str(candidate))

        last_rows = None
        for element in candidate:
            last_rows = _dispatch(element)
        return last_rows

    return make_fetchable(_dispatch(stmt))

on_node_built ¶

on_node_built(node, relation, fingerprint)

After successful materialization, upsert _ff_meta (best-effort).

Source code in src/fastflowtransform/executors/databricks_spark.py

def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None:
    """
    Record a freshly built node in the ``_ff_meta`` table.

    Ensures the meta table exists, then upserts an entry for ``node.name``
    with its relation, fingerprint and the ``"databricks_spark"`` engine tag.
    Exceptions raised by either helper are not caught in this method.
    """
    engine_tag = "databricks_spark"
    ensure_meta_table(self)
    upsert_meta(self, node.name, relation, fingerprint, engine_tag)

exists_relation ¶

exists_relation(relation)

Check whether a table/view exists (optionally qualified with database).

Source code in src/fastflowtransform/executors/databricks_spark.py

def exists_relation(self, relation: str) -> bool:
    """
    Report whether ``relation`` exists as a table or view.

    The lookup is delegated to the configured format handler, which receives
    this executor's ``database`` as the default qualifier.
    """
    handler = self._format_handler
    return handler.relation_exists(relation, database=self.database)

create_table_as ¶

create_table_as(relation, select_sql)

CREATE TABLE AS with cleaned SELECT body.

Source code in src/fastflowtransform/executors/databricks_spark.py

def create_table_as(self, relation: str, select_sql: str) -> None:
    """
    Materialize the result of ``select_sql`` as table ``relation``.

    The selectable body is extracted, surrounding whitespace and trailing
    semicolons are removed, the query is executed, and the resulting
    DataFrame is written through ``_save_df_as_table``.
    """
    selectable = self._selectable_body(select_sql)
    # Trim whitespace first, then any trailing ';' / whitespace mix.
    cleaned = selectable.strip().rstrip(";\n\t ")
    result_df = self._execute_sql(cleaned)
    self._save_df_as_table(relation, result_df)

full_refresh_table ¶

full_refresh_table(relation, select_sql)

Engine-specific full refresh for incremental fallbacks. Important: NO 'REPLACE TABLE' SQL, but DataFrame path + saveAsTable instead.

Source code in src/fastflowtransform/executors/databricks_spark.py

def full_refresh_table(self, relation: str, select_sql: str) -> None:
    """
    Rebuild ``relation`` from scratch as the fallback for incremental models.

    No 'REPLACE TABLE' SQL is issued: the cleaned SELECT is executed into a
    DataFrame, which is then written via ``_save_df_as_table``.
    """
    selectable = self._selectable_body(select_sql)
    cleaned = selectable.strip().rstrip(";\n\t ")
    # The write goes through the common DataFrame save path, not SQL DDL.
    refreshed = self._execute_sql(cleaned)
    self._save_df_as_table(relation, refreshed)

incremental_insert ¶

incremental_insert(relation, select_sql)

INSERT INTO with cleaned SELECT body (format-aware via handler).

Source code in src/fastflowtransform/executors/databricks_spark.py

def incremental_insert(self, relation: str, select_sql: str) -> None:
    """
    Append the rows produced by ``select_sql`` to ``relation``.

    The SELECT body is cleaned (whitespace and trailing semicolons removed)
    and handed to the format handler, which performs the actual insert.
    """
    selectable = self._selectable_body(select_sql)
    cleaned = selectable.strip().rstrip(";\n\t ")
    self._format_handler.incremental_insert(relation, cleaned)

alter_table_sync_schema ¶

alter_table_sync_schema(relation, select_sql, *, mode='append_new_columns')

Best-effort additive schema sync:

- infer the SELECT's output schema via LIMIT 0
- add missing columns to the target table, using the Spark SQL type inferred for each column (STRING only as a fallback)

Source code in src/fastflowtransform/executors/databricks_spark.py

def alter_table_sync_schema(
    self, relation: str, select_sql: str, *, mode: str = "append_new_columns"
) -> None:
    """
    Best-effort additive schema sync for an existing table.

    Steps:
      - infer the SELECT's output schema by running it with LIMIT 0
      - add every output column that is missing from the target table,
        using the Spark SQL type reported for that column (STRING is only
        a fallback when the type cannot be rendered)

    Column names are compared the way the session resolves them:
    case-insensitively unless ``spark.sql.caseSensitive`` is ``true``, so a
    SELECT column ``ID`` is not re-added to a table that already has ``id``.

    The call is a no-op when ``mode`` is not supported, when the target
    table cannot be read, or when no columns are missing.

    Args:
        relation: Target table name.
        select_sql: Model SQL whose first SELECT body defines the new schema.
        mode: ``"append_new_columns"`` or ``"sync_all_columns"``; both only
            add columns here.
    """
    if mode not in {"append_new_columns", "sync_all_columns"}:
        return

    # Target schema; an unreadable or missing target means there is nothing to alter.
    try:
        physical = self._physical_identifier(relation)
        target_df = self.spark.table(physical)
    except Exception:
        return

    # Follow the session's identifier case sensitivity (Spark default: insensitive).
    try:
        flag = self.spark.conf.get("spark.sql.caseSensitive", "false")
        case_sensitive = str(flag).strip().lower() == "true"
    except Exception:
        case_sensitive = False

    def _key(name: str) -> str:
        """Normalize a column name for the existence comparison."""
        return name if case_sensitive else name.lower()

    def _quoted(name: str) -> str:
        """Backtick-quote a column name, doubling embedded backticks."""
        return "`" + name.replace("`", "``") + "`"

    def _spark_sql_type(dt: object) -> str:
        """Render a Spark data type as upper-cased SQL, defaulting to STRING."""
        render = getattr(dt, "simpleString", None)
        return str(render()).upper() if callable(render) else "STRING"

    existing = {_key(f.name) for f in target_df.schema.fields}

    # Output schema from the SELECT (LIMIT 0 so no rows are returned).
    body = self._first_select_body(select_sql).strip().rstrip(";\n\t ")
    probe = self._execute_sql_basic(f"SELECT * FROM ({body}) q LIMIT 0")
    to_add = [f for f in probe.schema.fields if _key(f.name) not in existing]
    if not to_add:
        return

    cols_sql = ", ".join(f"{_quoted(f.name)} {_spark_sql_type(f.dataType)}" for f in to_add)
    table_sql = self._sql_identifier(relation)
    self._execute_sql_basic(f"ALTER TABLE {table_sql} ADD COLUMNS ({cols_sql})")

execute_hook_sql ¶

execute_hook_sql(sql)

Entry point for hook SQL.

Accepts a string that may contain multiple ';'-separated statements. _RunEngine._execute_hook_sql has already normalized away semicolons in full-line comments, so naive splitting by ';' is acceptable here.

Source code in src/fastflowtransform/executors/databricks_spark.py

def execute_hook_sql(self, sql: str) -> None:
    """
    Run hook SQL that may bundle several statements.

    The input is split on every ';' and each non-empty, whitespace-trimmed
    piece is executed in order through ``_execute_sql``. The split is naive:
    it relies on `_RunEngine._execute_hook_sql` having already normalized
    away semicolons in full-line comments.
    """
    for fragment in sql.split(";"):
        statement = fragment.strip()
        if statement:
            # Single statements go through the regular executor path.
            self._execute_sql(statement)

utest_load_relation_from_rows ¶

utest_load_relation_from_rows(relation, rows)

Load rows into a Spark table for unit tests (replace if exists).

We go via pandas → Spark so schema is inferred from the Python data, then delegate to the same table-writing pipeline as the normal engine (_save_df_as_table), so table_format / storage options / catalogs are all respected.

Source code in src/fastflowtransform/executors/databricks_spark.py

def utest_load_relation_from_rows(self, relation: str, rows: list[dict]) -> None:
    """
    Write ``rows`` into ``relation`` as a Spark table for unit tests.

    The rows are first turned into a pandas DataFrame, which is converted to
    a Spark DataFrame, and then written through ``_save_df_as_table``, the
    same table-writing entry point the engine uses for models.

    NOTE(review): an earlier comment here claimed that Spark can infer the
    schema even for an empty pandas frame; confirm that behavior for
    ``rows == []`` before relying on it.
    """
    pandas_frame = pd.DataFrame(rows)
    spark_frame = self.spark.createDataFrame(pandas_frame)
    # Reuse the common save path instead of writing the table directly.
    self._save_df_as_table(relation, spark_frame)

utest_read_relation ¶

utest_read_relation(relation)

Read a relation as a pandas DataFrame for unit-test assertions.

The utest framework always compares on pandas, so we convert from Spark DataFrame here.

Source code in src/fastflowtransform/executors/databricks_spark.py

def utest_read_relation(self, relation: str) -> pd.DataFrame:
    """
    Load ``relation`` and hand it back as a pandas DataFrame.

    The relation name is resolved to its physical identifier, read as a
    Spark table, and converted with ``toPandas()`` so unit-test assertions
    can compare on pandas.
    """
    return self.spark.table(self._physical_identifier(relation)).toPandas()

utest_clean_target ¶

utest_clean_target(relation)

For unit tests: drop any view or table with this name.

We:

- try DROP VIEW IF EXISTS ...
- then try DROP TABLE IF EXISTS ...

Errors from either statement (for example a type mismatch) are ignored, so it doesn't matter whether a table or a view currently exists under that name.

Source code in src/fastflowtransform/executors/databricks_spark.py

def utest_clean_target(self, relation: str) -> None:
    """
    Remove whatever view or table currently occupies ``relation`` (unit tests).

    Two statements are issued in order, DROP VIEW IF EXISTS and then
    DROP TABLE IF EXISTS. Any exception from either one is swallowed, so
    the call succeeds regardless of whether the name refers to a view, a
    table, or nothing at all.
    """
    ident = self._sql_identifier(relation)
    drop_statements = (
        f"DROP VIEW IF EXISTS {ident}",
        f"DROP TABLE IF EXISTS {ident}",
    )
    for drop_sql in drop_statements:
        # A failing drop (wrong object type, missing object) must not stop the other one.
        with suppress(Exception):
            self._execute_sql_basic(drop_sql)

collect_docs_columns ¶

collect_docs_columns()

Collect column metadata via Spark catalog for docs rendering.

Source code in src/fastflowtransform/executors/databricks_spark.py

def collect_docs_columns(self) -> dict[str, list[ColumnInfo]]:
    """
    Gather column metadata for docs rendering from the Spark catalog.

    Every table returned by ``catalog.listTables()`` is inspected once per
    (database, name) pair. Its columns are registered under each alias that
    applies: the bare table name, ``database.table`` when a database is
    known, and ``catalog.database.table`` when both are known.

    Returns an empty dict when the tables cannot be listed. Tables whose
    columns cannot be listed, or that have no columns, are skipped.
    """
    try:
        listed = list(self.spark.catalog.listTables())
    except Exception:
        return {}

    def _fetch_columns(name: str, db: str | None) -> list[Any]:
        qualified = f"{db}.{name}" if db else name
        try:
            return list(self.spark.catalog.listColumns(qualified))
        except TypeError:
            # Retry with the (table, database) positional call form.
            return list(self.spark.catalog.listColumns(name, db))

    docs: dict[str, list[ColumnInfo]] = {}
    visited: set[tuple[str | None, str]] = set()

    for entry in listed:
        db = getattr(entry, "database", None)
        raw = getattr(entry, "name", None)
        if not raw:
            continue
        name = str(raw)

        identity = (db, name)
        if identity in visited:
            continue
        visited.add(identity)

        try:
            columns = _fetch_columns(name, db)
        except Exception:
            continue
        if not columns:
            continue

        # All names under which this table's columns are published.
        aliases: set[str] = {name}
        catalog_name = getattr(entry, "catalog", None)
        if db:
            aliases.add(f"{db}.{name}")
        if db and catalog_name:
            aliases.add(f"{catalog_name}.{db}.{name}")

        for col in columns:
            is_nullable = bool(getattr(col, "nullable", False))
            type_text = str(getattr(col, "dataType", ""))
            label = getattr(col, "name", None)
            if not label:
                continue
            described = ColumnInfo(str(label), type_text, is_nullable)
            for alias in aliases:
                docs.setdefault(alias, []).append(described)

    return docs

introspect_column_physical_type ¶

introspect_column_physical_type(table, column)

Spark: return Spark SQL type (simpleString) for one column, uppercased.

Source code in src/fastflowtransform/executors/databricks_spark.py

def introspect_column_physical_type(self, table: str, column: str) -> str | None:
    """
    Spark: return Spark SQL type (simpleString) for one column, uppercased.
    """
    rows = self._introspect_columns_metadata(table, column=column)
    return rows[0][1] if rows else None

introspect_table_physical_schema ¶

introspect_table_physical_schema(table)

Spark: return {lower(column_name): spark_sql_type} for all columns of table.

Source code in src/fastflowtransform/executors/databricks_spark.py

def introspect_table_physical_schema(self, table: str) -> dict[str, str]:
    """
    Build a ``{lowercased column name: type}`` mapping for every column of ``table``.

    The (name, type) pairs come from ``_introspect_columns_metadata`` called
    without a column filter. Keys are lowercased so callers can compare
    column names case-insensitively; type strings are passed through as-is.
    """
    schema: dict[str, str] = {}
    for col_name, col_type in self._introspect_columns_metadata(table, column=None):
        schema[col_name.lower()] = col_type
    return schema

configure_contracts ¶

configure_contracts(contracts, project_contracts)

Inject parsed contracts into this executor instance. The run engine should call this once at startup.

Source code in src/fastflowtransform/executors/base.py

def configure_contracts(
    self,
    contracts: Mapping[str, ContractsFileModel] | None,
    project_contracts: ProjectContractsModel | None,
) -> None:
    """
    Store the parsed contracts on this executor.

    ``contracts`` is kept as ``_ff_contracts`` (a falsy value is replaced by
    an empty dict) and ``project_contracts`` is kept unchanged as
    ``_ff_project_contracts``. Intended to be called once by the run engine
    at startup.
    """
    self._ff_project_contracts = project_contracts
    self._ff_contracts = contracts if contracts else {}

run_sql ¶

run_sql(node, env)

Orchestrate SQL models:

1. Render Jinja (ref/source/this) and strip leading {{ config(...) }}.
2. If the SQL is full DDL (CREATE …), execute it verbatim (passthrough).
3. Otherwise, normalize to CREATE OR REPLACE {TABLE|VIEW} AS <body>. The body is CTE-aware (keeps WITH … SELECT … intact).

On failure, raise ModelExecutionError with a helpful snippet.

Source code in src/fastflowtransform/executors/base.py

def run_sql(self, node: Node, env: Environment) -> None:
    """
    Orchestrate the execution of a SQL model.

    Flow:
      0) Incremental models are handed off to the incremental engine;
         snapshot models are rejected with a ModelExecutionError.
      1) Render Jinja (ref/source/this) and strip leading {{ config(...) }}.
      2) Ephemeral models stop here; nothing is materialized for them.
      3) If the SQL is full DDL (CREATE …), execute it verbatim (passthrough).
         An engine that raises NotImplementedError falls through to step 4.
      4) Otherwise, materialize the CTE-aware SELECT body (WITH … SELECT …
         is kept intact), either through the runtime-contracts hook (table
         materialization only) or through `_apply_sql_materialization`.

    Args:
        node: The model node to run; `node.name` and `node.meta` are read.
        env: Jinja environment used for rendering and ref resolution.

    Raises:
        ModelExecutionError: for snapshot models, and for any exception
            raised by the direct-DDL or materialization paths (the original
            exception is chained and a SQL snippet is attached).
    """
    meta = getattr(node, "meta", {}) or {}
    if self._meta_is_incremental(meta):
        # Delegates to incremental engine: render, schema sync, merge/insert, etc.
        return _ff_incremental.run_or_dispatch(self, node, env)

    if self._meta_is_snapshot(meta):
        # Snapshots are executed via the dedicated CLI: `fft snapshot run`.
        raise ModelExecutionError(
            node_name=node.name,
            relation=relation_for(node.name),
            message=(
                "Snapshot models cannot be executed via 'fft run'. "
                "Use 'fft snapshot run' instead."
            ),
            sql_snippet="",
        )

    # Render the template; refs are resolved against `env`, sources via the executor.
    sql_rendered = self.render_sql(
        node,
        env,
        ref_resolver=lambda name: self._resolve_ref(name, env),
        source_resolver=self._resolve_source,
    )
    sql = self._strip_leading_config(sql_rendered).strip()

    # Defaults to "table" when the model does not declare a materialization.
    materialization = (node.meta or {}).get("materialized", "table")
    if materialization == "ephemeral":
        # Rendering above still ran; only the materialization is skipped.
        return

    # 1) Direct DDL passthrough (CREATE [OR REPLACE] {TABLE|VIEW} …)
    if self._looks_like_direct_ddl(sql):
        try:
            self._execute_sql_direct(sql, node)
            return
        except NotImplementedError:
            # Engine doesn't implement direct DDL → fall back to normalized materialization.
            pass
        except Exception as e:
            # Any other failure is reported with the full DDL as the snippet.
            raise ModelExecutionError(
                node_name=node.name,
                relation=relation_for(node.name),
                message=str(e),
                sql_snippet=sql,
            ) from e

    # 2) Normalized materialization path (CTE-safe body)
    body = self._selectable_body(sql).rstrip(" ;\n\t")
    target_sql = self._format_relation_for_ref(node.name)

    # Centralized SQL preview logging (applies to ALL engines)
    preview = (
        f"=== MATERIALIZE ===\n"
        f"-- model: {node.name}\n"
        f"-- materialized: {materialization}\n"
        f"-- target: {target_sql}\n"
        f"{body}\n"
    )
    echo_debug(preview)

    try:
        # Executors without a `runtime_contracts` attribute use the plain path below.
        runtime = getattr(self, "runtime_contracts", None)
        # contracts only for TABLE materialization for now
        if runtime is not None and materialization == "table":
            contracts = getattr(self, "_ff_contracts", {}) or {}
            project_contracts = getattr(self, "_ff_project_contracts", None)

            # keying: prefer the logical table name (contracts.table),
            # but node.name or relation_for(node.name) is usually what you want.
            logical_name = relation_for(node.name)
            contract = contracts.get(logical_name) or contracts.get(node.name)

            # NOTE(review): incremental models returned at the top of this method,
            # so `is_incremental` is presumably always False here; confirm.
            ctx = runtime.build_context(
                node=node,
                relation=logical_name,
                physical_table=target_sql,
                contract=contract,
                project_contracts=project_contracts,
                is_incremental=self._meta_is_incremental(meta),
            )
            # Engine-specific enforcement (verify/cast/off)
            runtime.apply_sql_contracts(ctx=ctx, select_body=body)
        else:
            # Old behavior
            self._apply_sql_materialization(node, target_sql, body, materialization)
    except Exception as e:
        # Shorter preview (no banner) attached to the error for diagnostics.
        preview = f"-- materialized={materialization}\n-- target={target_sql}\n{body}"
        raise ModelExecutionError(
            node_name=node.name,
            relation=relation_for(node.name),
            message=str(e),
            sql_snippet=preview,
        ) from e

configure_query_budget_limit ¶

configure_query_budget_limit(limit)

Inject a configured per-query byte limit (e.g. from budgets.yml).

Source code in src/fastflowtransform/executors/base.py

def configure_query_budget_limit(self, limit: int | None) -> None:
    """
    Inject a configured per-query byte limit (e.g. from budgets.yml).
    """
    if limit is None:
        self._ff_configured_query_limit = None
        return
    try:
        iv = int(limit)
    except Exception:
        self._ff_configured_query_limit = None
        return
    self._ff_configured_query_limit = iv if iv > 0 else None

reset_node_stats ¶

reset_node_stats()

Reset per-node statistics buffer.

The run engine calls this before executing a model so that all stats recorded via _record_query_stats(...) belong to that node.

Source code in src/fastflowtransform/executors/base.py

def reset_node_stats(self) -> None:
    """
    Start a fresh per-node statistics buffer.

    Called by the run engine before a model executes, so that everything
    recorded afterwards via `_record_query_stats(...)` is attributed to
    that node.
    """
    # Replace the buffer with a new empty list rather than mutating the old one.
    self._ff_query_stats_buffer = list()

get_node_stats ¶

get_node_stats()

Aggregate buffered QueryStats into a simple dict:

{
  "bytes_scanned": <sum>,
  "rows": <sum>,
  "query_duration_ms": <sum>,
}

Called by the run engine after a node finishes.

Source code in src/fastflowtransform/executors/base.py

def get_node_stats(self) -> dict[str, int]:
    """
    Drain the buffered query stats and sum them into one dict.

    Result shape (empty dict when nothing was buffered):

        {
          "bytes_scanned": <sum of bytes_processed>,
          "rows": <sum of rows>,
          "query_duration_ms": <sum of duration_ms>,
        }

    ``None`` values are skipped; every other value is converted with
    ``int()`` before it is added. Called by the run engine after a node
    finishes.
    """
    drained = self._drain_query_stats()
    if not drained:
        return {}

    # (output key, attribute on the stats record), in output order.
    fields = (
        ("bytes_scanned", "bytes_processed"),
        ("rows", "rows"),
        ("query_duration_ms", "duration_ms"),
    )
    totals = {out_key: 0 for out_key, _ in fields}

    for record in drained:
        for out_key, attr in fields:
            value = getattr(record, attr)
            if value is not None:
                totals[out_key] += int(value)

    return totals

run_python ¶

run_python(node)

Execute the Python model for a given node and materialize its result.

Source code in src/fastflowtransform/executors/base.py

def run_python(self, node: Node) -> None:
    """
    Execute the Python model for a given node and materialize its result.

    Steps:
      1) Look up the registered function and its dependencies, build the
         inputs and, when the node has dependencies, validate required columns.
      2) Call the model function.
      3) If a `runtime_contracts` object is present and a contract or project
         contracts apply, build a contract context, optionally coerce the
         output frame, and let the runtime take over table materialization.
      4) Unless the runtime took over, materialize as incremental, view or
         plain relation depending on the resolved strategy.
      5) When a contract context was built, verify after materialization.

    The HTTP context is reset before the inputs are built and snapshotted
    at the very end.

    Args:
        node: The model node to run; `node.name` keys the registry lookups.

    Raises:
        KeyError: if `node.name` is not present in the registry mappings
            indexed at the top of this method.
    """
    func = REGISTRY.py_funcs[node.name]
    deps = REGISTRY.nodes[node.name].deps or []

    self._reset_http_ctx(node)

    args, argmap = self._build_python_inputs(node, deps)
    requires = REGISTRY.py_requires.get(node.name, {})
    if deps:
        # Required-columns check works against the mapping
        self._validate_required(node.name, argmap, requires)

    # The model function receives the built inputs (`args`).
    out = self._execute_python_func(func, args, node)

    target = relation_for(node.name)
    meta = getattr(node, "meta", {}) or {}
    mat = self._resolve_materialization_strategy(meta)

    # ---------- Runtime contracts for Python models ----------
    runtime = getattr(self, "runtime_contracts", None)
    ctx = None
    # Set to True when the runtime has materialized the result itself.
    took_over = False

    if runtime is not None:
        contracts = getattr(self, "_ff_contracts", {}) or {}
        project_contracts = getattr(self, "_ff_project_contracts", None)

        logical = target  # usually relation_for(node.name)
        contract = contracts.get(logical) or contracts.get(node.name)

        # A context is only built when some contract configuration applies.
        if contract is not None or project_contracts is not None:
            physical_table = self._format_relation_for_ref(node.name)
            ctx = runtime.build_context(
                node=node,
                relation=logical,
                physical_table=physical_table,
                contract=contract,
                project_contracts=project_contracts,
                is_incremental=(mat == "incremental"),
            )

            # Optional pre-coercion (default is no-op).
            if hasattr(runtime, "coerce_frame_schema"):
                out = runtime.coerce_frame_schema(out, ctx)

            # Allow engine-specific runtime to take over Python materialization
            if mat == "table" and hasattr(runtime, "materialize_python"):
                took_over = bool(runtime.materialize_python(ctx=ctx, df=out))

    # ---------- Materialization ----------
    if not took_over:
        if mat == "incremental":
            self._materialize_incremental(target, out, node, meta)
        elif mat == "view":
            self._materialize_view(target, out, node)
        else:
            # Any other strategy falls back to a plain relation.
            self._materialize_relation(target, out, node)

    # Post-materialization verification runs only when a context was built above.
    if ctx is not None and runtime is not None:
        runtime.verify_after_materialization(ctx=ctx)

    self._snapshot_http_ctx(node)

normalize_physical_type ¶

normalize_physical_type(t)

Canonicalize a physical type string for comparisons (DQ + contracts).

Default: just strip + lower. Engines may override to account for dialect quirks in information_schema (e.g. Postgres timestamp variants, Snowflake VARCHAR(…) / NUMBER(…)).

Source code in src/fastflowtransform/executors/base.py

def normalize_physical_type(self, t: str | None) -> str:
    """
    Canonicalize a physical type string for comparisons (DQ + contracts).

    Default: just strip + lower.
    Engines may override to account for dialect quirks in information_schema
    (e.g. Postgres timestamp variants, Snowflake VARCHAR(…) / NUMBER(…)).
    """
    return (t or "").strip().lower()