fastflowtransform.executors.snowflake_snowpark¶

SnowflakeSnowparkExecutor ¶

Bases: SqlIdentifierMixin, BaseExecutor[SNDF]

Source code in src/fastflowtransform/executors/snowflake_snowpark.py

class SnowflakeSnowparkExecutor(SqlIdentifierMixin, BaseExecutor[SNDF]):
    ENGINE_NAME: str = "snowflake_snowpark"
    runtime_contracts: SnowflakeSnowparkRuntimeContracts
    runtime_query_stats: SnowflakeSnowparkQueryStatsRuntime
    runtime_budget: SnowflakeSnowparkBudgetRuntime
    snapshot_runtime: SnowflakeSnowparkSnapshotRuntime
    """Snowflake executor operating on Snowpark DataFrames (no pandas)."""

    def __init__(self, cfg: dict):
        # cfg: {account, user, password, warehouse, database, schema, role?}
        self.session = Session.builder.configs(cfg).create()
        self.database = cfg["database"]
        self.schema = cfg["schema"]

        self.allow_create_schema: bool = bool(cfg["allow_create_schema"])
        self._ensure_schema()
        self.runtime_query_stats = SnowflakeSnowparkQueryStatsRuntime(self)
        self.runtime_budget = SnowflakeSnowparkBudgetRuntime(self)
        self.runtime_contracts = SnowflakeSnowparkRuntimeContracts(self)
        self.snapshot_runtime = SnowflakeSnowparkSnapshotRuntime(self)

    def execute_test_sql(self, stmt: Any) -> Any:
        """
        Execute lightweight SQL for DQ tests via Snowpark and return fetchable rows.
        """

        def _run_one(s: Any) -> Any:
            if isinstance(s, str):
                return rows_to_tuples(self._execute_sql(s).collect())
            if isinstance(s, Iterable) and not isinstance(s, (bytes, bytearray, str)):
                res = None
                for item in s:
                    res = _run_one(item)
                return res
            return rows_to_tuples(self._execute_sql(str(s)).collect())

        return make_fetchable(_run_one(stmt))

    def compute_freshness_delay_minutes(self, table: str, ts_col: str) -> tuple[float | None, str]:
        sql = (
            f"select DATEDIFF('minute', max({ts_col}), CURRENT_TIMESTAMP())::float as delay_min "
            f"from {table}"
        )
        res = self.execute_test_sql(sql)
        row = getattr(res, "fetchone", lambda: None)()
        val = row[0] if row else None
        return (float(val) if val is not None else None, sql)

    def _execute_sql_basic(self, sql: str) -> SNDF:
        return self.session.sql(sql)

    def _execute_sql(self, sql: str) -> SNDF:
        """
        Central Snowflake SQL runner.

        - Returns a Snowpark DataFrame (same as session.sql).
        - Records best-effort query stats for run_results.json.
        """

        def _exec() -> SNDF:
            return self.session.sql(sql)

        return self.runtime_budget.run_sql(
            sql,
            exec_fn=_exec,
            stats_runtime=self.runtime_query_stats,
        )

    def _exec_many(self, sql: str) -> None:
        """
        Execute multiple SQL statements separated by ';' on the same connection.
        Snowflake normally accepts one statement per execute(), so we split here.
        """
        for stmt in (part.strip() for part in sql.split(";")):
            if not stmt:
                continue
            self._execute_sql(stmt).collect()

    # ---------- Helpers ----------
    def _quote_identifier(self, ident: str) -> str:
        # Keep identifiers unquoted to match legacy Snowflake behaviour.
        return ident

    def _default_schema(self) -> str | None:
        return self.schema

    def _default_catalog(self) -> str | None:
        return self.database

    def _should_include_catalog(
        self, catalog: str | None, schema: str | None, *, explicit: bool
    ) -> bool:
        # Always include database when present; Snowflake expects DB.SCHEMA.TABLE.
        return bool(catalog)

    def _qualified(self, relation: str, *, quoted: bool = False) -> str:
        # DATABASE.SCHEMA.TABLE  (no quotes)
        return self._format_identifier(relation, purpose="physical", quote=quoted)

    def _ensure_schema(self) -> None:
        """
        Best-effort schema creation when allow_create_schema=True.

        Mirrors BigQuery's `_ensure_dataset` behaviour:
        - If the flag is false → do nothing.
        - If true → `CREATE SCHEMA IF NOT EXISTS "DB"."SCHEMA"`.
        """
        if not getattr(self, "allow_create_schema", False):
            return
        if not self.database or not self.schema:
            # Misconfigured; let downstream errors surface naturally.
            return

        db = _q_ident(self.database)
        sch = _q_ident(self.schema)
        with suppress(Exception):
            # Fully qualified CREATE SCHEMA is allowed in Snowflake.
            self.session.sql(f"CREATE SCHEMA IF NOT EXISTS {db}.{sch}").collect()
            # Best-effort; permission issues or race conditions shouldn't crash the executor.
            # If the schema truly doesn't exist and we can't create it, later queries will fail
            # with a clearer engine error.

    # ---------- Frame-Hooks ----------
    def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> SNDF:
        df = self.session.table(self._qualified(relation))
        # Present a *logical* lowercase schema to Python models:
        lowered = [c.lower() for c in df.schema.names]
        return df.toDF(*lowered)

    def _materialize_relation(self, relation: str, df: SNDF, node: Node) -> None:
        if not self._is_frame(df):
            raise TypeError("Snowpark model must return a Snowpark DataFrame")

        # Normalize to uppercase for storage in Snowflake
        cols = list(df.schema.names)
        upper_cols = [c.upper() for c in cols]
        if cols != upper_cols:
            df = df.toDF(*upper_cols)

        start = perf_counter()
        df.write.save_as_table(self._qualified(relation), mode="overwrite")
        duration_ms = int((perf_counter() - start) * 1000)
        self.runtime_query_stats.record_dataframe(df, duration_ms)

    def _create_view_over_table(self, view_name: str, backing_table: str, node: Node) -> None:
        qv = self._qualified(view_name)
        qb = self._qualified(backing_table)
        self._execute_sql_basic(f"CREATE OR REPLACE VIEW {qv} AS SELECT * FROM {qb}").collect()

    def _validate_required(
        self, node_name: str, inputs: Any, requires: dict[str, set[str]]
    ) -> None:
        if not requires:
            return

        def cols(df: SNDF) -> set[str]:
            # Compare in lowercase to be case-insensitive for Snowflake
            return {c.lower() for c in df.schema.names}

        # Normalize the required sets too
        normalized_requires = {rel: {c.lower() for c in needed} for rel, needed in requires.items()}

        errors: list[str] = []

        if isinstance(inputs, SNDF):
            need = next(iter(normalized_requires.values()), set())
            missing = need - cols(inputs)
            if missing:
                errors.append(f"- missing columns: {sorted(missing)} | have={sorted(cols(inputs))}")
        else:
            for rel, need in normalized_requires.items():
                if rel not in inputs:
                    errors.append(f"- missing dependency key '{rel}'")
                    continue
                missing = need - cols(inputs[rel])
                if missing:
                    errors.append(
                        f"- [{rel}] missing: {sorted(missing)} | have={sorted(cols(inputs[rel]))}"
                    )

        if errors:
            raise ValueError(
                "Required columns check failed for Snowpark model "
                f"'{node_name}'.\n" + "\n".join(errors)
            )

    def _columns_of(self, frame: SNDF) -> list[str]:
        return list(frame.schema.names)

    def _is_frame(self, obj: Any) -> bool:
        # Accept real Snowpark DataFrames and test doubles with a compatible surface.
        schema = getattr(obj, "schema", None)
        return isinstance(obj, SNDF) or (
            schema is not None
            and hasattr(schema, "names")
            and callable(getattr(obj, "collect", None))
        )

    def _frame_name(self) -> str:
        return "Snowpark"

    def load_seed(
        self, table: str, df: pd.DataFrame, schema: str | None = None
    ) -> tuple[bool, str, bool]:
        """
        Materialize a pandas seed into Snowflake via Snowpark.

        - Qualifies with database + schema (defaults to configured schema).
        - Best-effort schema creation when allow_create_schema is enabled.
        - Normalizes columns to uppercase before writing.
        """
        db_part, schema_part, table_part = self._normalize_table_parts_for_introspection(table)

        if schema:
            schema_part = schema.strip().strip('`"') or schema_part

        target_db = db_part or self.database
        target_schema = schema_part or self.schema

        created_schema = False
        if target_db and target_schema and getattr(self, "allow_create_schema", False):
            db_ident = _q_ident(target_db)
            schema_ident = _q_ident(target_schema)
            try:
                self.session.sql(f"CREATE SCHEMA IF NOT EXISTS {db_ident}.{schema_ident}").collect()
                created_schema = True
            except Exception:
                # Best-effort; let the write fail later if schema truly missing.
                pass

        qualified = self._format_identifier(
            table_part,
            purpose="seed",
            schema=target_schema,
            catalog=target_db,
            quote=False,
        )

        snow_df = self.session.create_dataframe(df.reset_index(drop=True))
        cols = list(snow_df.schema.names)
        upper_cols = [c.upper() for c in cols]
        if cols != upper_cols:
            snow_df = snow_df.toDF(*upper_cols)

        snow_df.write.save_as_table(qualified, mode="overwrite")

        return True, qualified, created_schema

    # ---- SQL hooks ----
    def _this_identifier(self, node: Node) -> str:
        """
        Identifier for {{ this }} in SQL models.
        Use fully-qualified DB.SCHEMA.TABLE so all build/read/test paths agree.
        """
        return self._format_identifier(relation_for(node.name), purpose="this", quote=False)

    def _format_source_reference(
        self, cfg: dict[str, Any], source_name: str, table_name: str
    ) -> str:
        ident = cfg.get("identifier")
        if not ident:
            raise KeyError(f"Source {source_name}.{table_name} missing identifier")
        formatted = self._format_identifier(
            ident,
            purpose="source",
            source_cfg=cfg,
            source_name=source_name,
            table_name=table_name,
            quote=False,
        )
        # Ensure we resolved to DB.SCHEMA.TABLE; Snowflake needs both parts.
        if "." not in formatted:
            raise KeyError(
                f"Source {source_name}.{table_name} missing database/schema for Snowflake"
            )
        return formatted

    def _create_or_replace_view(self, target_sql: str, select_body: str, node: Node) -> None:
        self._execute_sql_basic(f"CREATE OR REPLACE VIEW {target_sql} AS {select_body}").collect()

    def _create_or_replace_table(self, target_sql: str, select_body: str, node: Node) -> None:
        self._execute_sql(f"CREATE OR REPLACE TABLE {target_sql} AS {select_body}").collect()

    def _create_or_replace_view_from_table(
        self, view_name: str, backing_table: str, node: Node
    ) -> None:
        view_id = self._qualified(view_name)
        back_id = self._qualified(backing_table)
        self._execute_sql_basic(
            f"CREATE OR REPLACE VIEW {view_id} AS SELECT * FROM {back_id}"
        ).collect()

    def _format_test_table(self, table: str | None) -> str | None:
        # Bypass mixin qualification to avoid double-qualifying already dotted names.
        formatted = BaseExecutor._format_test_table(self, table)
        if formatted is None:
            return None

        # If it's already qualified (DB.SCHEMA.TABLE) or quoted, leave it alone.
        if "." in formatted or '"' in formatted:
            return formatted

        # Otherwise, treat it as a logical relation name and fully-qualify it
        # with the executor's configured database/schema.
        return self._format_identifier(formatted, purpose="test", quote=False)

    # ---- Meta hook ----
    def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None:
        """After successful materialization, upsert _ff_meta (best-effort)."""
        ensure_meta_table(self)
        upsert_meta(self, node.name, relation, fingerprint, "snowflake_snowpark")

    # ── Incremental API (parity with DuckDB/PG) ──────────────────────────
    def exists_relation(self, relation: str) -> bool:
        """Check existence via information_schema.tables."""
        db = _q_ident(self.database)
        schema_lit = f"'{self.schema.upper()}'"
        rel_lit = f"'{relation.upper()}'"
        q = f"""
        select 1
        from {db}.information_schema.tables
        where upper(table_schema) = {schema_lit}
            and upper(table_name) = {rel_lit}
        limit 1
        """
        try:
            return bool(self._execute_sql_basic(q).collect())
        except Exception:
            return False

    def create_table_as(self, relation: str, select_sql: str) -> None:
        body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
        self._execute_sql(
            f"CREATE OR REPLACE TABLE {self._qualified(relation)} AS {body}"
        ).collect()

    def full_refresh_table(self, relation: str, select_sql: str) -> None:
        """
        Engine-specific full refresh for incremental fallbacks.
        """
        body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
        self._execute_sql(
            f"CREATE OR REPLACE TABLE {self._qualified(relation)} AS {body}"
        ).collect()

    def incremental_insert(self, relation: str, select_sql: str) -> None:
        body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
        self._execute_sql(f"INSERT INTO {self._qualified(relation)} {body}").collect()

    def incremental_merge(self, relation: str, select_sql: str, unique_key: list[str]) -> None:
        body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
        pred = " AND ".join([f"t.{k}=s.{k}" for k in unique_key]) or "FALSE"
        qrel = self._qualified(relation)

        # 1) Delete matching keys
        delete_sql = f"""
        DELETE FROM {qrel} AS t
        USING ({body}) AS s
        WHERE {pred}
        """
        self._execute_sql(delete_sql).collect()

        # 2) Insert all rows from the delta
        insert_sql = f"INSERT INTO {qrel} SELECT * FROM ({body})"
        self._execute_sql(insert_sql).collect()

    def alter_table_sync_schema(
        self, relation: str, select_sql: str, *, mode: str = "append_new_columns"
    ) -> None:
        """
        Best-effort additive schema sync:
        - infer SELECT schema via LIMIT 0
        - add missing columns as STRING
        """
        if mode not in {"append_new_columns", "sync_all_columns"}:
            return

        qrel = self._qualified(relation)

        # Use identifiers in FROM, but *string literals* in WHERE
        db_ident = _q_ident(self.database)
        schema_lit = self.schema.replace("'", "''")
        rel_lit = relation.replace("'", "''")

        try:
            existing = {
                r[0]
                for r in self._execute_sql_basic(
                    f"""
                    select column_name
                    from {db_ident}.information_schema.columns
                    where upper(table_schema) = upper('{schema_lit}')
                    and upper(table_name)   = upper('{rel_lit}')
                    """
                ).collect()
            }
        except Exception:
            existing = set()

        # Probe SELECT columns
        body = self._first_select_body(select_sql).strip().rstrip(";\n\t ")
        probe = self.session.sql(f"SELECT * FROM ({body}) q WHERE 1=0")
        probe_cols = list(probe.schema.names)

        to_add = [c for c in probe_cols if c not in existing]
        if not to_add:
            return

        # Column names are identifiers → _q is correct here
        cols_sql = ", ".join(f"{_q_ident(c)} STRING" for c in to_add)
        self._execute_sql_basic(f"ALTER TABLE {qrel} ADD COLUMN {cols_sql}").collect()

    # ---- Snapshot runtime delegation --------------------------------------
    def run_snapshot_sql(self, node: Node, env: Any) -> None:
        self.snapshot_runtime.run_snapshot_sql(node, env)

    def snapshot_prune(
        self,
        relation: str,
        unique_key: list[str],
        keep_last: int,
        *,
        dry_run: bool = False,
    ) -> None:
        self.snapshot_runtime.snapshot_prune(
            relation,
            unique_key,
            keep_last,
            dry_run=dry_run,
        )

    def execute_hook_sql(self, sql: str) -> None:
        """
        Execute one SQL statement for pre/post/on_run hooks.
        """
        self._exec_many(sql)

    # ---- Unit-test helpers -----------------------------------------------

    def utest_read_relation(self, relation: str) -> pd.DataFrame:
        """
        Read a relation into a pandas DataFrame for unit-test assertions.

        We use Snowpark to read the table and convert to pandas,
        normalizing column names to lowercase to match _read_relation.
        """
        df = self.session.table(self._qualified(relation))
        # Mirror _read_relation: present lowercase schema to the test layer
        lowered = [c.lower() for c in df.schema.names]
        df = df.toDF(*lowered)

        to_pandas = getattr(df, "to_pandas", None)

        pdf: pd.DataFrame
        if callable(to_pandas):
            pdf = cast(pd.DataFrame, to_pandas())
        else:
            rows = df.collect()
            records = [r.asDict() for r in rows]
            pdf = pd.DataFrame.from_records(records)

        # Return a new DF with lowercase columns (no attribute assignment)
        return pdf.rename(columns=lambda c: str(c).lower())

    def utest_load_relation_from_rows(self, relation: str, rows: list[dict]) -> None:
        """
        Load rows into a Snowflake table for unit tests (replace if exists).

        We build a Snowpark DataFrame from the Python rows and overwrite the
        target table using save_as_table().
        """
        # Best-effort: if rows are empty, create an empty table with no rows.
        # We assume at least one row in normal test usage so we can infer schema.
        if not rows:
            # Without any rows we don't know the schema; create a trivial
            # single-column table to surface the situation clearly.
            tmp_df = self.session.create_dataframe([[None]], schema=["__empty__"])
            tmp_df.write.save_as_table(self._qualified(relation), mode="overwrite")
            return

        # Infer column order from the first row
        first = rows[0]
        columns = list(first.keys())

        # Normalize data to a list of lists in a fixed column order
        data = [[row.get(col) for col in columns] for row in rows]

        df = self.session.create_dataframe(data, schema=columns)

        # Store with uppercase column names in Snowflake (conventional)
        upper_cols = [c.upper() for c in columns]
        if columns != upper_cols:
            df = df.toDF(*upper_cols)

        # Overwrite the target table
        df.write.save_as_table(self._qualified(relation), mode="overwrite")

    def utest_clean_target(self, relation: str) -> None:
        """
        For unit tests: drop any table or view with this name in the configured
        database/schema.

        We:
          - try DROP VIEW IF EXISTS DB.SCHEMA.REL
          - try DROP TABLE IF EXISTS DB.SCHEMA.REL

        and ignore "not a view/table" style errors so it doesn't matter what
        kind of object is currently there - after this, nothing with that name
        should remain (best-effort).
        """
        qualified = self._qualified(relation)

        # Drop view first; ignore errors if it's actually a table or doesn't exist.
        with suppress(Exception):
            self.session.sql(f"DROP VIEW IF EXISTS {qualified}").collect()

        # Then drop table; ignore errors if it's actually a view or doesn't exist.
        with suppress(Exception):
            self.session.sql(f"DROP TABLE IF EXISTS {qualified}").collect()

    def collect_docs_columns(self) -> dict[str, list[ColumnInfo]]:
        """
        Best-effort column metadata for docs (scoped to configured DB/schema).
        """
        schema_pred = (
            f"lower(table_schema) = '{self.schema.lower()}'"
            if self.schema
            else "table_schema = current_schema()"
        )
        catalog_pred = (
            f" AND lower(table_catalog) = '{self.database.lower()}'" if self.database else ""
        )
        sql = f"""
        select table_name, column_name, data_type, is_nullable
        from information_schema.columns
        where {schema_pred}{catalog_pred}
        order by table_schema, table_name, ordinal_position
        """
        try:
            rows = self.session.sql(sql).collect()
        except Exception:
            return {}

        out: dict[str, list[ColumnInfo]] = {}
        for r in rows:
            table = r["TABLE_NAME"]
            col = r["COLUMN_NAME"]
            dtype = r["DATA_TYPE"]
            nullable = r["IS_NULLABLE"]
            out.setdefault(table, []).append(
                ColumnInfo(col, str(dtype), str(nullable).upper() == "YES")
            )
        return out

    def _normalize_table_parts_for_introspection(self, table: str) -> tuple[str, str, str]:
        """
        Return (database, schema, table_name) for a possibly qualified identifier.

        Accepts:
          - TABLE
          - SCHEMA.TABLE
          - DATABASE.SCHEMA.TABLE

        Quotes/backticks are stripped best-effort; names are returned as raw strings.
        """
        raw = (table or "").strip()
        raw = raw.replace('"', "").replace("`", "")
        parts = [p for p in raw.split(".") if p]

        if len(parts) >= 3:
            db, sch, tbl = parts[-3], parts[-2], parts[-1]
            return db, sch, tbl
        if len(parts) == 2:
            sch, tbl = parts[0], parts[1]
            return self.database, sch, tbl
        return self.database, self.schema, parts[0] if parts else raw

    def _compose_snowflake_type(
        self,
        data_type: Any,
        char_max: Any,
        num_precision: Any,
        num_scale: Any,
    ) -> str:
        dt = str(data_type).strip().upper()

        def _to_int(v: Any) -> int | None:
            try:
                if v is None:
                    return None
                return int(v)
            except Exception:
                return None

        SF_VARCHAR_MAX = 16777216

        if dt in {"NUMBER", "DECIMAL", "NUMERIC"}:
            p = _to_int(num_precision)
            s = _to_int(num_scale)
            if s is None:
                s = 0

            # Treat integer NUMBERs as base NUMBER so contracts can just say "NUMBER"
            if s == 0:
                return "NUMBER"

            # Only surface precision/scale for non-integer decimals
            if p is None:
                return "NUMBER"
            return f"NUMBER({p},{s})"

        if dt in {"VARCHAR", "CHAR", "CHARACTER", "STRING", "TEXT"}:
            n = _to_int(char_max)
            if n is None or n <= 0 or n >= SF_VARCHAR_MAX:
                return "VARCHAR"
            return f"VARCHAR({n})"

        return dt

    def introspect_table_physical_schema(self, table: str) -> dict[str, str]:
        """
        Snowflake: return {lower(column_name): type_string} for all columns.

        Uses <db>.information_schema.columns and composes NUMBER(p,s) / VARCHAR(n)
        when metadata is present.
        """
        db, sch, tbl = self._normalize_table_parts_for_introspection(table)

        db_ident = _q_ident(db)
        schema_lit = sch.replace("'", "''").upper()
        table_lit = tbl.replace("'", "''").upper()

        sql = f"""
        select
            column_name,
            data_type,
            character_maximum_length,
            numeric_precision,
            numeric_scale
        from {db_ident}.information_schema.columns
        where upper(table_schema) = '{schema_lit}'
          and upper(table_name)   = '{table_lit}'
        order by ordinal_position
        """

        rows = self._execute_sql_basic(sql).collect()
        out: dict[str, str] = {}

        for r in rows or []:
            col_name = str(r[0]) if r and r[0] is not None else None
            if not col_name:
                continue
            typ = self._compose_snowflake_type(r[1], r[2], r[3], r[4])
            out[col_name.lower()] = typ

        return out

    def introspect_column_physical_type(self, table: str, column: str) -> str | None:
        """
        Snowflake: read column type from information_schema.columns and return a composed
        type string (e.g. NUMBER(38,0), VARCHAR(16777216), TIMESTAMP_NTZ).
        """
        db, sch, tbl = self._normalize_table_parts_for_introspection(table)

        db_ident = _q_ident(db)
        schema_lit = sch.replace("'", "''").upper()
        table_lit = tbl.replace("'", "''").upper()
        col_lit = (column or "").replace("'", "''").upper()

        sql = f"""
        select
            data_type,
            character_maximum_length,
            numeric_precision,
            numeric_scale
        from {db_ident}.information_schema.columns
        where upper(table_schema) = '{schema_lit}'
          and upper(table_name)   = '{table_lit}'
          and upper(column_name)  = '{col_lit}'
        limit 1
        """

        rows = self._execute_sql_basic(sql).collect()
        if not rows:
            return None
        r = rows[0]
        return self._compose_snowflake_type(r[0], r[1], r[2], r[3])

snapshot_runtime `instance-attribute` ¶

snapshot_runtime = SnowflakeSnowparkSnapshotRuntime(self)

Snowflake executor operating on Snowpark DataFrames (no pandas).

execute_test_sql ¶

execute_test_sql(stmt)

Execute lightweight SQL for DQ tests via Snowpark and return fetchable rows.

Source code in src/fastflowtransform/executors/snowflake_snowpark.py

def execute_test_sql(self, stmt: Any) -> Any:
    """
    Execute lightweight SQL for DQ tests via Snowpark and return fetchable rows.
    """

    def _run_one(s: Any) -> Any:
        if isinstance(s, str):
            return rows_to_tuples(self._execute_sql(s).collect())
        if isinstance(s, Iterable) and not isinstance(s, (bytes, bytearray, str)):
            res = None
            for item in s:
                res = _run_one(item)
            return res
        return rows_to_tuples(self._execute_sql(str(s)).collect())

    return make_fetchable(_run_one(stmt))

load_seed ¶

load_seed(table, df, schema=None)

Materialize a pandas seed into Snowflake via Snowpark.

Qualifies with database + schema (defaults to configured schema).
Best-effort schema creation when allow_create_schema is enabled.
Normalizes columns to uppercase before writing.

Source code in src/fastflowtransform/executors/snowflake_snowpark.py

def load_seed(
    self, table: str, df: pd.DataFrame, schema: str | None = None
) -> tuple[bool, str, bool]:
    """
    Materialize a pandas seed into Snowflake via Snowpark.

    - Qualifies with database + schema (defaults to configured schema).
    - Best-effort schema creation when allow_create_schema is enabled.
    - Normalizes columns to uppercase before writing.
    """
    db_part, schema_part, table_part = self._normalize_table_parts_for_introspection(table)

    if schema:
        schema_part = schema.strip().strip('`"') or schema_part

    target_db = db_part or self.database
    target_schema = schema_part or self.schema

    created_schema = False
    if target_db and target_schema and getattr(self, "allow_create_schema", False):
        db_ident = _q_ident(target_db)
        schema_ident = _q_ident(target_schema)
        try:
            self.session.sql(f"CREATE SCHEMA IF NOT EXISTS {db_ident}.{schema_ident}").collect()
            created_schema = True
        except Exception:
            # Best-effort; let the write fail later if schema truly missing.
            pass

    qualified = self._format_identifier(
        table_part,
        purpose="seed",
        schema=target_schema,
        catalog=target_db,
        quote=False,
    )

    snow_df = self.session.create_dataframe(df.reset_index(drop=True))
    cols = list(snow_df.schema.names)
    upper_cols = [c.upper() for c in cols]
    if cols != upper_cols:
        snow_df = snow_df.toDF(*upper_cols)

    snow_df.write.save_as_table(qualified, mode="overwrite")

    return True, qualified, created_schema

on_node_built ¶

on_node_built(node, relation, fingerprint)

After successful materialization, upsert _ff_meta (best-effort).

Source code in src/fastflowtransform/executors/snowflake_snowpark.py

def on_node_built(self, node: Node, relation: str, fingerprint: str) -> None:
    """After successful materialization, upsert _ff_meta (best-effort)."""
    ensure_meta_table(self)
    upsert_meta(self, node.name, relation, fingerprint, "snowflake_snowpark")

exists_relation ¶

exists_relation(relation)

Check existence via information_schema.tables.

Source code in src/fastflowtransform/executors/snowflake_snowpark.py

def exists_relation(self, relation: str) -> bool:
    """Check existence via information_schema.tables."""
    db = _q_ident(self.database)
    schema_lit = f"'{self.schema.upper()}'"
    rel_lit = f"'{relation.upper()}'"
    q = f"""
    select 1
    from {db}.information_schema.tables
    where upper(table_schema) = {schema_lit}
        and upper(table_name) = {rel_lit}
    limit 1
    """
    try:
        return bool(self._execute_sql_basic(q).collect())
    except Exception:
        return False

full_refresh_table ¶

full_refresh_table(relation, select_sql)

Engine-specific full refresh for incremental fallbacks.

Source code in src/fastflowtransform/executors/snowflake_snowpark.py

def full_refresh_table(self, relation: str, select_sql: str) -> None:
    """
    Engine-specific full refresh for incremental fallbacks.
    """
    body = self._selectable_body(select_sql).strip().rstrip(";\n\t ")
    self._execute_sql(
        f"CREATE OR REPLACE TABLE {self._qualified(relation)} AS {body}"
    ).collect()

alter_table_sync_schema ¶

alter_table_sync_schema(relation, select_sql, *, mode='append_new_columns')

Best-effort additive schema sync: - infer SELECT schema via LIMIT 0 - add missing columns as STRING

Source code in src/fastflowtransform/executors/snowflake_snowpark.py

def alter_table_sync_schema(
    self, relation: str, select_sql: str, *, mode: str = "append_new_columns"
) -> None:
    """
    Best-effort additive schema sync:
    - infer SELECT schema via LIMIT 0
    - add missing columns as STRING
    """
    if mode not in {"append_new_columns", "sync_all_columns"}:
        return

    qrel = self._qualified(relation)

    # Use identifiers in FROM, but *string literals* in WHERE
    db_ident = _q_ident(self.database)
    schema_lit = self.schema.replace("'", "''")
    rel_lit = relation.replace("'", "''")

    try:
        existing = {
            r[0]
            for r in self._execute_sql_basic(
                f"""
                select column_name
                from {db_ident}.information_schema.columns
                where upper(table_schema) = upper('{schema_lit}')
                and upper(table_name)   = upper('{rel_lit}')
                """
            ).collect()
        }
    except Exception:
        existing = set()

    # Probe SELECT columns
    body = self._first_select_body(select_sql).strip().rstrip(";\n\t ")
    probe = self.session.sql(f"SELECT * FROM ({body}) q WHERE 1=0")
    probe_cols = list(probe.schema.names)

    to_add = [c for c in probe_cols if c not in existing]
    if not to_add:
        return

    # Column names are identifiers → _q is correct here
    cols_sql = ", ".join(f"{_q_ident(c)} STRING" for c in to_add)
    self._execute_sql_basic(f"ALTER TABLE {qrel} ADD COLUMN {cols_sql}").collect()

execute_hook_sql ¶

execute_hook_sql(sql)

Execute one SQL statement for pre/post/on_run hooks.

Source code in src/fastflowtransform/executors/snowflake_snowpark.py

def execute_hook_sql(self, sql: str) -> None:
    """
    Execute one SQL statement for pre/post/on_run hooks.
    """
    self._exec_many(sql)

utest_read_relation ¶

utest_read_relation(relation)

Read a relation into a pandas DataFrame for unit-test assertions.

We use Snowpark to read the table and convert to pandas, normalizing column names to lowercase to match _read_relation.

Source code in src/fastflowtransform/executors/snowflake_snowpark.py

def utest_read_relation(self, relation: str) -> pd.DataFrame:
    """
    Read a relation into a pandas DataFrame for unit-test assertions.

    We use Snowpark to read the table and convert to pandas,
    normalizing column names to lowercase to match _read_relation.
    """
    df = self.session.table(self._qualified(relation))
    # Mirror _read_relation: present lowercase schema to the test layer
    lowered = [c.lower() for c in df.schema.names]
    df = df.toDF(*lowered)

    to_pandas = getattr(df, "to_pandas", None)

    pdf: pd.DataFrame
    if callable(to_pandas):
        pdf = cast(pd.DataFrame, to_pandas())
    else:
        rows = df.collect()
        records = [r.asDict() for r in rows]
        pdf = pd.DataFrame.from_records(records)

    # Return a new DF with lowercase columns (no attribute assignment)
    return pdf.rename(columns=lambda c: str(c).lower())

utest_load_relation_from_rows ¶

utest_load_relation_from_rows(relation, rows)

Load rows into a Snowflake table for unit tests (replace if exists).

We build a Snowpark DataFrame from the Python rows and overwrite the target table using save_as_table().

Source code in src/fastflowtransform/executors/snowflake_snowpark.py

def utest_load_relation_from_rows(self, relation: str, rows: list[dict]) -> None:
    """
    Load rows into a Snowflake table for unit tests (replace if exists).

    We build a Snowpark DataFrame from the Python rows and overwrite the
    target table using save_as_table().
    """
    # Best-effort: if rows are empty, create an empty table with no rows.
    # We assume at least one row in normal test usage so we can infer schema.
    if not rows:
        # Without any rows we don't know the schema; create a trivial
        # single-column table to surface the situation clearly.
        tmp_df = self.session.create_dataframe([[None]], schema=["__empty__"])
        tmp_df.write.save_as_table(self._qualified(relation), mode="overwrite")
        return

    # Infer column order from the first row
    first = rows[0]
    columns = list(first.keys())

    # Normalize data to a list of lists in a fixed column order
    data = [[row.get(col) for col in columns] for row in rows]

    df = self.session.create_dataframe(data, schema=columns)

    # Store with uppercase column names in Snowflake (conventional)
    upper_cols = [c.upper() for c in columns]
    if columns != upper_cols:
        df = df.toDF(*upper_cols)

    # Overwrite the target table
    df.write.save_as_table(self._qualified(relation), mode="overwrite")

utest_clean_target ¶

utest_clean_target(relation)

For unit tests: drop any table or view with this name in the configured database/schema.

We

try DROP VIEW IF EXISTS DB.SCHEMA.REL
try DROP TABLE IF EXISTS DB.SCHEMA.REL

and ignore "not a view/table" style errors so it doesn't matter what kind of object is currently there - after this, nothing with that name should remain (best-effort).

Source code in src/fastflowtransform/executors/snowflake_snowpark.py

def utest_clean_target(self, relation: str) -> None:
    """
    For unit tests: drop any table or view with this name in the configured
    database/schema.

    We:
      - try DROP VIEW IF EXISTS DB.SCHEMA.REL
      - try DROP TABLE IF EXISTS DB.SCHEMA.REL

    and ignore "not a view/table" style errors so it doesn't matter what
    kind of object is currently there - after this, nothing with that name
    should remain (best-effort).
    """
    qualified = self._qualified(relation)

    # Drop view first; ignore errors if it's actually a table or doesn't exist.
    with suppress(Exception):
        self.session.sql(f"DROP VIEW IF EXISTS {qualified}").collect()

    # Then drop table; ignore errors if it's actually a view or doesn't exist.
    with suppress(Exception):
        self.session.sql(f"DROP TABLE IF EXISTS {qualified}").collect()

collect_docs_columns ¶

collect_docs_columns()

Best-effort column metadata for docs (scoped to configured DB/schema).

Source code in src/fastflowtransform/executors/snowflake_snowpark.py

def collect_docs_columns(self) -> dict[str, list[ColumnInfo]]:
    """
    Best-effort column metadata for docs (scoped to configured DB/schema).
    """
    schema_pred = (
        f"lower(table_schema) = '{self.schema.lower()}'"
        if self.schema
        else "table_schema = current_schema()"
    )
    catalog_pred = (
        f" AND lower(table_catalog) = '{self.database.lower()}'" if self.database else ""
    )
    sql = f"""
    select table_name, column_name, data_type, is_nullable
    from information_schema.columns
    where {schema_pred}{catalog_pred}
    order by table_schema, table_name, ordinal_position
    """
    try:
        rows = self.session.sql(sql).collect()
    except Exception:
        return {}

    out: dict[str, list[ColumnInfo]] = {}
    for r in rows:
        table = r["TABLE_NAME"]
        col = r["COLUMN_NAME"]
        dtype = r["DATA_TYPE"]
        nullable = r["IS_NULLABLE"]
        out.setdefault(table, []).append(
            ColumnInfo(col, str(dtype), str(nullable).upper() == "YES")
        )
    return out

introspect_table_physical_schema ¶

introspect_table_physical_schema(table)

Snowflake: return {lower(column_name): type_string} for all columns.

Uses .information_schema.columns and composes NUMBER(p,s) / VARCHAR(n) when metadata is present.

Source code in src/fastflowtransform/executors/snowflake_snowpark.py

def introspect_table_physical_schema(self, table: str) -> dict[str, str]:
    """
    Snowflake: return {lower(column_name): type_string} for all columns.

    Uses <db>.information_schema.columns and composes NUMBER(p,s) / VARCHAR(n)
    when metadata is present.
    """
    db, sch, tbl = self._normalize_table_parts_for_introspection(table)

    db_ident = _q_ident(db)
    schema_lit = sch.replace("'", "''").upper()
    table_lit = tbl.replace("'", "''").upper()

    sql = f"""
    select
        column_name,
        data_type,
        character_maximum_length,
        numeric_precision,
        numeric_scale
    from {db_ident}.information_schema.columns
    where upper(table_schema) = '{schema_lit}'
      and upper(table_name)   = '{table_lit}'
    order by ordinal_position
    """

    rows = self._execute_sql_basic(sql).collect()
    out: dict[str, str] = {}

    for r in rows or []:
        col_name = str(r[0]) if r and r[0] is not None else None
        if not col_name:
            continue
        typ = self._compose_snowflake_type(r[1], r[2], r[3], r[4])
        out[col_name.lower()] = typ

    return out

introspect_column_physical_type ¶

introspect_column_physical_type(table, column)

Snowflake: read column type from information_schema.columns and return a composed type string (e.g. NUMBER(38,0), VARCHAR(16777216), TIMESTAMP_NTZ).

Source code in src/fastflowtransform/executors/snowflake_snowpark.py

def introspect_column_physical_type(self, table: str, column: str) -> str | None:
    """
    Snowflake: read column type from information_schema.columns and return a composed
    type string (e.g. NUMBER(38,0), VARCHAR(16777216), TIMESTAMP_NTZ).
    """
    db, sch, tbl = self._normalize_table_parts_for_introspection(table)

    db_ident = _q_ident(db)
    schema_lit = sch.replace("'", "''").upper()
    table_lit = tbl.replace("'", "''").upper()
    col_lit = (column or "").replace("'", "''").upper()

    sql = f"""
    select
        data_type,
        character_maximum_length,
        numeric_precision,
        numeric_scale
    from {db_ident}.information_schema.columns
    where upper(table_schema) = '{schema_lit}'
      and upper(table_name)   = '{table_lit}'
      and upper(column_name)  = '{col_lit}'
    limit 1
    """

    rows = self._execute_sql_basic(sql).collect()
    if not rows:
        return None
    r = rows[0]
    return self._compose_snowflake_type(r[0], r[1], r[2], r[3])

configure_contracts ¶

configure_contracts(contracts, project_contracts)

Inject parsed contracts into this executor instance. The run engine should call this once at startup.

Source code in src/fastflowtransform/executors/base.py

def configure_contracts(
    self,
    contracts: Mapping[str, ContractsFileModel] | None,
    project_contracts: ProjectContractsModel | None,
) -> None:
    """
    Inject parsed contracts into this executor instance.
    The run engine should call this once at startup.
    """
    self._ff_contracts = contracts or {}
    self._ff_project_contracts = project_contracts

run_sql ¶

run_sql(node, env)

Orchestrate SQL models

1) Render Jinja (ref/source/this) and strip leading {{ config(...) }}. 2) If the SQL is full DDL (CREATE …), execute it verbatim (passthrough). 3) Otherwise, normalize to CREATE OR REPLACE {TABLE|VIEW} AS . The body is CTE-aware (keeps WITH … SELECT … intact).

On failure, raise ModelExecutionError with a helpful snippet.

Source code in src/fastflowtransform/executors/base.py

def run_sql(self, node: Node, env: Environment) -> None:
    """
    Orchestrate SQL models:
      1) Render Jinja (ref/source/this) and strip leading {{ config(...) }}.
      2) If the SQL is full DDL (CREATE …), execute it verbatim (passthrough).
      3) Otherwise, normalize to CREATE OR REPLACE {TABLE|VIEW} AS <body>.
         The body is CTE-aware (keeps WITH … SELECT … intact).
    On failure, raise ModelExecutionError with a helpful snippet.
    """
    meta = getattr(node, "meta", {}) or {}
    if self._meta_is_incremental(meta):
        # Delegates to incremental engine: render, schema sync, merge/insert, etc.
        return _ff_incremental.run_or_dispatch(self, node, env)

    if self._meta_is_snapshot(meta):
        # Snapshots are executed via the dedicated CLI: `fft snapshot run`.
        raise ModelExecutionError(
            node_name=node.name,
            relation=relation_for(node.name),
            message=(
                "Snapshot models cannot be executed via 'fft run'. "
                "Use 'fft snapshot run' instead."
            ),
            sql_snippet="",
        )

    sql_rendered = self.render_sql(
        node,
        env,
        ref_resolver=lambda name: self._resolve_ref(name, env),
        source_resolver=self._resolve_source,
    )
    sql = self._strip_leading_config(sql_rendered).strip()

    materialization = (node.meta or {}).get("materialized", "table")
    if materialization == "ephemeral":
        return

    # 1) Direct DDL passthrough (CREATE [OR REPLACE] {TABLE|VIEW} …)
    if self._looks_like_direct_ddl(sql):
        try:
            self._execute_sql_direct(sql, node)
            return
        except NotImplementedError:
            # Engine doesn't implement direct DDL → fall back to normalized materialization.
            pass
        except Exception as e:
            raise ModelExecutionError(
                node_name=node.name,
                relation=relation_for(node.name),
                message=str(e),
                sql_snippet=sql,
            ) from e

    # 2) Normalized materialization path (CTE-safe body)
    body = self._selectable_body(sql).rstrip(" ;\n\t")
    target_sql = self._format_relation_for_ref(node.name)

    # Centralized SQL preview logging (applies to ALL engines)
    preview = (
        f"=== MATERIALIZE ===\n"
        f"-- model: {node.name}\n"
        f"-- materialized: {materialization}\n"
        f"-- target: {target_sql}\n"
        f"{body}\n"
    )
    echo_debug(preview)

    try:
        runtime = getattr(self, "runtime_contracts", None)
        # contracts only for TABLE materialization for now
        if runtime is not None and materialization == "table":
            contracts = getattr(self, "_ff_contracts", {}) or {}
            project_contracts = getattr(self, "_ff_project_contracts", None)

            # keying: prefer the logical table name (contracts.table),
            # but node.name or relation_for(node.name) is usually what you want.
            logical_name = relation_for(node.name)
            contract = contracts.get(logical_name) or contracts.get(node.name)

            ctx = runtime.build_context(
                node=node,
                relation=logical_name,
                physical_table=target_sql,
                contract=contract,
                project_contracts=project_contracts,
                is_incremental=self._meta_is_incremental(meta),
            )
            # Engine-specific enforcement (verify/cast/off)
            runtime.apply_sql_contracts(ctx=ctx, select_body=body)
        else:
            # Old behavior
            self._apply_sql_materialization(node, target_sql, body, materialization)
    except Exception as e:
        preview = f"-- materialized={materialization}\n-- target={target_sql}\n{body}"
        raise ModelExecutionError(
            node_name=node.name,
            relation=relation_for(node.name),
            message=str(e),
            sql_snippet=preview,
        ) from e

configure_query_budget_limit ¶

configure_query_budget_limit(limit)

Inject a configured per-query byte limit (e.g. from budgets.yml).

Source code in src/fastflowtransform/executors/base.py

def configure_query_budget_limit(self, limit: int | None) -> None:
    """
    Inject a configured per-query byte limit (e.g. from budgets.yml).
    """
    if limit is None:
        self._ff_configured_query_limit = None
        return
    try:
        iv = int(limit)
    except Exception:
        self._ff_configured_query_limit = None
        return
    self._ff_configured_query_limit = iv if iv > 0 else None

reset_node_stats ¶

reset_node_stats()

Reset per-node statistics buffer.

The run engine calls this before executing a model so that all stats recorded via _record_query_stats(...) belong to that node.

Source code in src/fastflowtransform/executors/base.py

def reset_node_stats(self) -> None:
    """
    Reset per-node statistics buffer.

    The run engine calls this before executing a model so that all
    stats recorded via `_record_query_stats(...)` belong to that node.
    """
    # just clear the buffer; next recording will re-create it
    self._ff_query_stats_buffer = []

get_node_stats ¶

get_node_stats()

Aggregate buffered QueryStats into a simple dict:

{
  "bytes_scanned": <sum>,
  "rows": <sum>,
  "query_duration_ms": <sum>,
}

Called by the run engine after a node finishes.

Source code in src/fastflowtransform/executors/base.py

def get_node_stats(self) -> dict[str, int]:
    """
    Aggregate buffered QueryStats into a simple dict:

        {
          "bytes_scanned": <sum>,
          "rows": <sum>,
          "query_duration_ms": <sum>,
        }

    Called by the run engine after a node finishes.
    """
    stats_list = self._drain_query_stats()
    if not stats_list:
        return {}

    total_bytes = 0
    total_rows = 0
    total_duration = 0

    for s in stats_list:
        if s.bytes_processed is not None:
            total_bytes += int(s.bytes_processed)
        if s.rows is not None:
            total_rows += int(s.rows)
        if s.duration_ms is not None:
            total_duration += int(s.duration_ms)

    return {
        "bytes_scanned": total_bytes,
        "rows": total_rows,
        "query_duration_ms": total_duration,
    }

run_python ¶

run_python(node)

Execute the Python model for a given node and materialize its result.

Source code in src/fastflowtransform/executors/base.py

def run_python(self, node: Node) -> None:
    """Execute the Python model for a given node and materialize its result."""
    func = REGISTRY.py_funcs[node.name]
    deps = REGISTRY.nodes[node.name].deps or []

    self._reset_http_ctx(node)

    args, argmap = self._build_python_inputs(node, deps)
    requires = REGISTRY.py_requires.get(node.name, {})
    if deps:
        # Required-columns check works against the mapping
        self._validate_required(node.name, argmap, requires)

    # out = self._execute_python_func(func, arg, node)
    out = self._execute_python_func(func, args, node)

    target = relation_for(node.name)
    meta = getattr(node, "meta", {}) or {}
    mat = self._resolve_materialization_strategy(meta)

    # ---------- Runtime contracts for Python models ----------
    runtime = getattr(self, "runtime_contracts", None)
    ctx = None
    took_over = False

    if runtime is not None:
        contracts = getattr(self, "_ff_contracts", {}) or {}
        project_contracts = getattr(self, "_ff_project_contracts", None)

        logical = target  # usually relation_for(node.name)
        contract = contracts.get(logical) or contracts.get(node.name)

        if contract is not None or project_contracts is not None:
            physical_table = self._format_relation_for_ref(node.name)
            ctx = runtime.build_context(
                node=node,
                relation=logical,
                physical_table=physical_table,
                contract=contract,
                project_contracts=project_contracts,
                is_incremental=(mat == "incremental"),
            )

            # Optional pre-coercion (default is no-op).
            if hasattr(runtime, "coerce_frame_schema"):
                out = runtime.coerce_frame_schema(out, ctx)

            # Allow engine-specific runtime to take over Python materialization
            if mat == "table" and hasattr(runtime, "materialize_python"):
                took_over = bool(runtime.materialize_python(ctx=ctx, df=out))

    # ---------- Materialization ----------
    if not took_over:
        if mat == "incremental":
            self._materialize_incremental(target, out, node, meta)
        elif mat == "view":
            self._materialize_view(target, out, node)
        else:
            self._materialize_relation(target, out, node)

    if ctx is not None and runtime is not None:
        runtime.verify_after_materialization(ctx=ctx)

    self._snapshot_http_ctx(node)

normalize_physical_type ¶

normalize_physical_type(t)

Canonicalize a physical type string for comparisons (DQ + contracts).

Default: just strip + lower. Engines may override to account for dialect quirks in information_schema (e.g. Postgres timestamp variants, Snowflake VARCHAR(…) / NUMBER(…)).

Source code in src/fastflowtransform/executors/base.py

def normalize_physical_type(self, t: str | None) -> str:
    """
    Canonicalize a physical type string for comparisons (DQ + contracts).

    Default: just strip + lower.
    Engines may override to account for dialect quirks in information_schema
    (e.g. Postgres timestamp variants, Snowflake VARCHAR(…) / NUMBER(…)).
    """
    return (t or "").strip().lower()

fastflowtransform.executors.snowflake_snowpark¶

SnowflakeSnowparkExecutor ¶

snapshot_runtime instance-attribute ¶

execute_test_sql ¶

load_seed ¶

on_node_built ¶

exists_relation ¶

full_refresh_table ¶

alter_table_sync_schema ¶

execute_hook_sql ¶

utest_read_relation ¶

utest_load_relation_from_rows ¶

utest_clean_target ¶

collect_docs_columns ¶

introspect_table_physical_schema ¶

introspect_column_physical_type ¶

configure_contracts ¶

run_sql ¶

configure_query_budget_limit ¶

reset_node_stats ¶

get_node_stats ¶

run_python ¶

normalize_physical_type ¶

snapshot_runtime `instance-attribute` ¶