fastflowtransform.executors.budget.runtime

BaseBudgetRuntime

Base runtime for per-query budget enforcement.

Executors compose this (like runtime contracts) and delegate guarded execution through it.

Source code in src/fastflowtransform/executors/budget/runtime/base.py
class BaseBudgetRuntime[E: BudgetExecutor]:
    """
    Base runtime for per-query budget enforcement.

    Executors compose this (like runtime contracts) and delegate guarded
    execution through it.
    """

    executor: E
    guard: BudgetGuard | None

    def __init__(self, executor: E, guard: BudgetGuard | None = None):
        self.executor = executor
        self.guard = guard or getattr(type(self), "DEFAULT_GUARD", None)

    def apply_guard(self, sql: str) -> int | None:
        return self.executor._apply_budget_guard(self.guard, sql)

    def run_sql(
        self,
        sql: str,
        *,
        exec_fn: Callable[[], Any],
        stats_runtime: BaseQueryStatsRuntime,
        rowcount_extractor: Callable[[Any], int | None] | None = None,
        extra_stats: Callable[[Any], Any] | None = None,
        estimate_fn: Callable[[str], int | None] | None = None,
        post_estimate_fn: Callable[[str, Any], int | None] | None = None,
        record_stats: bool = True,
        stats_adapter: QueryStatsAdapter | None = None,
    ) -> Any:
        estimated_bytes = self.apply_guard(sql)
        estimator = estimate_fn or getattr(self, "estimate_query_bytes", None)
        if (
            estimated_bytes is None
            and not self.executor._is_budget_guard_active()
            and callable(estimator)
        ):
            with suppress(Exception):
                estimated_bytes = estimator(sql)

        if not record_stats:
            return exec_fn()

        started = perf_counter()
        result = exec_fn()
        duration_ms = int((perf_counter() - started) * 1000)

        adapter = stats_adapter
        if adapter is None and (rowcount_extractor or post_estimate_fn or extra_stats):
            adapter = RowcountStatsAdapter(
                rowcount_extractor=rowcount_extractor,
                post_estimate_fn=post_estimate_fn,
                extra_stats=extra_stats,
                sql=sql,
            )

        stats_runtime.record_result(
            result,
            duration_ms=duration_ms,
            estimated_bytes=estimated_bytes,
            adapter=adapter,
            sql=sql,
        )

        return result
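
A minimal sketch of how an engine-specific executor might wire a budget runtime and delegate a guarded statement through it. The names executor and stats_runtime are placeholders for the concrete BudgetExecutor and BaseQueryStatsRuntime instances the executor already holds; the DuckDB runtime is used purely as an example.

    from fastflowtransform.executors.budget.runtime.duckdb import DuckBudgetRuntime

    def run_guarded(executor, stats_runtime, sql: str):
        # guard=None falls back to the class-level DEFAULT_GUARD.
        runtime = DuckBudgetRuntime(executor)
        return runtime.run_sql(
            sql,
            exec_fn=lambda: executor._execute_fetchall(sql),
            stats_runtime=stats_runtime,
            rowcount_extractor=lambda rows: len(rows) if rows is not None else None,
        )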

BigQueryBudgetRuntime

Bases: BaseBudgetRuntime[BigQueryBudgetExecutor]

BigQuery budget runtime using dry-run estimation.

Source code in src/fastflowtransform/executors/budget/runtime/bigquery.py
class BigQueryBudgetRuntime(BaseBudgetRuntime[BigQueryBudgetExecutor]):
    """BigQuery budget runtime using dry-run estimation."""

    DEFAULT_GUARD = BudgetGuard(
        env_var="FF_BQ_MAX_BYTES",
        estimator_attr="runtime_budget_estimate_query_bytes",
        engine_label="BigQuery",
        what="query",
    )

    def estimate_query_bytes(self, sql: str) -> int | None:
        """
        Estimate bytes for a BigQuery SQL statement using a dry-run.

        Returns the estimated bytes, or None if estimation is not possible.
        """
        cfg = bigquery.QueryJobConfig(
            dry_run=True,
            use_query_cache=False,
        )
        if self.executor.dataset:
            cfg.default_dataset = bigquery.DatasetReference(
                self.executor.project, self.executor.dataset
            )

        try:
            job = self.executor.client.query(
                sql,
                job_config=cfg,
                location=self.executor.location,
            )
            job.result()
        except Exception:
            return None

        try:
            return int(getattr(job, "total_bytes_processed", 0) or 0)
        except Exception:
            return None
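
The dry-run pattern above is standard google-cloud-bigquery usage and can be reproduced outside the runtime. A standalone sketch (the project name is a placeholder):

    from google.cloud import bigquery

    def dry_run_bytes(client: bigquery.Client, sql: str) -> int | None:
        # dry_run=True validates the statement and reports the bytes it would
        # scan without executing it; disabling the query cache keeps the
        # estimate honest for repeated queries.
        cfg = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
        try:
            job = client.query(sql, job_config=cfg)
        except Exception:
            return None
        return job.total_bytes_processed

    client = bigquery.Client(project="my-project")  # placeholder project
    print(dry_run_bytes(client, "SELECT 1"))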

DatabricksSparkBudgetRuntime

Bases: BaseBudgetRuntime[DatabricksSparkBudgetExecutor]

Databricks/Spark budget runtime using logical-plan stats for estimation.

Source code in src/fastflowtransform/executors/budget/runtime/databricks_spark.py
class DatabricksSparkBudgetRuntime(BaseBudgetRuntime[DatabricksSparkBudgetExecutor]):
    """Databricks/Spark budget runtime using logical-plan stats for estimation."""

    DEFAULT_GUARD = BudgetGuard(
        env_var="FF_SPK_MAX_BYTES",
        estimator_attr="runtime_budget_estimate_query_bytes",
        engine_label="Databricks/Spark",
        what="query",
    )

    def __init__(self, executor: DatabricksSparkBudgetExecutor, guard: BudgetGuard | None = None):
        super().__init__(executor, guard)
        self._default_size: int | None = self.detect_default_size()

    def estimate_query_bytes(self, sql: str) -> int | None:
        return self._spark_plan_bytes(sql)

    def detect_default_size(self) -> int:
        """
        Detect Spark's defaultSizeInBytes sentinel.

        - Prefer spark.sql.defaultSizeInBytes if available.
        - Fall back to Long.MaxValue (2^63 - 1) otherwise.
        """
        try:
            conf_val = self.executor.spark.conf.get("spark.sql.defaultSizeInBytes")
            if conf_val is not None:
                return int(conf_val)
        except Exception:
            # config not set / older Spark / weird environment
            pass

        # Fallback: Spark uses Long.MaxValue by default
        return 2**63 - 1  # 9223372036854775807

    def spark_stats_adapter(self, sql: str) -> SparkDataFrameStatsAdapter:
        """
        Build a SparkDataFrameStatsAdapter tied to this runtime's estimation logic.
        """

        def _bytes(df: Any) -> int | None:
            estimate = self.dataframe_bytes(df)
            if estimate is not None:
                return estimate
            return self.estimate_query_bytes(sql)

        return SparkDataFrameStatsAdapter(_bytes)

    # ---- Shared helpers for Spark stats ----
    def dataframe_bytes(self, df: Any) -> int | None:
        try:
            jdf = getattr(df, "_jdf", None)
            if jdf is None:
                return None

            qe = jdf.queryExecution()
            jplan = qe.optimizedPlan()

            if self._jplan_uses_default_size(jplan):
                return None

            stats = jplan.stats()
            size_attr = getattr(stats, "sizeInBytes", None)
            size_val = size_attr() if callable(size_attr) else size_attr
            return self._parse_spark_stats_size(size_val)
        except Exception:
            return None

    def _spark_plan_bytes(self, sql: str) -> int | None:
        """
        Inspect the optimized logical plan via the JVM and return sizeInBytes
        as an integer, or None if not available. This does not execute the query.
        """
        try:
            normalized = self.executor._selectable_body(sql).rstrip(";\n\t ")
            if not normalized:
                normalized = sql
        except Exception:
            normalized = sql

        stmt = normalized.lstrip().lower()
        if not stmt.startswith(("select", "with")):
            # DDL/DML statements should not be executed twice.
            return None

        try:
            df = self.executor.spark.sql(normalized)

            jdf = getattr(df, "_jdf", None)
            if jdf is None:
                return None

            qe = jdf.queryExecution()
            jplan = qe.optimizedPlan()

            if self._jplan_uses_default_size(jplan):
                return None

            stats = jplan.stats()
            size_attr = getattr(stats, "sizeInBytes", None)
            size_val = size_attr() if callable(size_attr) else size_attr

            return self._parse_spark_stats_size(size_val)
        except Exception:
            return None

    def _jplan_uses_default_size(self, jplan: Any) -> bool:
        """
        Recursively walk a JVM LogicalPlan and return True if any node's
        stats.sizeInBytes equals spark.sql.defaultSizeInBytes.
        """
        spark_default_size = self._default_size
        if spark_default_size is None:
            return False

        try:
            stats = jplan.stats()
            size_val = stats.sizeInBytes()
            size_int = int(str(size_val))
            if size_int == spark_default_size:
                return True
        except Exception:
            # ignore stats errors and keep walking
            pass

        # children() is a Scala Seq[LogicalPlan]; iterate via .size() / .apply(i)
        try:
            children = jplan.children()
            n = children.size()
            for idx in range(n):
                child = children.apply(idx)
                if self._jplan_uses_default_size(child):
                    return True
        except Exception:
            # if we can't inspect children, stop here
            pass

        return False

    def _parse_spark_stats_size(self, size_val: Any) -> int | None:
        if size_val is None:
            return None
        try:
            size_int = int(str(size_val))
        except Exception:
            return None
        return size_int if size_int > 0 else None
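
The JVM round-trip that dataframe_bytes and _spark_plan_bytes perform can be tried directly in a PySpark session. A sketch (the table name is a placeholder, and the defaultSizeInBytes sentinel check is omitted for brevity):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.sql("SELECT * FROM some_table")  # placeholder table

    # Reach through the Py4J handle to the optimized logical plan's statistics.
    jplan = df._jdf.queryExecution().optimizedPlan()
    size_in_bytes = int(str(jplan.stats().sizeInBytes()))  # Scala BigInt -> int
    print(size_in_bytes)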

DuckBudgetRuntime

Bases: BaseBudgetRuntime[DuckBudgetExecutor]

DuckDB-specific budget runtime with plan-based estimation.

Source code in src/fastflowtransform/executors/budget/runtime/duckdb.py
class DuckBudgetRuntime(BaseBudgetRuntime[DuckBudgetExecutor]):
    """DuckDB-specific budget runtime with plan-based estimation."""

    DEFAULT_GUARD = BudgetGuard(
        env_var="FF_DUCKDB_MAX_BYTES",
        estimator_attr="_estimate_query_bytes",
        engine_label="DuckDB",
        what="query",
    )

    _FIXED_TYPE_SIZES: ClassVar[dict[str, int]] = {
        "boolean": 1,
        "bool": 1,
        "tinyint": 1,
        "smallint": 2,
        "integer": 4,
        "int": 4,
        "bigint": 8,
        "float": 4,
        "real": 4,
        "double": 8,
        "double precision": 8,
        "decimal": 16,
        "numeric": 16,
        "uuid": 16,
        "json": 64,
        "jsonb": 64,
        "timestamp": 8,
        "timestamp_ntz": 8,
        "timestamp_ltz": 8,
        "timestamptz": 8,
        "date": 4,
        "time": 4,
        "interval": 16,
    }
    _VARCHAR_DEFAULT_WIDTH = 64
    _VARCHAR_MAX_WIDTH = 1024
    _DEFAULT_ROW_WIDTH = 128

    def __init__(self, executor: DuckBudgetExecutor, guard: BudgetGuard | None = None):
        super().__init__(executor, guard)
        self._table_row_width_cache: dict[tuple[str | None, str], int] = {}

    # ------------------------------------------------------------------ #
    # Cost estimation used by BudgetGuard                                #
    # ------------------------------------------------------------------ #

    def estimate_query_bytes(self, sql: str) -> int | None:
        """
        Estimate query size via DuckDB's EXPLAIN (FORMAT JSON).
        """
        # Try to normalize to a SELECT/CTE body if the executor exposes it
        body = self.executor._selectable_body(sql).strip().rstrip(";\n\t ")

        lower = body.lower()
        if not lower.startswith(("select", "with")):
            return None

        explain_sql = f"EXPLAIN (FORMAT JSON) {body}"
        try:
            rows = self.executor._execute_fetchall(explain_sql)
        except Exception:
            return None

        if not rows:
            return None

        fragments: list[str] = []
        for row in rows:
            for cell in row:
                if cell is None:
                    continue
                fragments.append(str(cell))

        if not fragments:
            return None

        plan_text = "\n".join(fragments).strip()
        start = plan_text.find("[")
        end = plan_text.rfind("]")
        if start == -1 or end == -1 or end <= start:
            return None

        try:
            plan_data = json.loads(plan_text[start : end + 1])
        except Exception:
            return None

        estimate = self._max_cardinality(plan_data)
        if estimate <= 0:
            return None

        tables = self._collect_tables_from_plan(
            plan_data if isinstance(plan_data, list) else [plan_data]
        )
        row_width = self._row_width_for_tables(tables)
        if row_width <= 0:
            row_width = self._DEFAULT_ROW_WIDTH

        bytes_estimate = int(estimate * row_width)
        return bytes_estimate if bytes_estimate > 0 else None

    def _max_cardinality(self, plan_data: Any) -> int:
        def _to_int(value: Any) -> int | None:
            if value is None:
                return None
            if isinstance(value, (int, float)):
                try:
                    converted = int(value)
                except Exception:
                    return None
                return converted
            text = str(value)
            match = re.search(r"(\d+(?:\.\d+)?)", text)
            if not match:
                return None
            try:
                return int(float(match.group(1)))
            except ValueError:
                return None

        def _walk_node(node: dict[str, Any]) -> int:
            best = 0
            extra = node.get("extra_info") or {}
            for key in (
                "Estimated Cardinality",
                "estimated_cardinality",
                "Cardinality",
                "cardinality",
            ):
                candidate = _to_int(extra.get(key))
                if candidate is not None:
                    best = max(best, candidate)
            candidate = _to_int(node.get("cardinality"))
            if candidate is not None:
                best = max(best, candidate)
            for child in node.get("children") or []:
                if isinstance(child, dict):
                    best = max(best, _walk_node(child))
            return best

        nodes = plan_data if isinstance(plan_data, list) else [plan_data]

        estimate = 0
        for entry in nodes:
            if isinstance(entry, dict):
                estimate = max(estimate, _walk_node(entry))
        return estimate

    def _collect_tables_from_plan(self, nodes: list[dict[str, Any]]) -> set[tuple[str | None, str]]:
        tables: set[tuple[str | None, str]] = set()

        def _walk(entry: dict[str, Any]) -> None:
            extra = entry.get("extra_info") or {}
            table_val = extra.get("Table")
            schema_val = extra.get("Schema") or extra.get("Database") or extra.get("Catalog")
            if isinstance(table_val, str) and table_val.strip():
                schema, table = self._split_identifier(table_val, schema_val)
                if table:
                    tables.add((schema, table))
            for child in entry.get("children") or []:
                if isinstance(child, dict):
                    _walk(child)

        for node in nodes:
            if isinstance(node, dict):
                _walk(node)
        return tables

    def _split_identifier(
        self, identifier: str, explicit_schema: str | None
    ) -> tuple[str | None, str]:
        parts = [part.strip() for part in identifier.split(".") if part.strip()]
        if not parts:
            return explicit_schema, identifier
        if len(parts) >= 2:
            schema_candidate = self._strip_quotes(parts[-2])
            table_candidate = self._strip_quotes(parts[-1])
            return schema_candidate or explicit_schema, table_candidate
        return explicit_schema, self._strip_quotes(parts[-1])

    def _strip_quotes(self, value: str) -> str:
        if value.startswith('"') and value.endswith('"'):
            return value[1:-1]
        return value

    def _row_width_for_tables(self, tables: Iterable[tuple[str | None, str]]) -> int:
        widths: list[int] = []
        for schema, table in tables:
            width = self._row_width_for_table(schema, table)
            if width > 0:
                widths.append(width)
        return max(widths) if widths else 0

    def _row_width_for_table(self, schema: str | None, table: str) -> int:
        key = (schema or "", table.lower())
        cached = self._table_row_width_cache.get(key)
        if cached:
            return cached

        columns = self._columns_for_table(table, schema)
        width = sum(self._estimate_column_width(col) for col in columns)
        if width <= 0:
            width = self._DEFAULT_ROW_WIDTH
        self._table_row_width_cache[key] = width
        return width

    def _columns_for_table(
        self, table: str, schema: str | None
    ) -> list[tuple[str | None, int | None, int | None, int | None]]:
        table_lower = table.lower()
        columns: list[tuple[str | None, int | None, int | None, int | None]] = []
        seen_schemas: set[str | None] = set()
        for candidate in self._schema_candidates(schema):
            if candidate in seen_schemas:
                continue
            seen_schemas.add(candidate)
            try:
                if candidate is not None:
                    rows = self.executor._execute_fetchall(
                        """
                        select lower(data_type) as dtype,
                               character_maximum_length,
                               numeric_precision,
                               numeric_scale
                        from information_schema.columns
                        where lower(table_name)=lower(?)
                          and lower(table_schema)=lower(?)
                        order by ordinal_position
                        """,
                        [table_lower, candidate.lower()],
                    )
                else:
                    rows = self.executor._execute_fetchall(
                        """
                        select lower(data_type) as dtype,
                               character_maximum_length,
                               numeric_precision,
                               numeric_scale
                        from information_schema.columns
                        where lower(table_name)=lower(?)
                        order by lower(table_schema), ordinal_position
                        """,
                        [table_lower],
                    )
            except Exception:
                continue
            if rows:
                return rows
        return columns

    def _schema_candidates(self, schema: str | None) -> list[str | None]:
        candidates: list[str | None] = []

        def _add(value: str | None) -> None:
            normalized = self._normalize_schema(value)
            if normalized not in candidates:
                candidates.append(normalized)

        _add(schema)
        _add(getattr(self.executor, "schema", None))
        for alt in ("main", "temp"):
            _add(alt)
        _add(None)
        return candidates

    def _normalize_schema(self, schema: str | None) -> str | None:
        if not schema:
            return None
        stripped = schema.strip()
        return stripped or None

    def _estimate_column_width(
        self, column_info: tuple[str | None, int | None, int | None, int | None]
    ) -> int:
        dtype_raw, char_max, numeric_precision, _ = column_info
        dtype = self._normalize_data_type(dtype_raw)
        if dtype and dtype in self._FIXED_TYPE_SIZES:
            return self._FIXED_TYPE_SIZES[dtype]

        if dtype in {"character", "varchar", "char", "text", "string"}:
            if char_max and char_max > 0:
                return min(char_max, self._VARCHAR_MAX_WIDTH)
            return self._VARCHAR_DEFAULT_WIDTH

        if dtype in {"varbinary", "blob", "binary"}:
            if char_max and char_max > 0:
                return min(char_max, self._VARCHAR_MAX_WIDTH)
            return self._VARCHAR_DEFAULT_WIDTH

        if dtype in {"numeric", "decimal"} and numeric_precision and numeric_precision > 0:
            return min(max(int(numeric_precision), 16), 128)

        return 16

    def _normalize_data_type(self, dtype: str | None) -> str | None:
        if not dtype:
            return None
        stripped = dtype.strip().lower()
        if "(" in stripped:
            stripped = stripped.split("(", 1)[0].strip()
        if stripped.endswith("[]"):
            stripped = stripped[:-2]
        return stripped or None
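
The EXPLAIN round-trip that estimate_query_bytes builds on can be exercised against a throwaway in-memory database. A sketch (the slicing mirrors how the runtime isolates the JSON array before parsing; the final estimate is then the plan's maximum cardinality multiplied by the inferred row width):

    import json

    import duckdb

    con = duckdb.connect()  # in-memory database
    con.execute("CREATE TABLE t AS SELECT range AS id FROM range(1000)")

    rows = con.execute("EXPLAIN (FORMAT JSON) SELECT * FROM t").fetchall()
    text = "\n".join(str(cell) for row in rows for cell in row if cell is not None)
    plan = json.loads(text[text.find("[") : text.rfind("]") + 1])
    print(json.dumps(plan, indent=2)[:400])  # inspect cardinality / extra_info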

PostgresBudgetRuntime

Bases: BaseBudgetRuntime[PostgresBudgetExecutor]

Postgres-specific budget runtime with EXPLAIN-based estimation.

Source code in src/fastflowtransform/executors/budget/runtime/postgres.py
class PostgresBudgetRuntime(BaseBudgetRuntime[PostgresBudgetExecutor]):
    """Postgres-specific budget runtime with EXPLAIN-based estimation."""

    DEFAULT_GUARD = BudgetGuard(
        env_var="FF_PG_MAX_BYTES",
        estimator_attr="_estimate_query_bytes",
        engine_label="Postgres",
        what="query",
    )

    _DEFAULT_PG_ROW_WIDTH = 128

    def __init__(self, executor: PostgresBudgetExecutor, guard: BudgetGuard | None = None):
        super().__init__(executor, guard)

    def estimate_query_bytes(self, sql: str) -> int | None:
        body = self.executor._extract_select_like(sql)
        lower = body.lstrip().lower()
        if not lower.startswith(("select", "with")):
            return None

        explain_sql = f"EXPLAIN (FORMAT JSON) {body}"

        try:
            raw = self.executor._execute_sql_maintenance(explain_sql, set_search_path=False)
        except Exception:
            return None

        if raw is None:
            return None

        try:
            data = json.loads(raw)
        except Exception:
            data = raw

        # Postgres JSON format: list with a single object
        if isinstance(data, list) and data:
            root = data[0]
        elif isinstance(data, dict):
            root = data
        else:
            return None

        plan = root.get("Plan")
        if not isinstance(plan, dict):
            if isinstance(root, dict) and "Node Type" in root:
                plan = root
            else:
                return None

        return self._estimate_bytes_from_plan(plan)

    def _estimate_bytes_from_plan(self, plan: dict[str, Any]) -> int | None:
        def _to_int(node: dict[str, Any], keys: tuple[str, ...]) -> int | None:
            for key in keys:
                val = node.get(key)
                if val is None:
                    continue
                try:
                    return int(val)
                except (TypeError, ValueError):
                    continue
            return None

        rows = _to_int(plan, ("Plan Rows", "Plan_Rows", "Rows"))
        width = _to_int(plan, ("Plan Width", "Plan_Width", "Width"))

        if rows is None and width is None:
            return None

        candidate: int | None

        if rows is not None and width is not None:
            candidate = rows * width
        elif rows is not None:
            candidate = rows * self._DEFAULT_PG_ROW_WIDTH
        else:
            candidate = width

        if candidate is None or candidate <= 0:
            return None

        return int(candidate)
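
The arithmetic is the planner's own estimate: "Plan Rows" (estimated row count) times "Plan Width" (average row width in bytes). A node reporting "Plan Rows": 50000 and "Plan Width": 72 therefore yields 50000 * 72 = 3,600,000 bytes, roughly 3.4 MiB. A standalone probe with psycopg (the DSN is a placeholder; the defensive parse mirrors the runtime's own handling of drivers that return the plan as text):

    import json

    import psycopg

    with psycopg.connect("dbname=mydb") as conn:  # placeholder DSN
        raw = conn.execute("EXPLAIN (FORMAT JSON) SELECT * FROM pg_class").fetchone()[0]
        data = raw if isinstance(raw, list) else json.loads(raw)
        plan = data[0]["Plan"]  # FORMAT JSON yields a one-element list
        print(plan["Plan Rows"] * plan["Plan Width"])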

SnowflakeSnowparkBudgetRuntime

Bases: BaseBudgetRuntime[SnowflakeSnowparkBudgetExecutor]

Snowflake Snowpark budget runtime using EXPLAIN for estimation.

Source code in src/fastflowtransform/executors/budget/runtime/snowflake_snowpark.py
class SnowflakeSnowparkBudgetRuntime(BaseBudgetRuntime[SnowflakeSnowparkBudgetExecutor]):
    """Snowflake Snowpark budget runtime using EXPLAIN for estimation."""

    DEFAULT_GUARD = BudgetGuard(
        env_var="FF_SF_MAX_BYTES",
        estimator_attr="runtime_budget_estimate_query_bytes",
        engine_label="Snowflake",
        what="query",
    )

    def estimate_query_bytes(self, sql: str) -> int | None:
        """
        Best-effort Snowflake bytes estimation using EXPLAIN USING JSON.
        Mirrors the previous executor-side logic.
        """
        try:
            body = self.executor._selectable_body(sql)
        except Exception:
            body = sql

        try:
            rows = self.executor.session.sql(f"EXPLAIN USING JSON {body}").collect()
            if not rows:
                return None

            parts: list[str] = []
            for r in rows:
                try:
                    parts.append(str(r[0]))
                except Exception:
                    as_dict: dict[str, Any] = getattr(r, "asDict", lambda: {})()
                    if as_dict:
                        parts.extend(str(v) for v in as_dict.values())

            plan_text = "\n".join(parts).strip()
            if not plan_text:
                return None

            try:
                plan_data = json.loads(plan_text)
            except Exception:
                return None

            bytes_val = self._extract_bytes_from_plan(plan_data)
            if bytes_val is None or bytes_val <= 0:
                return None
            return bytes_val
        except Exception:
            # Any parsing / EXPLAIN issues → no estimate, guard skipped
            return None

    def dataframe_bytes(self, df: Any) -> int | None:
        """
        Best-effort bytes estimate for a Snowpark DataFrame.
        """
        try:
            sql_text = self._snowpark_df_sql(df)
            if not isinstance(sql_text, str) or not sql_text.strip():
                return None
            return self.estimate_query_bytes(sql_text)
        except Exception:
            return None

    def _extract_bytes_from_plan(self, plan_data: Any) -> int | None:
        def _to_int(value: Any) -> int | None:
            if value is None:
                return None
            try:
                return int(value)
            except Exception:
                return None

        if isinstance(plan_data, dict):
            global_stats = plan_data.get("GlobalStats") or plan_data.get("globalStats")
            if isinstance(global_stats, dict):
                candidate = _to_int(
                    global_stats.get("bytesAssigned") or global_stats.get("bytes_assigned")
                )
                if candidate:
                    return candidate
            for val in plan_data.values():
                bytes_val = self._extract_bytes_from_plan(val)
                if bytes_val:
                    return bytes_val
        elif isinstance(plan_data, list):
            for item in plan_data:
                bytes_val = self._extract_bytes_from_plan(item)
                if bytes_val:
                    return bytes_val
        return None

    def _snowpark_df_sql(self, df: Any) -> str | None:
        """
        Extract the main SQL statement for a Snowpark DataFrame.

        Uses the documented public APIs:
        - DataFrame.queries -> {"queries": [sql1, sql2, ...], "post_actions": [...]}
        - Optionally falls back to df._plan.sql() if needed.
        """
        queries_dict = getattr(df, "queries", None)

        if isinstance(queries_dict, dict):
            queries = queries_dict.get("queries")
            if isinstance(queries, list) and queries:
                candidates = [q.strip() for q in queries if isinstance(q, str) and q.strip()]
                if candidates:
                    return max(candidates, key=len)

        plan = getattr(df, "_plan", None)
        if plan is not None:
            with suppress(Exception):
                simplify = getattr(plan, "simplify", None)
                if callable(simplify):
                    simplified = simplify()
                    to_sql = getattr(simplified, "sql", None)
                    if callable(to_sql):
                        sql = to_sql()
                        if isinstance(sql, str) and sql.strip():
                            return sql.strip()

            with suppress(Exception):
                to_sql = getattr(plan, "sql", None)
                if callable(to_sql):
                    sql = to_sql()
                    if isinstance(sql, str) and sql.strip():
                        return sql.strip()

        return None
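
The GlobalStats lookup can be checked interactively in a Snowpark session. A sketch (the session and table are placeholders; the key casing matches what _extract_bytes_from_plan accepts, and the join over rows mirrors how the runtime reassembles plans that span several output rows):

    import json

    def explain_bytes(session, sql: str) -> int | None:
        rows = session.sql(f"EXPLAIN USING JSON {sql}").collect()
        if not rows:
            return None
        plan = json.loads("\n".join(str(r[0]) for r in rows))
        stats = plan.get("GlobalStats") or {}
        val = stats.get("bytesAssigned")
        return int(val) if val is not None else None

    # explain_bytes(session, "SELECT * FROM my_table")  # placeholder table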
