Skip to content

fastflowtransform.table_formats.spark_iceberg

IcebergFormatHandler

Bases: SparkFormatHandler

Iceberg format handler using Spark's Iceberg integration.

Responsibilities
  • save_df_as_table() with format("iceberg").
  • incremental_insert(): default SparkFormatHandler implementation (INSERT INTO).
  • incremental_merge(): uses Spark SQL MERGE INTO ... USING (...) syntax, which Iceberg supports when the catalog is configured for Iceberg.
Source code in src/fastflowtransform/table_formats/spark_iceberg.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
class IcebergFormatHandler(SparkFormatHandler):
    """
    Iceberg format handler using Spark's Iceberg integration.

    Responsibilities:
      - save_df_as_table() with format("iceberg").
      - incremental_insert(): default SparkFormatHandler implementation
        (INSERT INTO).
      - incremental_merge(): uses Spark SQL MERGE INTO ... USING (...) syntax,
        which Iceberg supports when the catalog is configured for Iceberg.

    All table references are normalized to fully-qualified three-part
    identifiers (``<catalog>.<db>.<table>``) before any catalog or SQL call.
    """

    def __init__(
        self,
        spark: SparkSession,
        *,
        table_options: dict[str, Any] | None = None,
        sql_runner: Callable[[str], Any] | None = None,
    ) -> None:
        # Copy so popping the catalog keys never mutates the caller's dict.
        options = dict(table_options or {})
        # The catalog name may arrive under either key; both are consumed so
        # they are not forwarded as Iceberg table properties.
        catalog = options.pop("catalog_name", None) or options.pop("__catalog_name__", None)
        # Default catalog name is "iceberg" when none is configured.
        self.catalog_name = str(catalog) if catalog else "iceberg"
        super().__init__(
            spark,
            table_format="iceberg",
            table_options=options,
            sql_runner=sql_runner,
        )

    # ---------- Core helpers ----------
    def _qualify_table_name(self, table_name: str, database: str | None = None) -> str:
        """
        Normalize arbitrary input like "seed_events" or "db.seed_events"
        to the fully-qualified Iceberg identifier "iceberg.db.seed_events".

        Raises:
            ValueError: if ``table_name`` is empty or whitespace-only.
        """
        raw = (table_name or "").strip()
        if not raw:
            raise ValueError("Empty table name for IcebergFormatHandler")

        # Drop empty segments so inputs like "db..table" don't produce
        # empty identifier parts.
        parts = [p for p in raw.split(".") if p]
        cat = self.catalog_name

        if len(parts) == 1:
            # table → iceberg.<current_db>.table
            db = database or self.spark.catalog.currentDatabase()
            return ".".join([cat, db, parts[0]])
        if len(parts) == 2:
            # db.table → iceberg.db.table
            return ".".join([cat, *parts])
        # len >= 3: assume already catalog.db.table
        return ".".join(parts)

    # ---------- Identifier overrides ----------
    def qualify_identifier(self, table_name: str, *, database: str | None = None) -> str:
        """Public alias for :meth:`_qualify_table_name` (base-class hook)."""
        return self._qualify_table_name(table_name, database=database)

    def allows_unmanaged_paths(self) -> bool:
        """Iceberg tables are always catalog-managed; path-based tables are rejected."""
        return False

    def relation_exists(self, table_name: str, *, database: str | None = None) -> bool:
        """Return True if the fully-qualified table exists in the Spark catalog.

        Any catalog error (e.g. unreachable metastore) is treated as
        "does not exist" rather than propagated.
        """
        ident = self.qualify_identifier(table_name, database=database)
        try:
            return bool(self.spark.catalog.tableExists(ident))
        except Exception:
            return False

    @staticmethod
    def _quote_part(value: str) -> str:
        """Backtick-quote one identifier part, escaping embedded backticks."""
        return f"`{value.replace('`', '``')}`"

    def _sql_identifier(self, table_name: str, *, database: str | None = None) -> str:
        """Return the fully-qualified identifier with every part backtick-quoted."""
        qualified = self._qualify_table_name(table_name, database=database)
        parts = [p for p in qualified.split(".") if p]
        return ".".join(self._quote_part(part) for part in parts)

    @staticmethod
    def _sql_literal(value: str) -> str:
        """Render *value* as a single-quoted SQL string literal (quotes doubled)."""
        return "'" + value.replace("'", "''") + "'"

    def _restore_table_metadata(
        self,
        table_ident: str,
        *,
        table_comment: str | None,
        column_comments: dict[str, str],
        table_properties: dict[str, Any],
    ) -> None:
        """Re-apply table/column comments and table properties after a replace.

        ``createOrReplace()`` drops comments and properties set on the previous
        table version; this best-effort pass restores them. Every statement is
        wrapped in ``suppress(Exception)`` so a metadata failure never fails
        the write itself.
        """
        if table_comment:
            with suppress(Exception):
                self.run_sql(
                    f"COMMENT ON TABLE {table_ident} IS {self._sql_literal(table_comment)}"
                )

        if table_properties:
            assignments = []
            for key, value in table_properties.items():
                if value is None:
                    continue
                key_str = str(key)
                # Skip Hive's auto-maintained timestamp property; setting it
                # manually is meaningless and may be rejected.
                if key_str.lower() in {"transient_lastddltime"}:
                    continue
                assignments.append(f"{self._sql_literal(key_str)}={self._sql_literal(str(value))}")
            if assignments:
                props = ", ".join(assignments)
                with suppress(Exception):
                    self.run_sql(f"ALTER TABLE {table_ident} SET TBLPROPERTIES ({props})")

        for name, comment in column_comments.items():
            if not comment:
                continue
            col_ident = f"{table_ident}.{self._quote_part(name)}"
            with suppress(Exception):
                self.run_sql(f"COMMENT ON COLUMN {col_ident} IS {self._sql_literal(comment)}")

    # ---------- Required API ----------
    def save_df_as_table(self, table_name: str, df: SDF) -> None:
        """
        Save DataFrame as an Iceberg table in the configured catalog.

        Uses DataFrameWriterV2:

            df.writeTo("iceberg.db.table").using("iceberg").createOrReplace()

        If the table already existed, its comment, column comments and table
        properties are captured beforehand and restored afterwards, since
        ``createOrReplace()`` discards them.
        """
        full_name = self._qualify_table_name(table_name)
        writer = df.writeTo(full_name).using("iceberg")
        # Remaining table_options (catalog keys were popped in __init__) become
        # Iceberg table properties on the new table version.
        for k, v in self.table_options.items():
            writer = writer.tableProperty(str(k), str(v))

        existed = False
        table_comment: str | None = None
        table_properties: dict[str, Any] = {}
        column_comments: dict[str, str] = {}
        table_ident = self._sql_identifier(table_name)

        try:
            existed = bool(self.spark.catalog.tableExists(full_name))
        except Exception:
            # Catalog lookup failure → treat as a fresh table; nothing to restore.
            existed = False

        if existed:
            # Snapshot table-level metadata (comment + properties); best effort.
            try:
                info = self.spark.catalog.getTable(full_name)
                table_comment = getattr(info, "description", None)
                props = getattr(info, "properties", None)
                if isinstance(props, dict):
                    table_properties = dict(props)
            except Exception:
                pass

            # Snapshot per-column comments; best effort.
            try:
                cols = self.spark.catalog.listColumns(full_name)
                for col in cols:
                    comment = getattr(col, "comment", None)
                    if comment:
                        column_comments[col.name] = comment
            except Exception:
                column_comments = {}

        # Upsert semantics for seeds / full-refresh
        writer.createOrReplace()

        if existed:
            self._restore_table_metadata(
                table_ident,
                table_comment=table_comment,
                column_comments=column_comments,
                table_properties=table_properties,
            )

    # ---------- Incremental API ----------
    def incremental_insert(self, table_name: str, select_body_sql: str) -> None:
        """Append the rows produced by *select_body_sql* via ``INSERT INTO``.

        Raises:
            ValueError: if the body does not start with ``SELECT`` (after
                stripping trailing semicolons/whitespace).
        """
        body = select_body_sql.strip().rstrip(";\n\t ")
        if not body.lower().startswith("select"):
            raise ValueError(f"incremental_insert expects SELECT body, got: {body[:40]!r}")

        full_name = self._sql_identifier(table_name)
        self.run_sql(f"INSERT INTO {full_name} {body}")

    def incremental_merge(
        self,
        table_name: str,
        select_body_sql: str,
        unique_key: list[str],
    ) -> None:
        """
        Iceberg MERGE implementation.

            MERGE INTO iceberg.db.table AS t
            USING (<select_body_sql>) AS s
            ON  AND-joined equality on unique_key
            WHEN MATCHED THEN UPDATE SET *
            WHEN NOT MATCHED THEN INSERT *

        With an empty ``unique_key`` there is nothing to match on, so this
        degrades to a plain :meth:`incremental_insert`.
        """
        body = select_body_sql.strip().rstrip(";\n\t ")
        if not unique_key:
            self.incremental_insert(table_name, body)
            return

        full_name = self._sql_identifier(table_name)
        pred = " AND ".join([f"t.`{k}` = s.`{k}`" for k in unique_key])

        self.run_sql(
            f"""
            MERGE INTO {full_name} AS t
            USING ({body}) AS s
            ON {pred}
            WHEN MATCHED THEN UPDATE SET *
            WHEN NOT MATCHED THEN INSERT *
            """
        )

save_df_as_table

save_df_as_table(table_name, df)

Save DataFrame as an Iceberg table in the configured catalog.

Uses DataFrameWriterV2:

df.writeTo("iceberg.db.table").using("iceberg").createOrReplace()
Source code in src/fastflowtransform/table_formats/spark_iceberg.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
def save_df_as_table(self, table_name: str, df: SDF) -> None:
    """
    Save DataFrame as an Iceberg table in the configured catalog.

    Uses DataFrameWriterV2:

        df.writeTo("iceberg.db.table").using("iceberg").createOrReplace()

    If the table already existed, its comment, column comments and table
    properties are captured beforehand and restored afterwards, since
    ``createOrReplace()`` discards them.
    """
    full_name = self._qualify_table_name(table_name)
    writer = df.writeTo(full_name).using("iceberg")
    # Remaining table_options become Iceberg table properties on the new table.
    for k, v in self.table_options.items():
        writer = writer.tableProperty(str(k), str(v))

    existed = False
    table_comment: str | None = None
    table_properties: dict[str, Any] = {}
    column_comments: dict[str, str] = {}
    table_ident = self._sql_identifier(table_name)

    try:
        existed = bool(self.spark.catalog.tableExists(full_name))
    except Exception:
        # Catalog lookup failure → treat as a fresh table; nothing to restore.
        existed = False

    if existed:
        # Snapshot table-level metadata (comment + properties); best effort.
        try:
            info = self.spark.catalog.getTable(full_name)
            table_comment = getattr(info, "description", None)
            props = getattr(info, "properties", None)
            if isinstance(props, dict):
                table_properties = dict(props)
        except Exception:
            pass

        # Snapshot per-column comments; best effort.
        try:
            cols = self.spark.catalog.listColumns(full_name)
            for col in cols:
                comment = getattr(col, "comment", None)
                if comment:
                    column_comments[col.name] = comment
        except Exception:
            column_comments = {}

    # Upsert semantics for seeds / full-refresh
    writer.createOrReplace()

    if existed:
        self._restore_table_metadata(
            table_ident,
            table_comment=table_comment,
            column_comments=column_comments,
            table_properties=table_properties,
        )

incremental_merge

incremental_merge(table_name, select_body_sql, unique_key)

Iceberg MERGE implementation.

MERGE INTO iceberg.db.table AS t
USING (<select_body_sql>) AS s
ON  AND-joined equality on unique_key
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
Source code in src/fastflowtransform/table_formats/spark_iceberg.py
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
def incremental_merge(
    self,
    table_name: str,
    select_body_sql: str,
    unique_key: list[str],
) -> None:
    """
    Run an Iceberg ``MERGE INTO`` against the target table.

    The generated statement has the shape:

        MERGE INTO iceberg.db.table AS t
        USING (<select_body_sql>) AS s
        ON  AND-joined equality on unique_key
        WHEN MATCHED THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *

    With no ``unique_key`` there is nothing to match on, so the merge
    degrades to a plain append via :meth:`incremental_insert`.
    """
    # Normalize the SELECT body: drop surrounding whitespace and any
    # trailing statement terminators.
    stripped_body = select_body_sql.strip().rstrip(";\n\t ")

    # Guard clause: no key columns -> append-only path.
    if not unique_key:
        self.incremental_insert(table_name, stripped_body)
        return

    target = self._sql_identifier(table_name)
    # Equality predicate joining target (t) and source (s) on every key column.
    join_condition = " AND ".join(f"t.`{col}` = s.`{col}`" for col in unique_key)

    merge_sql = f"""
        MERGE INTO {target} AS t
        USING ({stripped_body}) AS s
        ON {join_condition}
        WHEN MATCHED THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
        """
    self.run_sql(merge_sql)

run_sql

run_sql(sql)

Execute SQL via the injected runner (guardable in the executor).

Source code in src/fastflowtransform/table_formats/base.py
45
46
47
def run_sql(self, sql: str) -> Any:
    """Run *sql* through the injected runner, letting the executor guard it."""
    runner = self._sql_runner
    return runner(sql)

format_identifier_for_sql

format_identifier_for_sql(table_name, *, database=None)

Return a SQL-safe identifier (per-part quoted) for the table.

Source code in src/fastflowtransform/table_formats/base.py
54
55
56
57
58
59
60
def format_identifier_for_sql(self, table_name: str, *, database: str | None = None) -> str:
    """Return a SQL-safe identifier (per-part quoted) for the table."""
    ident = self.qualify_identifier(table_name, database=database)
    parts = [p for p in ident.split(".") if p]
    if not parts:
        return self._quote_part(ident)
    return ".".join(self._quote_part(part) for part in parts)