fastflowtransform.seeding

SeedTarget

Bases: NamedTuple

Resolved seed target (schema, table).

Source code in src/fastflowtransform/seeding.py, lines 204-208:

class SeedTarget(NamedTuple):
    """Resolved seed target (schema, table)."""

    schema: str | None
    table: str
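
A minimal illustrative sketch of working with the resolved target tuple; the concrete schema and table values are invented for the example:

from fastflowtransform.seeding import SeedTarget

# Hypothetical resolution result for a seed file at seeds/raw/users.csv.
target = SeedTarget(schema="raw", table="users")

# The fields unpack positionally as (schema, table); schema may be None.
schema, table = target
qualified = f"{schema}.{table}" if schema else table
assert qualified == "raw.users"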

materialize_seed

materialize_seed(table, df, executor, schema=None)

Materialize a DataFrame as a database table across engines.

Engine-specific logic is encapsulated in dedicated handlers (_handle_duckdb/_handle_sqlalchemy/_handle_spark). The dispatcher calls them in order until one of them takes over.

Source code in src/fastflowtransform/seeding.py, lines 462-476:

def materialize_seed(
    table: str, df: pd.DataFrame, executor: Any, schema: str | None = None
) -> None:
    """
    Materialize a DataFrame as a database table across engines.

    Engine-specific logic is encapsulated in dedicated handlers
    (_handle_duckdb/_handle_sqlalchemy/_handle_spark). The dispatcher
    calls them in order until one of them takes over.
    """
    for handler in _HANDLERS:
        if handler(table, df, executor, schema):
            return

    raise RuntimeError("No compatible executor connection for seeding found.")
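
The dispatch can be illustrated with a self-contained sketch. The handler below and the executor attribute it probes are invented for the example; only the "call each handler in order until one returns True" control flow mirrors the source:

from types import SimpleNamespace
from typing import Any, Callable

import pandas as pd

# Hypothetical handler: probes for an attribute that only our fake executor has.
def _handle_in_memory(table: str, df: pd.DataFrame, executor: Any, schema: str | None) -> bool:
    store = getattr(executor, "tables", None)
    if store is None:
        return False  # not our engine -> let the next handler try
    store[f"{schema}.{table}" if schema else table] = df
    return True  # handled

_DEMO_HANDLERS: list[Callable[..., bool]] = [_handle_in_memory]

def demo_materialize(table: str, df: pd.DataFrame, executor: Any, schema: str | None = None) -> None:
    for handler in _DEMO_HANDLERS:
        if handler(table, df, executor, schema):
            return
    raise RuntimeError("No compatible executor connection for seeding found.")

executor = SimpleNamespace(tables={})
demo_materialize("users", pd.DataFrame({"id": [1, 2]}), executor, schema="raw")
assert "raw.users" in executor.tables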

seed_project

seed_project(project_dir, executor, default_schema=None)

Load every seed file under <project>/seeds recursively and materialize it.

Supports configuration in seeds/schema.yml:

  - targets:
      <seed-id>:                 # e.g., "raw/users" (path-based, recommended)
        schema: <schema-name>    # global target schema
        table: <table-name>      # optional rename
        schema_by_engine:        # optional engine overrides
          postgres: raw
          duckdb: main
  - dtypes:
      <table-key>:
        column_a: string
        column_b: int64
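
For illustration, a filled-in seeds/schema.yml might look like this; the seed ID, schema names, renamed table, and dtypes are invented for the example:

targets:
  raw/users:                  # path-based seed ID for seeds/raw/users.csv
    schema: raw               # global target schema
    table: users_seed         # optional rename
    schema_by_engine:         # optional engine-specific overrides
      postgres: raw
      duckdb: main
dtypes:
  users_seed:                 # keyed by the resolved table name (rename-aware)
    id: int64
    email: string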

Resolution priority for (schema, table):

  1) targets[<seed-id>] (e.g., "raw/users")
  2) targets[<seed-id with dots>] (e.g., "raw.users")
  3) targets[<stem>] (only if the stem is unique)
  4) executor.schema or default_schema

Returns:

    int: Number of successfully materialized seed tables.

Raises:

    ValueError: if schema.yml uses a plain stem key while multiple files share that stem.

Source code in src/fastflowtransform/seeding.py, lines 482-565:

def seed_project(project_dir: Path, executor: Any, default_schema: str | None = None) -> int:
    """
    Load every seed file under <project>/seeds recursively and materialize it.

    Supports configuration in seeds/schema.yml:
      - targets:
          <seed-id>:                 # e.g., "raw/users" (path-based, recommended)
            schema: <schema-name>    # global target schema
            table: <table-name>      # optional rename
            schema_by_engine:        # optional engine overrides
              postgres: raw
              duckdb: main
      - dtypes:
          <table-key>:
            column_a: string
            column_b: int64

    Resolution priority for (schema, table):
      1) targets[<seed-id>]  (e.g., "raw/users")
      2) targets[<seed-id with dots>] (e.g., "raw.users")
      3) targets[<stem>] (*only* if stem is unique)
      4) executor.schema or default_schema

    Returns:
      Number of successfully materialized seed tables.

    Raises:
      ValueError: if schema.yml uses a plain stem key while multiple files share that stem.
    """
    seeds_dir = project_dir / "seeds"
    if not seeds_dir.exists():
        return 0

    schema_cfg = None
    schema_file = seeds_dir / "schema.yml"
    if schema_file.exists():
        schema_cfg = yaml.safe_load(schema_file.read_text(encoding="utf-8"))

    # Collect seed files recursively to allow folder-based schema conventions.
    paths: list[Path] = [
        p
        for p in sorted(seeds_dir.rglob("*"))
        if p.is_file() and p.suffix.lower() in (".csv", ".parquet", ".pq")
    ]
    if not paths:
        return 0

    # Check for ambiguous stems (same filename in different folders).
    stem_counts: dict[str, int] = {}
    for p in paths:
        stem_counts[p.stem] = stem_counts.get(p.stem, 0) + 1

    count = 0
    for path in paths:
        seedid = _seed_id(seeds_dir, path)
        stem = path.stem

        # Default schema may come from executor or caller.
        base_schema = getattr(executor, "schema", None) or default_schema
        schema, table = _resolve_schema_and_table_by_cfg(
            seedid, stem, schema_cfg, executor, base_schema
        )

        # If schema.yml uses a bare stem while that stem exists multiple times,
        # force disambiguation.
        if (
            schema_cfg
            and (schema_cfg.get("targets") or {}).get(stem)
            and stem_counts.get(stem, 0) > 1
        ):
            raise ValueError(
                f'Seed stem "{stem}" appears multiple times. '
                f"Please configure using the path-based seed ID "
                f'(e.g., "{seedid}") in seeds/schema.yml.'
            )

        df = _read_seed_file(path)
        # Use the resolved *table* key for dtypes (allows rename-aware dtype mapping in cfg).
        df = _apply_schema(df, table, schema_cfg)

        materialize_seed(table, df, executor, schema=schema)
        count += 1

    return count
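
A small, self-contained illustration of how seed files are discovered; the directory names are invented, the suffix filter mirrors the listing above, and the path-based seed IDs (e.g., "raw/users") follow the docstring convention rather than the _seed_id helper, which is not shown here:

import tempfile
from pathlib import Path

project_dir = Path(tempfile.mkdtemp())
seeds = project_dir / "seeds"
(seeds / "raw").mkdir(parents=True)
(seeds / "raw" / "users.csv").write_text("id,email\n1,a@example.com\n", encoding="utf-8")
(seeds / "marts").mkdir()
(seeds / "marts" / "users.csv").write_text("id\n1\n", encoding="utf-8")

paths = [
    p
    for p in sorted(seeds.rglob("*"))
    if p.is_file() and p.suffix.lower() in (".csv", ".parquet", ".pq")
]
print([p.relative_to(seeds).as_posix() for p in paths])
# ['marts/users.csv', 'raw/users.csv'] -> the stem "users" appears twice, so
# seeds/schema.yml must address these seeds via their path-based IDs
# ("marts/users", "raw/users") rather than the bare stem "users".

With a real engine executor, seed_project(project_dir, executor) would then materialize both tables and return 2.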