Skip to content

fastflowtransform.lineage

infer_sql_lineage

infer_sql_lineage(rendered_sql, ref_map=None)

Infer a mapping from output column -> upstream sources (relation.column) for common patterns:

- `<alias>.<col> AS <out>`
- `<col> AS <out>` (relation unknown)
- `FUNC(<alias>.<col>) AS <out>` → transformed=True
- bare `<alias>.<col>` → out=`<col>`, direct

Joins with aliases are resolved via `<alias>` → relation from FROM/JOIN.

Source code in src/fastflowtransform/lineage.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
def infer_sql_lineage(rendered_sql: str, ref_map: dict[str, str] | None = None) -> LineageMap:
    """
    Infer a mapping from output column -> upstream sources (relation.column) for common patterns:
      - <alias>.<col> AS <out>
      - <col> AS <out>             (relation unknown)
      - FUNC(<alias>.<col>) AS <out>  → transformed=True
      - bare <alias>.<col>          → out=<col>, direct
    Joins with aliases are resolved via <alias> → relation from FROM/JOIN.
    """
    lineage: LineageMap = {}
    if not rendered_sql:
        return lineage

    alias_map = ref_map or _alias_map_from_sql(rendered_sql)

    m = _SEL_RE.search(rendered_sql)
    if not m:
        return lineage
    select_clause = m.group(1)
    exprs = _split_select_list(select_clause)

    # Patterns
    as_pat = re.compile(r"^(?P<expr>.+?)\s+as\s+(?P<alias>[a-zA-Z_][\w\$]*)$", re.IGNORECASE)
    qual_col = re.compile(r"^(?P<a>[a-zA-Z_]\w*)\.(?P<c>[a-zA-Z_]\w*)$")
    func_of_qual = re.compile(
        r"^[a-zA-Z_]\w*\s*\(\s*(?P<a>[a-zA-Z_]\w*)\.(?P<c>[a-zA-Z_]\w*)\s*\)\s*$", re.IGNORECASE
    )

    for raw in exprs:
        expr = raw.strip()
        if expr == "*" or not expr:
            continue

        out_col: str | None = None
        expr_only = expr
        m_as = as_pat.match(expr)
        if m_as:
            out_col = m_as.group("alias")
            expr_only = m_as.group("expr").strip()

        # func(alias.col)
        m_func = func_of_qual.match(expr_only)
        if m_func:
            a, c = m_func.group("a"), m_func.group("c")
            rel = alias_map.get(a)
            item = {
                "from_relation": rel or "?",
                "from_column": c,
                "transformed": True,
            }
            if out_col is None:
                out_col = c  # best-effort
            _append_lineage(lineage, out_col, item)
            continue

        # alias.col
        m_q = qual_col.match(expr_only)
        if m_q:
            a, c = m_q.group("a"), m_q.group("c")
            rel = alias_map.get(a)
            item = {
                "from_relation": rel or "?",
                "from_column": c,
                "transformed": False,
            }
            if out_col is None:
                out_col = c
            _append_lineage(lineage, out_col, item)
            continue

        # plain col (no qualifier) - we can only map column name with unknown relation
        m_col = re.match(r"^[a-zA-Z_]\w*$", expr_only)
        if m_col:
            c = expr_only
            item = {"from_relation": "?", "from_column": c, "transformed": False}
            if out_col is None:
                out_col = c
            _append_lineage(lineage, out_col, item)
            continue

        # func(col) or complex expression → mark as transformed with unknown relation/col
        _append_lineage(
            lineage, out_col, {"from_relation": "?", "from_column": "?", "transformed": True}
        )

    return lineage

infer_py_lineage

infer_py_lineage(func, requires=None, source_code=None)
Very small regex-based inference for common pandas patterns
  • out["x"] = df["y"] → x <- y (direct)
  • df.rename(columns={"y": "x"}) → x <- y (transformed=True)
  • .assign(x=lambda d: d["y"].str.upper()) → x <- y (transformed=True) [best-effort]

Relation is unknown ("?"); full mapping across multiple inputs would require deeper analysis.

Source code in src/fastflowtransform/lineage.py
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
def infer_py_lineage(
    func: Any, requires: dict | None = None, source_code: str | None = None
) -> LineageMap:
    """
    Very small regex-based inference for common pandas patterns:
      - out["x"] = df["y"]                      → x <- y (direct)
      - df.rename(columns={"y": "x"})           → x <- y (transformed=True)
      - .assign(x=lambda d: d["y"].str.upper()) → x <- y (transformed=True)  [best-effort]
    Relation is unknown ("?"); full mapping across multiple inputs would require deeper analysis.
    """
    code = source_code or ""
    try:
        if not code and func is not None:
            code = inspect.getsource(func)
    except Exception:
        pass

    lineage: LineageMap = {}
    if not code:
        return lineage

    # Assign pattern: out["x"] = df["y"]
    for m in _ASSIGN_RE.finditer(code):
        out_col = m.group("out")
        src_col = m.group("col")
        _append_lineage(
            lineage, out_col, {"from_relation": "?", "from_column": src_col, "transformed": False}
        )

    # Rename pattern: .rename(columns={"old":"new"})
    for m in _RENAME_RE.finditer(code):
        pairs = m.group("pairs")
        for p in _PAIR_RE.finditer(pairs):
            old, new = p.group("old"), p.group("new")
            _append_lineage(
                lineage, new, {"from_relation": "?", "from_column": old, "transformed": True}
            )

    # assign(x=lambda d: ...)
    for m in _ASSIGN_LAMBDA_RE.finditer(code):
        out_col = m.group(1)
        body = m.group("body")
        m2 = _BODY_SRC_COL.search(body)
        if m2:
            src_col = m2.group("col")
            _append_lineage(
                lineage,
                out_col,
                {"from_relation": "?", "from_column": src_col, "transformed": True},
            )

    return lineage

parse_sql_lineage_overrides

parse_sql_lineage_overrides(sql_text)
Parse optional SQL comment directives

-- @lineage email_upper: users.email (transformed)

Source code in src/fastflowtransform/lineage.py
246
247
248
249
250
251
252
253
254
255
256
257
def parse_sql_lineage_overrides(sql_text: str) -> LineageMap:
    """
    Collect lineage overrides declared in SQL comment directives such as:
        -- @lineage email_upper: users.email (transformed)

    Every match of _LINEAGE_DIRECTIVE contributes one entry: capture groups 1-3
    supply the output column, source relation and source column, and
    "transformed" is the truthiness of group 4 (presumably the optional
    "(transformed)" suffix — confirm against the pattern definition).

    Args:
        sql_text: SQL text to scan; None or "" yields an empty map.

    Returns:
        Output column → list of override entries, in order of appearance.
    """
    overrides: LineageMap = {}
    for hit in _LINEAGE_DIRECTIVE.finditer(sql_text or ""):
        target, relation, column, flag = hit.group(1, 2, 3, 4)
        entry = {"from_relation": relation, "from_column": column, "transformed": bool(flag)}
        if target not in overrides:
            overrides[target] = []
        overrides[target].append(entry)
    return overrides

merge_lineage

merge_lineage(*maps)

Merge multiple lineage maps. Later maps take precedence per output column: if a later map contains a column, its entries replace (rather than extend) the entries from earlier maps for that column. Columns that appear only in earlier maps are kept unchanged.

Source code in src/fastflowtransform/lineage.py
260
261
262
263
264
265
266
267
268
269
270
271
def merge_lineage(*maps: LineageMap | None) -> LineageMap:
    """
    Merge multiple lineage maps. Later maps override/extend earlier ones by output column.
    If a later map provides any entries for a column, it replaces previous entries for that column.
    """
    merged: LineageMap = {}
    for mp in maps:
        if not mp:
            continue
        for out_col, items in mp.items():
            merged[out_col] = list(items)
    return merged