feat: Adding pg_legacy_replication verified source using decoderbufs #589

Open

wants to merge 84 commits into base: master

Changes from 78 commits

84 commits
59e7557
fix: finally got pg_replication tests working as is
Sep 17, 2024
79220b7
feat: got decoderbufs to run and compile in docker
Sep 17, 2024
9de0835
chore: updated protobuf to latest compatible version
Sep 17, 2024
75a0f7f
chore: copying all files from pg_replication; format-lint is reformat…
Sep 18, 2024
73704af
wip: saving work
Sep 18, 2024
7d1b8e7
wip: saving work
Oct 1, 2024
ecbf98d
wip: saving work
Oct 2, 2024
3ed14da
wip: removed all references to publications
Oct 3, 2024
9fe0301
fix: applied suggested changes mentioned here https://github.com/dlt-…
Oct 8, 2024
197ba82
wip: saving work
Oct 8, 2024
c897ee0
wip: finally got snapshot to work
Oct 9, 2024
d303c04
chore: simply cleaning up
Oct 9, 2024
6566fe4
chore: need to find a better way to clean up the underlying engine
Oct 9, 2024
70d40a0
wip: handling begin/commit
Oct 11, 2024
f703431
wip: saving work
Oct 11, 2024
f001633
wip: saving work
Oct 11, 2024
c0df7c9
wip: saving work
Oct 14, 2024
aa464d5
wip: saving work
Oct 17, 2024
db09568
wip: making progress
Oct 17, 2024
c3c0518
wip: saving work
Oct 19, 2024
a5b1a87
refactor: some test refactoring
Oct 19, 2024
7fad621
wip: saving work
Oct 20, 2024
fbc65bc
wip: saving work
Oct 20, 2024
1299b60
wip: cleaning up + refactor
Oct 21, 2024
f44853b
wip: cleaning up + refactor
Oct 21, 2024
46200ca
wip: cleaning up + refactor
Oct 21, 2024
f0f0146
wip: slowly progressing
Oct 22, 2024
cd8d906
wip: all tests pass now to update docs and cleanup
Oct 22, 2024
02851f4
wip: still trying to get it work with all versions of dlt
Oct 22, 2024
beef6ea
wip
Oct 22, 2024
77242e8
wip: changing signature
Oct 23, 2024
f9cdf78
wip: finally got rid of those errors
Oct 23, 2024
327b44c
wip: correcting failing tests
Oct 23, 2024
a9a9bb7
wip: fixed working examples
Oct 23, 2024
37acc35
wip: more refactoring now docs... -_-
Oct 23, 2024
a90acee
wip: cleaning up
Oct 23, 2024
f927f13
wip: cleaning up
Oct 23, 2024
cc7ad61
wip: attempting to refactor to use dlt resources
Oct 24, 2024
f9c7694
wip: second test passing
Oct 24, 2024
927ae03
wip: all tests pass again now for refactoring
Oct 25, 2024
3f31752
wip: init_replication is now a dlt source
Oct 25, 2024
637a6e9
wip: more refactoring
Oct 25, 2024
8a8134b
wip: saving work until I can get hinting to work
Oct 25, 2024
ee3cb9c
wip: finally got something somewhat working
Oct 26, 2024
1727456
wip: done with coding now docs
Oct 27, 2024
81fdce8
fix: various performance improvements
Oct 28, 2024
8fbfc62
fix: minor corrections to handle old versions of postgres
Oct 28, 2024
fd4638b
fix: small type corrections for pg9.6
Oct 29, 2024
526eff3
fix: exposing table options for later arrow support
Oct 29, 2024
2f5ad15
wip: saving work for arrow
Oct 30, 2024
32063e2
wip: first test with arrow passing
Oct 30, 2024
28f463d
wip: almost done passing all tests
Oct 30, 2024
385e8a6
wip: some arrow tests are still not passing
Oct 30, 2024
a291b69
fix: done with pyarrow; too many issues with duckdb atm
Oct 31, 2024
ba23505
wip: some bug fixes
Oct 31, 2024
5993fb4
wip: small refactoring
Nov 3, 2024
6db693a
wip: duckdb needs patching, trying out new max_lsn
Nov 3, 2024
c53c9f9
wip: some refactoring of options to make certain features togglable
Nov 14, 2024
ba1c3fc
wip: lsn and deleted ts are optional
Nov 15, 2024
6b960df
feat: added optional transaction id
Nov 16, 2024
9fa9d98
feat: added optional commit timestamp
Nov 17, 2024
1947029
fix: never handled missing type and added text oid mapping
Nov 18, 2024
7a7ba30
fix: added some logging and bug fixes
Nov 20, 2024
a752581
chore: basic refactoring
Nov 27, 2024
4184ca9
fix: minor corrections
Dec 16, 2024
3c7232f
chore: reverting back to prev state
Dec 16, 2024
c8f1ad2
chore: rebasing 1.x branch onto my own
Dec 16, 2024
7024ce7
fix: corrected bug regarding column names
Dec 19, 2024
63b1de0
chore: minor fixes
Dec 19, 2024
e8b2a0c
chore: small perf fixes and aligning with more adt
Dec 20, 2024
4c33129
chore: refactoring and cleaning
Dec 20, 2024
0b7c151
chore: finished docstrings
Dec 22, 2024
ec72e36
bugfix: misuse of defaultdict
Dec 22, 2024
ecc6089
Finally done with docs
Dec 23, 2024
dd5a63b
fix: wasn't able to execute local tests without these settings
Dec 30, 2024
d377423
feat: added basic support for scalar array types
Jan 14, 2025
acdf446
chore: slight perf improvments for pg_arrays
Jan 15, 2025
a3dc99d
fix: it turns out pg_arrays are annoying found temp workaround
Jan 16, 2025
c9c5bcb
refactor: all sqlalchemy event code is done at engine configuration
Jan 22, 2025
d695afb
chore: bumped python to 3.9; small refactorings
Jan 22, 2025
8f45283
refactor: init_replication is now in pkg ns
Jan 22, 2025
41f8ded
fix: corrected bugs regarding inferring nullability wrong; refactored…
Jan 28, 2025
129b18a
fix: rolling back on managing conn lifecycle using context mgrs: it d…
Jan 29, 2025
9083611
fix: corrected regression with occasional datum_missinng values
Jan 30, 2025
163 changes: 146 additions & 17 deletions poetry.lock

Large diffs are not rendered by default.

10 changes: 9 additions & 1 deletion pyproject.toml
@@ -45,6 +45,9 @@ pytest-mock = "^3.12.0"
twisted = "22.10.0"
pytest-forked = "^1.6.0"
pendulum = "^3.0.0"
types-protobuf = "^5.27.0.20240907"
pytest-cov = "^5.0.0"
mypy-protobuf = "^3.6.0"

[tool.poetry.group.sql_database.dependencies]
sqlalchemy = ">=1.4"
@@ -54,6 +57,11 @@ connectorx = ">=0.3.1"
[tool.poetry.group.pg_replication.dependencies]
psycopg2-binary = ">=2.9.9"

[tool.poetry.group.pg_legacy_replication.dependencies]
protobuf = ">=4.25"
psycopg2-binary = ">=2.9.9"
sqlalchemy = ">=1.4"

[tool.poetry.group.google_sheets.dependencies]
google-api-python-client = "^2.78.0"

@@ -116,4 +124,4 @@ requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.black]
include = '.*py$'
include = '.*py$'
6 changes: 5 additions & 1 deletion sources/.dlt/example.secrets.toml
@@ -16,7 +16,11 @@ location = "US"
### Sources
[sources]

# local postgres
helpers.credentials="postgresql://loader:loader@localhost:5432/dlt_data"
pg_legacy_replication.credentials="postgresql://loader:loader@localhost:5432/dlt_data"

## chess pipeline
# the section below defines secrets for "chess_dlt_config_example" source in chess/__init__.py
[sources.chess]
secret_str="secret string" # a string secret
secret_str="secret string" # a string secret
130 changes: 130 additions & 0 deletions sources/pg_legacy_replication/README.md
@@ -0,0 +1,130 @@
# Postgres legacy replication
[Postgres](https://www.postgresql.org/) is one of the most popular relational database management systems. This verified source uses Postgres' replication functionality to efficiently process changes
in tables (a process often referred to as _Change Data Capture_ or CDC). It uses [logical decoding](https://www.postgresql.org/docs/current/logicaldecoding.html) and the optional `decoderbufs`
[output plugin](https://github.com/debezium/postgres-decoderbufs), a shared library that must be built or enabled.

Contributor

what does it mean that decoderbufs is optional? if not present, we decode on the client?

Author

The heavy lifting is done by the decoderbufs extension, which must be added when using a managed Postgres such as Cloud SQL, or compiled and installed on a self-hosted Postgres installation.

Detailed instructions can be found here: https://debezium.io/documentation/reference/stable/postgres-plugins.html#logical-decoding-output-plugin-installation

FYI decoderbufs is the default logical replication plugin used by Debezium.

| Source | Description |
|---------------------|-------------------------------------------------|
| replication_source | Load published messages from a replication slot |

## Install decoderbufs

Instructions can be found [here](https://github.com/debezium/postgres-decoderbufs?tab=readme-ov-file#building).

Below is an example installation in a Docker image:
```Dockerfile
FROM postgres:14

# Install dependencies required to build decoderbufs
RUN apt-get update
RUN apt-get install -f -y \
software-properties-common \
build-essential \
pkg-config \
git

RUN apt-get install -f -y \
postgresql-server-dev-14 \
libprotobuf-c-dev && \
rm -rf /var/lib/apt/lists/*

ARG decoderbufs_version=v1.7.0.Final
RUN git clone https://github.com/debezium/postgres-decoderbufs -b $decoderbufs_version --single-branch && \
cd postgres-decoderbufs && \
make && make install && \
cd .. && \
rm -rf postgres-decoderbufs
```
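Note that installing the shared library is not enough on its own: the server also has to load it and run with logical decoding enabled. Below is a minimal sketch of the relevant `postgresql.conf` settings; the exact values, and how you set them, depend on your hosting environment, so treat these as assumptions to adapt:

```
# postgresql.conf (illustrative values, adjust to your environment)
shared_preload_libraries = 'decoderbufs'   # load the output plugin at server start
wal_level = logical                        # required for logical decoding
max_replication_slots = 4                  # at least one free slot for this source
max_wal_senders = 4                        # at least one sender for the replication connection
```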

## Initialize the pipeline

```bash
$ dlt init pg_legacy_replication duckdb
```

This uses `duckdb` as the destination, but you can choose any of the supported [destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/).

## Set up user

The Postgres user needs to have the `LOGIN` and `REPLICATION` attributes assigned:

```sql
CREATE ROLE replication_user WITH LOGIN REPLICATION;
```

It also needs read-only privileges on the database (connect to the database first to grant them):

```sql
\connect dlt_data
GRANT USAGE ON SCHEMA schema_name TO replication_user;
GRANT SELECT ON ALL TABLES IN SCHEMA schema_name TO replication_user;
ALTER DEFAULT PRIVILEGES IN SCHEMA schema_name GRANT SELECT ON TABLES TO replication_user;
```
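Replication itself happens through a logical replication slot bound to the `decoderbufs` output plugin. If you want to create the slot manually, a sketch is shown below; the slot name `dlt_slot` is just an example, reuse whatever name you pass to the source:

```sql
-- Create a logical replication slot that uses the decoderbufs output plugin
SELECT * FROM pg_create_logical_replication_slot('dlt_slot', 'decoderbufs');
```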

## Add credentials
1. Open `.dlt/secrets.toml`.
2. Enter your Postgres credentials:

```toml
[sources.pg_legacy_replication]
credentials="postgresql://replication_user:<<password>>@localhost:5432/dlt_data"
```
3. Enter credentials for your chosen destination as per the [docs](https://dlthub.com/docs/dlt-ecosystem/destinations/).

## Run the pipeline

1. Install the necessary dependencies by running the following command:

```bash
pip install -r requirements.txt
```

1. Now the pipeline can be run by using the command:

```bash
python pg_legacy_replication_pipeline.py
```

1. To make sure that everything is loaded as expected, use the command:

```bash
dlt pipeline pg_replication_pipeline show
```
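The generated `pg_legacy_replication_pipeline.py` script contains full examples. As a rough, minimal sketch of how the pieces fit together (the slot, schema, and table names below are assumptions for illustration):

```python
import dlt

from pg_legacy_replication import init_replication, replication_source

# Assumed names for the example; adjust to your setup.
SLOT_NAME = "dlt_slot"
SCHEMA = "public"
TABLE_NAMES = ("tbl_x", "tbl_y")

pipeline = dlt.pipeline(
    pipeline_name="pg_legacy_replication_pipeline",
    destination="duckdb",
    dataset_name="replicated_data",
)

# Optional: take an initial snapshot of the tables before consuming the WAL.
snapshots = init_replication(
    slot_name=SLOT_NAME,
    schema=SCHEMA,
    table_names=TABLE_NAMES,
    take_snapshots=True,
)
pipeline.run(snapshots)

# Consume changes published to the replication slot and load them incrementally.
changes = replication_source(
    slot_name=SLOT_NAME,
    schema=SCHEMA,
    table_names=TABLE_NAMES,
)
pipeline.run(changes)
```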

# Differences between `pg_legacy_replication` and `pg_replication`

## Overview

`pg_legacy_replication` is a fork of the verified `pg_replication` source. The primary goal of this fork is to provide logical replication capabilities for Postgres instances running versions
earlier than 10, when the `pgoutput` plugin was not yet available. This fork draws inspiration from the original `pg_replication` source and the `decoderbufs` library,
which is actively maintained by Debezium.

## Key Differences from `pg_replication`

### Replication User Ownership Requirements
One of the limitations of native Postgres replication is that the replication user must **own** the tables in order to add them to a **publication**.
Additionally, once a table is added to a publication, it cannot be removed, requiring the creation of a new replication slot, which results in the loss of any state tracking.

### Limitations in `pg_replication`
The current pg_replication implementation has several limitations:
- It supports only a single initial snapshot of the data.
- It requires `CREATE` access to the source database in order to perform the initial snapshot.
- **Superuser** access is required to replicate entire Postgres schemas.
While the `pg_legacy_replication` source theoretically reads the entire WAL across all schemas, the current implementation using dlt transformers restricts this functionality.
In practice, this has not been a common use case.
- The implementation is opinionated in its approach to data transfer. Specifically, when updates or deletes are required, it defaults to a `merge` write disposition,
which replicates live data without tracking changes over time.

### Features of `pg_legacy_replication`

This fork of `pg_replication` addresses the aforementioned limitations and introduces the following improvements:
- Adheres to the dlt philosophy by treating the WAL as an upstream resource. This replication stream is then transformed into various dlt resources, with customizable options for write disposition,
file formats, type hints, etc., specified at the resource level rather than at the source level (see the sketch after this list).
- Supports an initial snapshot of all tables using the transaction slot isolation level. Additionally, ad-hoc snapshots can be performed using the serializable deferred isolation level,
similar to `pg_dump`.
- Emphasizes the use of `pyarrow` and parquet formats for efficient data storage and transfer. A dedicated backend has been implemented to support these formats.
- Replication messages are decoded using Protocol Buffers (protobufs) in C, rather than relying on native Python byte buffer parsing. This ensures greater efficiency and performance.
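For instance, per-table behavior is configured through `repl_options`, and hints such as write disposition are applied on the individual resources rather than on the source. A minimal sketch, building on the pipeline example above (table and column names are illustrative):

```python
changes = replication_source(
    slot_name="dlt_slot",
    schema="public",
    table_names=("tbl_x", "tbl_y"),
    repl_options={
        "tbl_x": {"backend": "pyarrow", "included_columns": {"id_x", "val_x"}},
        "tbl_y": {"include_lsn": False},
    },
)
# Hints are set per resource, not on the source.
changes.tbl_x.apply_hints(write_disposition="merge", primary_key="id_x")
changes.tbl_y.apply_hints(write_disposition="append")
```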

## Next steps
- Add support for the [wal2json](https://github.com/eulerto/wal2json) replication plugin. This is particularly important for environments such as **Amazon RDS**, which supports `wal2json`,
  as opposed to on-premises or Google Cloud SQL instances that support `decoderbufs`.
140 changes: 140 additions & 0 deletions sources/pg_legacy_replication/__init__.py
@@ -0,0 +1,140 @@
"""Replicates postgres tables in batch using logical decoding."""

from typing import Any, Callable, Dict, Iterable, Mapping, Optional, Sequence, Union

import dlt
from dlt.extract import DltResource
from dlt.extract.items import TDataItem
from dlt.sources.credentials import ConnectionStringCredentials
from collections import defaultdict

from .helpers import (
    BackendHandler,
    ItemGenerator,
    ReplicationOptions,
    advance_slot,
    cleanup_snapshot_resources,
    get_max_lsn,
    init_replication,
)


@dlt.source
def replication_source(
    slot_name: str,
    schema: str,
    table_names: Union[str, Sequence[str]],
    credentials: ConnectionStringCredentials = dlt.secrets.value,
    repl_options: Optional[Mapping[str, ReplicationOptions]] = None,
    target_batch_size: int = 1000,
    flush_slot: bool = True,
) -> Iterable[DltResource]:
"""
Defines a dlt source for replicating Postgres tables using logical replication.
This source reads from a replication slot and pipes the changes using transformers.

- Relies on a replication slot that publishes DML operations (i.e. `insert`, `update`, and `delete`).
- Maintains LSN of last consumed message in state to track progress.
- At start of the run, advances the slot upto last consumed message in previous run (for pg>10 only)
- Processes in batches to limit memory usage.

Args:
slot_name (str):
The name of the logical replication slot used to fetch WAL changes.
schema (str):
Name of the schema to replicate tables from.
table_names (Union[str, Sequence[str]]):
The name(s) of the tables to replicate. Can be a single table name or a list of table names.
credentials (ConnectionStringCredentials):
Database credentials for connecting to the Postgres instance.
repl_options (Optional[Mapping[str, ReplicationOptions]], optional):
A mapping of table names to `ReplicationOptions`, allowing for fine-grained control over
replication behavior for each table.

Each `ReplicationOptions` dictionary can include the following keys:
- `backend` (Optional[TableBackend]): Specifies the backend to use for table replication.
- `backend_kwargs` (Optional[Mapping[str, Any]]): Additional configuration options for the backend.
- `column_hints` (Optional[TTableSchemaColumns]): A dictionary of hints for column types or properties.
- `include_lsn` (Optional[bool]): Whether to include the LSN (Log Sequence Number)
in the replicated data. Defaults to `True`.
- `include_deleted_ts` (Optional[bool]): Whether to include a timestamp for deleted rows.
Defaults to `True`.
- `include_commit_ts` (Optional[bool]): Whether to include the commit timestamp of each change.
- `include_tx_id` (Optional[bool]): Whether to include the transaction ID of each change.
- `included_columns` (Optional[Set[str]]): A set of specific columns to include in the replication.
If not specified, all columns are included.
target_batch_size (int, optional):
The target size of each batch of replicated data items. Defaults to `1000`.
flush_slot (bool, optional):
If `True`, advances the replication slot to the last processed LSN
to prevent replaying already replicated changes. Defaults to `True`.

Yields:
Iterable[DltResource]:
A collection of `DltResource` objects, each corresponding to a table being replicated.

Notes:
- The `repl_options` parameter allows fine-tuning of replication behavior, such as column filtering
or write disposition configuration, per table.
- The replication process is incremental, ensuring only new changes are processed after the last commit LSN.
"""
    table_names = [table_names] if isinstance(table_names, str) else table_names or []
    repl_options = defaultdict(lambda: ReplicationOptions(), repl_options or {})

    @dlt.resource(name=lambda args: args["slot_name"], standalone=True)
    def replication_resource(slot_name: str) -> Iterable[TDataItem]:
        # start where we left off in previous run
        start_lsn = dlt.current.resource_state().get("last_commit_lsn", 0)
        if flush_slot:
            advance_slot(start_lsn, slot_name, credentials)

        # continue until last message in replication slot
        upto_lsn = get_max_lsn(credentials)
        if upto_lsn is None:
            return

        table_qnames = {f"{schema}.{table_name}" for table_name in table_names}

        # generate items in batches
        while True:
            gen = ItemGenerator(
                credentials=credentials,
                slot_name=slot_name,
                table_qnames=table_qnames,
                upto_lsn=upto_lsn,
                start_lsn=start_lsn,
                repl_options=repl_options,
                target_batch_size=target_batch_size,
            )
            yield from gen
            if gen.generated_all:
                dlt.current.resource_state()["last_commit_lsn"] = gen.last_commit_lsn
                break
            start_lsn = gen.last_commit_lsn

    wal_reader = replication_resource(slot_name)

    for table in table_names:
        yield dlt.transformer(
            _create_table_dispatch(table, repl_options=repl_options[table]),
            data_from=wal_reader,
            name=table,
        )


def _create_table_dispatch(
    table: str, repl_options: ReplicationOptions
) -> Callable[[TDataItem], Any]:
    """Creates a dispatch handler that processes data items based on a specified table and optional column hints."""
    handler = BackendHandler(table, repl_options)
    # FIXME Uhhh.. why do I have to do this?
    handler.__qualname__ = "BackendHandler.__call__"  # type: ignore[attr-defined]
Contributor

yeah! why? is dlt checking this somewhere?

Author

Yes. I have no idea why this is required; here is the output with the line commented out:

~/repos/github/neuromantik33/verified-sources $ poetry run pytest tests/pg_legacy_replication/test_pg_replication.py::test_core_functionality 
============================= test session starts ==============================
platform linux -- Python 3.11.11, pytest-7.4.2, pluggy-1.3.0 -- /home/drnick/repos/github/neuromantik33/verified-sources/.venv/bin/python
cachedir: .pytest_cache
rootdir: /home/drnick/repos/github/neuromantik33/verified-sources
configfile: pytest.ini
plugins: mock-3.12.0, requests-mock-1.11.0, cov-5.0.0, forked-1.6.0, anyio-4.0.0
collecting ... collected 2 items

tests/pg_legacy_replication/test_pg_replication.py::test_core_functionality[sqlalchemy-duckdb] FAILED [ 50%]
tests/pg_legacy_replication/test_pg_replication.py::test_core_functionality[pyarrow-duckdb] FAILED [100%]

=================================== FAILURES ===================================
__________________ test_core_functionality[sqlalchemy-duckdb] __________________

src_config = (<dlt.pipeline.pipeline.Pipeline object at 0x7f96cf51f3d0>, 'test_slot_a72aac18')
destination_name = 'duckdb', backend = 'sqlalchemy'

    @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS)
    @pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"])
    def test_core_functionality(
        src_config: Tuple[dlt.Pipeline, str], destination_name: str, backend: TableBackend
    ) -> None:
        @dlt.resource(write_disposition="merge", primary_key="id_x")
        def tbl_x(data):
            yield data
    
        @dlt.resource(write_disposition="merge", primary_key="id_y")
        def tbl_y(data):
            yield data
    
        src_pl, slot_name = src_config
    
        src_pl.run(
            [
                tbl_x({"id_x": 1, "val_x": "foo"}),
                tbl_y({"id_y": 1, "val_y": True}),
            ]
        )
        add_pk(src_pl.sql_client, "tbl_x", "id_x")
        add_pk(src_pl.sql_client, "tbl_y", "id_y")
    
        snapshots = init_replication(
            slot_name=slot_name,
            schema=src_pl.dataset_name,
            table_names=("tbl_x", "tbl_y"),
            take_snapshots=True,
            table_options={
                "tbl_x": {"backend": backend},
                "tbl_y": {"backend": backend},
            },
        )
    
>       changes = replication_source(
            slot_name=slot_name,
            schema=src_pl.dataset_name,
            table_names=("tbl_x", "tbl_y"),
            repl_options={
                "tbl_x": {"backend": backend},
                "tbl_y": {"backend": backend},
            },
        )

backend    = 'sqlalchemy'
destination_name = 'duckdb'
slot_name  = 'test_slot_a72aac18'
snapshots  = <dlt.extract.source.DltSource object at 0x7f96ceb9fd50>
src_config = (<dlt.pipeline.pipeline.Pipeline object at 0x7f96cf51f3d0>, 'test_slot_a72aac18')
src_pl     = <dlt.pipeline.pipeline.Pipeline object at 0x7f96cf51f3d0>
tbl_x      = <dlt.extract.resource.DltResource object at 0x7f96d8ba9d50>
tbl_y      = <dlt.extract.resource.DltResource object at 0x7f96da95bdd0>

tests/pg_legacy_replication/test_pg_replication.py:65: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
.venv/lib/python3.11/site-packages/dlt/extract/decorators.py:195: in __call__
    source = self._deco_f(*args, **kwargs)
        args       = ()
        kwargs     = {'repl_options': {'tbl_x': {'backend': 'sqlalchemy'}, 'tbl_y': {'backend': 'sqlalchemy'}}, 'schema': 'src_pl_dataset_202501221235482995', 'slot_name': 'test_slot_a72aac18', 'table_names': ('tbl_x', 'tbl_y')}
        self       = <dlt.extract.decorators.DltSourceFactoryWrapper object at 0x7f96d7768d90>
.venv/lib/python3.11/site-packages/dlt/extract/decorators.py:280: in _wrap
    return _eval_rv(rv, schema_copy)
        _eval_rv   = <function DltSourceFactoryWrapper._wrap.<locals>._eval_rv at 0x7f96d773fba0>
        _make_schema = <function DltSourceFactoryWrapper._wrap.<locals>._make_schema at 0x7f96cf55c540>
        args       = ()
        conf_f     = <function replication_source at 0x7f96cf55c4a0>
        kwargs     = {'repl_options': {'tbl_x': {'backend': 'sqlalchemy'}, 'tbl_y': {'backend': 'sqlalchemy'}}, 'schema': 'src_pl_dataset_202501221235482995', 'slot_name': 'test_slot_a72aac18', 'table_names': ('tbl_x', 'tbl_y')}
        pipeline_name = 'src_pl'
        proxy      = <dlt.common.pipeline.PipelineContext object at 0x7f96ceb05d10>
        rv         = <generator object replication_source at 0x7f96cd7182c0>
        schema_copy = Schema replication_source at 140285669523216
        source_sections = ('sources', 'pg_legacy_replication', 'replication_source')
.venv/lib/python3.11/site-packages/dlt/extract/decorators.py:240: in _eval_rv
    _rv = list(_rv)
        _rv        = <generator object replication_source at 0x7f96cd7182c0>
        schema_copy = Schema replication_source at 140285669523216
        self       = <dlt.extract.decorators.DltSourceFactoryWrapper object at 0x7f96d7768d90>
        source_section = 'pg_legacy_replication'
sources/pg_legacy_replication/__init__.py:118: in replication_source
    yield dlt.transformer(
        credentials = <dlt.common.configuration.specs.connection_string_credentials.ConnectionStringCredentials object at 0x7f96cd7ffc50>
        flush_slot = True
        repl_options = defaultdict(<function replication_source.<locals>.<lambda> at 0x7f96cd740180>, {'tbl_x': {'backend': 'sqlalchemy'}, 'tbl_y': {'backend': 'sqlalchemy'}})
        replication_resource = <function replication_source.<locals>.replication_resource at 0x7f96cd7ea700>
        schema     = 'src_pl_dataset_202501221235482995'
        slot_name  = 'test_slot_a72aac18'
        table      = 'tbl_x'
        table_names = ('tbl_x', 'tbl_y')
        target_batch_size = 1000
        wal_reader = <dlt.extract.resource.DltResource object at 0x7f96ceba2050>
.venv/lib/python3.11/site-packages/dlt/extract/decorators.py:958: in transformer
    return resource(  # type: ignore
        _impl_cls  = <class 'dlt.extract.resource.DltResource'>
        columns    = None
        data_from  = <dlt.extract.resource.DltResource object at 0x7f96ceba2050>
        f          = BackendHandler(table='tbl_x', repl_options={'backend': 'sqlalchemy'})
        file_format = None
        max_table_nesting = None
        merge_key  = None
        name       = 'tbl_x'
        parallelized = False
        primary_key = None
        schema_contract = None
        selected   = True
        spec       = None
        standalone = False
        table_format = None
        table_name = None
        write_disposition = None
.venv/lib/python3.11/site-packages/dlt/extract/decorators.py:758: in resource
    return decorator(data)
        _impl_cls  = <class 'dlt.extract.resource.DltResource'>
        columns    = None
        data       = BackendHandler(table='tbl_x', repl_options={'backend': 'sqlalchemy'})
        data_from  = <dlt.extract.resource.DltResource object at 0x7f96ceba2050>
        decorator  = <function resource.<locals>.decorator at 0x7f96cd7ea980>
        file_format = None
        make_resource = <function resource.<locals>.make_resource at 0x7f96cd7ea840>
        max_table_nesting = None
        merge_key  = None
        name       = 'tbl_x'
        parallelized = False
        primary_key = None
        references = None
        schema_contract = None
        selected   = True
        spec       = None
        standalone = False
        table_format = None
        table_name = None
        wrap_standalone = <function resource.<locals>.wrap_standalone at 0x7f96cd7ea8e0>
        write_disposition = None
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

f = BackendHandler(table='tbl_x', repl_options={'backend': 'sqlalchemy'})

    def decorator(
        f: Callable[TResourceFunParams, Any]
    ) -> Callable[TResourceFunParams, TDltResourceImpl]:
        if not callable(f):
            if data_from:
                # raise more descriptive exception if we construct transformer
                raise InvalidTransformerDataTypeGeneratorFunctionRequired(
                    name or "<no name>", f, type(f)
                )
            raise ResourceFunctionExpected(name or "<no name>", f, type(f))
        if not standalone and callable(name):
            raise DynamicNameNotStandaloneResource(get_callable_name(f))
    
        resource_name = name if name and not callable(name) else get_callable_name(f)
    
        func_module = inspect.getmodule(f)
        source_section = _get_source_section_name(func_module)
        is_inner_resource = is_inner_callable(f)
    
        if spec is None:
            # autodetect spec
            SPEC, resolvable_fields = spec_from_signature(
                f, inspect.signature(f), include_defaults=standalone
            )
            if is_inner_resource and not standalone:
                if len(resolvable_fields) > 0:
                    # prevent required arguments to inner functions that are not standalone
                    raise ResourceInnerCallableConfigWrapDisallowed(resource_name, source_section)
                else:
                    # empty spec for inner functions - they should not be injected
                    SPEC = BaseConfiguration
        else:
            SPEC = spec
        # assign spec to "f"
        set_fun_spec(f, SPEC)
    
        # register non inner resources as source with single resource in it
        if not is_inner_resource:
            # a source function for the source wrapper, args that go to source are forwarded
            # to a single resource within
            def _source(
                name_ovr: str, section_ovr: str, args: Tuple[Any, ...], kwargs: Dict[str, Any]
            ) -> TDltResourceImpl:
                return wrap_standalone(name_ovr or resource_name, section_ovr or source_section, f)(
                    *args, **kwargs
                )
    
            # make the source module same as original resource
>           _source.__qualname__ = f.__qualname__
E           AttributeError: 'BackendHandler' object has no attribute '__qualname__'

SPEC       = <class 'sources.pg_legacy_replication.helpers.BackendHandlerConfiguration'>
_source    = <function resource.<locals>.decorator.<locals>._source at 0x7f96cd7eaa20>
data_from  = <dlt.extract.resource.DltResource object at 0x7f96ceba2050>
f          = BackendHandler(table='tbl_x', repl_options={'backend': 'sqlalchemy'})
func_module = <module 'sources.pg_legacy_replication.helpers' from '/home/drnick/repos/github/neuromantik33/verified-sources/sources/pg_legacy_replication/helpers.py'>
is_inner_resource = False
name       = 'tbl_x'
resolvable_fields = {}
resource_name = 'tbl_x'
source_section = 'helpers'
spec       = None
standalone = False
wrap_standalone = <function resource.<locals>.wrap_standalone at 0x7f96cd7ea8e0>

.venv/lib/python3.11/site-packages/dlt/extract/decorators.py:731: AttributeError
--------------------------- Captured stdout teardown ---------------------------
schema "src_pl_dataset_202501221235482995" does not exist

schema "src_pl_dataset_202501221235482995_staging" does not exist

___________________ test_core_functionality[pyarrow-duckdb] ____________________

src_config = (<dlt.pipeline.pipeline.Pipeline object at 0x7f96ceb20b10>, 'test_slot_ae9b491f')
destination_name = 'duckdb', backend = 'pyarrow'

    @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS)
    @pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"])
    def test_core_functionality(
        src_config: Tuple[dlt.Pipeline, str], destination_name: str, backend: TableBackend
    ) -> None:
        @dlt.resource(write_disposition="merge", primary_key="id_x")
        def tbl_x(data):
            yield data
    
        @dlt.resource(write_disposition="merge", primary_key="id_y")
        def tbl_y(data):
            yield data
    
        src_pl, slot_name = src_config
    
        src_pl.run(
            [
                tbl_x({"id_x": 1, "val_x": "foo"}),
                tbl_y({"id_y": 1, "val_y": True}),
            ]
        )
        add_pk(src_pl.sql_client, "tbl_x", "id_x")
        add_pk(src_pl.sql_client, "tbl_y", "id_y")
    
        snapshots = init_replication(
            slot_name=slot_name,
            schema=src_pl.dataset_name,
            table_names=("tbl_x", "tbl_y"),
            take_snapshots=True,
            table_options={
                "tbl_x": {"backend": backend},
                "tbl_y": {"backend": backend},
            },
        )
    
>       changes = replication_source(
            slot_name=slot_name,
            schema=src_pl.dataset_name,
            table_names=("tbl_x", "tbl_y"),
            repl_options={
                "tbl_x": {"backend": backend},
                "tbl_y": {"backend": backend},
            },
        )

backend    = 'pyarrow'
destination_name = 'duckdb'
slot_name  = 'test_slot_ae9b491f'
snapshots  = <dlt.extract.source.DltSource object at 0x7f96cb9de390>
src_config = (<dlt.pipeline.pipeline.Pipeline object at 0x7f96ceb20b10>, 'test_slot_ae9b491f')
src_pl     = <dlt.pipeline.pipeline.Pipeline object at 0x7f96ceb20b10>
tbl_x      = <dlt.extract.resource.DltResource object at 0x7f96ccd0c610>
tbl_y      = <dlt.extract.resource.DltResource object at 0x7f96ccd0e0d0>

tests/pg_legacy_replication/test_pg_replication.py:65: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
.venv/lib/python3.11/site-packages/dlt/extract/decorators.py:195: in __call__
    source = self._deco_f(*args, **kwargs)
        args       = ()
        kwargs     = {'repl_options': {'tbl_x': {'backend': 'pyarrow'}, 'tbl_y': {'backend': 'pyarrow'}}, 'schema': 'src_pl_dataset_202501221235515171', 'slot_name': 'test_slot_ae9b491f', 'table_names': ('tbl_x', 'tbl_y')}
        self       = <dlt.extract.decorators.DltSourceFactoryWrapper object at 0x7f96d7768d90>
.venv/lib/python3.11/site-packages/dlt/extract/decorators.py:280: in _wrap
    return _eval_rv(rv, schema_copy)
        _eval_rv   = <function DltSourceFactoryWrapper._wrap.<locals>._eval_rv at 0x7f96d773fba0>
        _make_schema = <function DltSourceFactoryWrapper._wrap.<locals>._make_schema at 0x7f96cf55c540>
        args       = ()
        conf_f     = <function replication_source at 0x7f96cf55c4a0>
        kwargs     = {'repl_options': {'tbl_x': {'backend': 'pyarrow'}, 'tbl_y': {'backend': 'pyarrow'}}, 'schema': 'src_pl_dataset_202501221235515171', 'slot_name': 'test_slot_ae9b491f', 'table_names': ('tbl_x', 'tbl_y')}
        pipeline_name = 'src_pl'
        proxy      = <dlt.common.pipeline.PipelineContext object at 0x7f96ceb05d10>
        rv         = <generator object replication_source at 0x7f971204dd00>
        schema_copy = Schema replication_source at 140285658134864
        source_sections = ('sources', 'pg_legacy_replication', 'replication_source')
.venv/lib/python3.11/site-packages/dlt/extract/decorators.py:240: in _eval_rv
    _rv = list(_rv)
        _rv        = <generator object replication_source at 0x7f971204dd00>
        schema_copy = Schema replication_source at 140285658134864
        self       = <dlt.extract.decorators.DltSourceFactoryWrapper object at 0x7f96d7768d90>
        source_section = 'pg_legacy_replication'
sources/pg_legacy_replication/__init__.py:118: in replication_source
    yield dlt.transformer(
        credentials = <dlt.common.configuration.specs.connection_string_credentials.ConnectionStringCredentials object at 0x7f96cb96a0d0>
        flush_slot = True
        repl_options = defaultdict(<function replication_source.<locals>.<lambda> at 0x7f96cc3dbba0>, {'tbl_x': {'backend': 'pyarrow'}, 'tbl_y': {'backend': 'pyarrow'}})
        replication_resource = <function replication_source.<locals>.replication_resource at 0x7f96cc3d85e0>
        schema     = 'src_pl_dataset_202501221235515171'
        slot_name  = 'test_slot_ae9b491f'
        table      = 'tbl_x'
        table_names = ('tbl_x', 'tbl_y')
        target_batch_size = 1000
        wal_reader = <dlt.extract.resource.DltResource object at 0x7f96caff30d0>
.venv/lib/python3.11/site-packages/dlt/extract/decorators.py:958: in transformer
    return resource(  # type: ignore
        _impl_cls  = <class 'dlt.extract.resource.DltResource'>
        columns    = None
        data_from  = <dlt.extract.resource.DltResource object at 0x7f96caff30d0>
        f          = BackendHandler(table='tbl_x', repl_options={'backend': 'pyarrow'})
        file_format = None
        max_table_nesting = None
        merge_key  = None
        name       = 'tbl_x'
        parallelized = False
        primary_key = None
        schema_contract = None
        selected   = True
        spec       = None
        standalone = False
        table_format = None
        table_name = None
        write_disposition = None
.venv/lib/python3.11/site-packages/dlt/extract/decorators.py:758: in resource
    return decorator(data)
        _impl_cls  = <class 'dlt.extract.resource.DltResource'>
        columns    = None
        data       = BackendHandler(table='tbl_x', repl_options={'backend': 'pyarrow'})
        data_from  = <dlt.extract.resource.DltResource object at 0x7f96caff30d0>
        decorator  = <function resource.<locals>.decorator at 0x7f96cafa7740>
        file_format = None
        make_resource = <function resource.<locals>.make_resource at 0x7f96cc3d9d00>
        max_table_nesting = None
        merge_key  = None
        name       = 'tbl_x'
        parallelized = False
        primary_key = None
        references = None
        schema_contract = None
        selected   = True
        spec       = None
        standalone = False
        table_format = None
        table_name = None
        wrap_standalone = <function resource.<locals>.wrap_standalone at 0x7f96cafa6840>
        write_disposition = None
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

f = BackendHandler(table='tbl_x', repl_options={'backend': 'pyarrow'})

    def decorator(
        f: Callable[TResourceFunParams, Any]
    ) -> Callable[TResourceFunParams, TDltResourceImpl]:
        if not callable(f):
            if data_from:
                # raise more descriptive exception if we construct transformer
                raise InvalidTransformerDataTypeGeneratorFunctionRequired(
                    name or "<no name>", f, type(f)
                )
            raise ResourceFunctionExpected(name or "<no name>", f, type(f))
        if not standalone and callable(name):
            raise DynamicNameNotStandaloneResource(get_callable_name(f))
    
        resource_name = name if name and not callable(name) else get_callable_name(f)
    
        func_module = inspect.getmodule(f)
        source_section = _get_source_section_name(func_module)
        is_inner_resource = is_inner_callable(f)
    
        if spec is None:
            # autodetect spec
            SPEC, resolvable_fields = spec_from_signature(
                f, inspect.signature(f), include_defaults=standalone
            )
            if is_inner_resource and not standalone:
                if len(resolvable_fields) > 0:
                    # prevent required arguments to inner functions that are not standalone
                    raise ResourceInnerCallableConfigWrapDisallowed(resource_name, source_section)
                else:
                    # empty spec for inner functions - they should not be injected
                    SPEC = BaseConfiguration
        else:
            SPEC = spec
        # assign spec to "f"
        set_fun_spec(f, SPEC)
    
        # register non inner resources as source with single resource in it
        if not is_inner_resource:
            # a source function for the source wrapper, args that go to source are forwarded
            # to a single resource within
            def _source(
                name_ovr: str, section_ovr: str, args: Tuple[Any, ...], kwargs: Dict[str, Any]
            ) -> TDltResourceImpl:
                return wrap_standalone(name_ovr or resource_name, section_ovr or source_section, f)(
                    *args, **kwargs
                )
    
            # make the source module same as original resource
>           _source.__qualname__ = f.__qualname__
E           AttributeError: 'BackendHandler' object has no attribute '__qualname__'

SPEC       = <class 'sources.pg_legacy_replication.helpers.BackendHandlerConfiguration'>
_source    = <function resource.<locals>.decorator.<locals>._source at 0x7f96cafa76a0>
data_from  = <dlt.extract.resource.DltResource object at 0x7f96caff30d0>
f          = BackendHandler(table='tbl_x', repl_options={'backend': 'pyarrow'})
func_module = <module 'sources.pg_legacy_replication.helpers' from '/home/drnick/repos/github/neuromantik33/verified-sources/sources/pg_legacy_replication/helpers.py'>
is_inner_resource = False
name       = 'tbl_x'
resolvable_fields = {}
resource_name = 'tbl_x'
source_section = 'helpers'
spec       = None
standalone = False
wrap_standalone = <function resource.<locals>.wrap_standalone at 0x7f96cafa6840>

.venv/lib/python3.11/site-packages/dlt/extract/decorators.py:731: AttributeError
----------------------------- Captured stderr call -----------------------------
2025-01-22 13:35:54,498|[WARNING]|38699|140286848428928|dlt|source.py|register:572|A source with ref dlt.helpers.tbl_x is already registered and will be overwritten
2025-01-22 13:35:54,513|[WARNING]|38699|140286848428928|dlt|source.py|register:572|A source with ref dlt.helpers.tbl_y is already registered and will be overwritten
--------------------------- Captured stdout teardown ---------------------------
schema "src_pl_dataset_202501221235515171" does not exist

schema "src_pl_dataset_202501221235515171_staging" does not exist

=============================== warnings summary ===============================
.venv/lib/python3.11/site-packages/dlt/helpers/dbt/__init__.py:3
  /home/drnick/repos/github/neuromantik33/verified-sources/.venv/lib/python3.11/site-packages/dlt/helpers/dbt/__init__.py:3: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html
    import pkg_resources

.venv/lib/python3.11/site-packages/dlt/common/configuration/container.py:95: 19 warnings
tests/pg_legacy_replication/test_pg_replication.py: 1958 warnings
  /home/drnick/repos/github/neuromantik33/verified-sources/.venv/lib/python3.11/site-packages/dlt/common/configuration/container.py:95: DeprecationWarning: currentThread() is deprecated, use current_thread() instead
    if m := re.match(r"dlt-pool-(\d+)-", threading.currentThread().getName()):

.venv/lib/python3.11/site-packages/dlt/common/configuration/container.py:95: 19 warnings
tests/pg_legacy_replication/test_pg_replication.py: 1958 warnings
  /home/drnick/repos/github/neuromantik33/verified-sources/.venv/lib/python3.11/site-packages/dlt/common/configuration/container.py:95: DeprecationWarning: getName() is deprecated, get the name attribute instead
    if m := re.match(r"dlt-pool-(\d+)-", threading.currentThread().getName()):

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
============================= slowest 10 durations =============================
2.99s call     tests/pg_legacy_replication/test_pg_replication.py::test_core_functionality[pyarrow-duckdb]
2.89s call     tests/pg_legacy_replication/test_pg_replication.py::test_core_functionality[sqlalchemy-duckdb]
0.17s teardown tests/pg_legacy_replication/test_pg_replication.py::test_core_functionality[pyarrow-duckdb]
0.17s teardown tests/pg_legacy_replication/test_pg_replication.py::test_core_functionality[sqlalchemy-duckdb]
0.07s setup    tests/pg_legacy_replication/test_pg_replication.py::test_core_functionality[sqlalchemy-duckdb]
0.04s setup    tests/pg_legacy_replication/test_pg_replication.py::test_core_functionality[pyarrow-duckdb]
=========================== short test summary info ============================
FAILED tests/pg_legacy_replication/test_pg_replication.py::test_core_functionality[sqlalchemy-duckdb]
FAILED tests/pg_legacy_replication/test_pg_replication.py::test_core_functionality[pyarrow-duckdb]
======================= 2 failed, 3955 warnings in 6.67s =======================

    return handler


__all__ = [
    "ReplicationOptions",
    "cleanup_snapshot_resources",
    "init_replication",
    "replication_source",
]
6 changes: 6 additions & 0 deletions sources/pg_legacy_replication/exceptions.py
@@ -0,0 +1,6 @@
# class SqlDatabaseSourceImportError(Exception):
#     def __init__(self) -> None:
#         super().__init__(
#             "Could not import `sql_database` source. Run `dlt init sql_database <dest>`"
#             " to download the source code."
#         )