diff --git a/poetry.lock b/poetry.lock index 6c1979521..4096d4472 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "adlfs" @@ -403,34 +403,6 @@ files = [ {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, ] -[[package]] -name = "backports-zoneinfo" -version = "0.2.1" -description = "Backport of the standard library zoneinfo module" -optional = false -python-versions = ">=3.6" -files = [ - {file = "backports.zoneinfo-0.2.1-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:da6013fd84a690242c310d77ddb8441a559e9cb3d3d59ebac9aca1a57b2e18bc"}, - {file = "backports.zoneinfo-0.2.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:89a48c0d158a3cc3f654da4c2de1ceba85263fafb861b98b59040a5086259722"}, - {file = "backports.zoneinfo-0.2.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:1c5742112073a563c81f786e77514969acb58649bcdf6cdf0b4ed31a348d4546"}, - {file = "backports.zoneinfo-0.2.1-cp36-cp36m-win32.whl", hash = "sha256:e8236383a20872c0cdf5a62b554b27538db7fa1bbec52429d8d106effbaeca08"}, - {file = "backports.zoneinfo-0.2.1-cp36-cp36m-win_amd64.whl", hash = "sha256:8439c030a11780786a2002261569bdf362264f605dfa4d65090b64b05c9f79a7"}, - {file = "backports.zoneinfo-0.2.1-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:f04e857b59d9d1ccc39ce2da1021d196e47234873820cbeaad210724b1ee28ac"}, - {file = "backports.zoneinfo-0.2.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:17746bd546106fa389c51dbea67c8b7c8f0d14b5526a579ca6ccf5ed72c526cf"}, - {file = "backports.zoneinfo-0.2.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:5c144945a7752ca544b4b78c8c41544cdfaf9786f25fe5ffb10e838e19a27570"}, - {file = "backports.zoneinfo-0.2.1-cp37-cp37m-win32.whl", hash = "sha256:e55b384612d93be96506932a786bbcde5a2db7a9e6a4bb4bffe8b733f5b9036b"}, - {file = "backports.zoneinfo-0.2.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a76b38c52400b762e48131494ba26be363491ac4f9a04c1b7e92483d169f6582"}, - {file = "backports.zoneinfo-0.2.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:8961c0f32cd0336fb8e8ead11a1f8cd99ec07145ec2931122faaac1c8f7fd987"}, - {file = "backports.zoneinfo-0.2.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:e81b76cace8eda1fca50e345242ba977f9be6ae3945af8d46326d776b4cf78d1"}, - {file = "backports.zoneinfo-0.2.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7b0a64cda4145548fed9efc10322770f929b944ce5cee6c0dfe0c87bf4c0c8c9"}, - {file = "backports.zoneinfo-0.2.1-cp38-cp38-win32.whl", hash = "sha256:1b13e654a55cd45672cb54ed12148cd33628f672548f373963b0bff67b217328"}, - {file = "backports.zoneinfo-0.2.1-cp38-cp38-win_amd64.whl", hash = "sha256:4a0f800587060bf8880f954dbef70de6c11bbe59c673c3d818921f042f9954a6"}, - {file = "backports.zoneinfo-0.2.1.tar.gz", hash = "sha256:fadbfe37f74051d024037f223b8e001611eac868b5c5b06144ef4d8b799862f2"}, -] - -[package.extras] -tzdata = ["tzdata"] - [[package]] name = "bandit" version = "1.7.5" @@ -730,7 +702,6 @@ files = [ clickhouse-connect = ">=0.5.7" duckdb = ">=0.7.1" fastapi = "0.85.1" -graphlib-backport = {version = ">=1.0.3", markers = "python_version < \"3.9\""} hnswlib = ">=0.7" numpy = ">=1.21.6" onnxruntime = ">=1.14.1" @@ -962,6 +933,93 @@ files = [ {file = "constantly-23.10.4.tar.gz", hash = "sha256:aa92b70a33e2ac0bb33cd745eb61776594dc48764b06c35e0efd050b7f1c7cbd"}, ] +[[package]] +name = 
"coverage" +version = "7.6.1" +description = "Code coverage measurement for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "coverage-7.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b06079abebbc0e89e6163b8e8f0e16270124c154dc6e4a47b413dd538859af16"}, + {file = "coverage-7.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cf4b19715bccd7ee27b6b120e7e9dd56037b9c0681dcc1adc9ba9db3d417fa36"}, + {file = "coverage-7.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61c0abb4c85b095a784ef23fdd4aede7a2628478e7baba7c5e3deba61070a02"}, + {file = "coverage-7.6.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd21f6ae3f08b41004dfb433fa895d858f3f5979e7762d052b12aef444e29afc"}, + {file = "coverage-7.6.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f59d57baca39b32db42b83b2a7ba6f47ad9c394ec2076b084c3f029b7afca23"}, + {file = "coverage-7.6.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a1ac0ae2b8bd743b88ed0502544847c3053d7171a3cff9228af618a068ed9c34"}, + {file = "coverage-7.6.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e6a08c0be454c3b3beb105c0596ebdc2371fab6bb90c0c0297f4e58fd7e1012c"}, + {file = "coverage-7.6.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f5796e664fe802da4f57a168c85359a8fbf3eab5e55cd4e4569fbacecc903959"}, + {file = "coverage-7.6.1-cp310-cp310-win32.whl", hash = "sha256:7bb65125fcbef8d989fa1dd0e8a060999497629ca5b0efbca209588a73356232"}, + {file = "coverage-7.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:3115a95daa9bdba70aea750db7b96b37259a81a709223c8448fa97727d546fe0"}, + {file = "coverage-7.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7dea0889685db8550f839fa202744652e87c60015029ce3f60e006f8c4462c93"}, + {file = "coverage-7.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed37bd3c3b063412f7620464a9ac1314d33100329f39799255fb8d3027da50d3"}, + {file = "coverage-7.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d85f5e9a5f8b73e2350097c3756ef7e785f55bd71205defa0bfdaf96c31616ff"}, + {file = "coverage-7.6.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bc572be474cafb617672c43fe989d6e48d3c83af02ce8de73fff1c6bb3c198d"}, + {file = "coverage-7.6.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c0420b573964c760df9e9e86d1a9a622d0d27f417e1a949a8a66dd7bcee7bc6"}, + {file = "coverage-7.6.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1f4aa8219db826ce6be7099d559f8ec311549bfc4046f7f9fe9b5cea5c581c56"}, + {file = "coverage-7.6.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:fc5a77d0c516700ebad189b587de289a20a78324bc54baee03dd486f0855d234"}, + {file = "coverage-7.6.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b48f312cca9621272ae49008c7f613337c53fadca647d6384cc129d2996d1133"}, + {file = "coverage-7.6.1-cp311-cp311-win32.whl", hash = "sha256:1125ca0e5fd475cbbba3bb67ae20bd2c23a98fac4e32412883f9bcbaa81c314c"}, + {file = "coverage-7.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:8ae539519c4c040c5ffd0632784e21b2f03fc1340752af711f33e5be83a9d6c6"}, + {file = "coverage-7.6.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:95cae0efeb032af8458fc27d191f85d1717b1d4e49f7cb226cf526ff28179778"}, + {file = "coverage-7.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:5621a9175cf9d0b0c84c2ef2b12e9f5f5071357c4d2ea6ca1cf01814f45d2391"}, + {file = "coverage-7.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:260933720fdcd75340e7dbe9060655aff3af1f0c5d20f46b57f262ab6c86a5e8"}, + {file = "coverage-7.6.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07e2ca0ad381b91350c0ed49d52699b625aab2b44b65e1b4e02fa9df0e92ad2d"}, + {file = "coverage-7.6.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44fee9975f04b33331cb8eb272827111efc8930cfd582e0320613263ca849ca"}, + {file = "coverage-7.6.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877abb17e6339d96bf08e7a622d05095e72b71f8afd8a9fefc82cf30ed944163"}, + {file = "coverage-7.6.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e0cadcf6733c09154b461f1ca72d5416635e5e4ec4e536192180d34ec160f8a"}, + {file = "coverage-7.6.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c3c02d12f837d9683e5ab2f3d9844dc57655b92c74e286c262e0fc54213c216d"}, + {file = "coverage-7.6.1-cp312-cp312-win32.whl", hash = "sha256:e05882b70b87a18d937ca6768ff33cc3f72847cbc4de4491c8e73880766718e5"}, + {file = "coverage-7.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:b5d7b556859dd85f3a541db6a4e0167b86e7273e1cdc973e5b175166bb634fdb"}, + {file = "coverage-7.6.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a4acd025ecc06185ba2b801f2de85546e0b8ac787cf9d3b06e7e2a69f925b106"}, + {file = "coverage-7.6.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a6d3adcf24b624a7b778533480e32434a39ad8fa30c315208f6d3e5542aeb6e9"}, + {file = "coverage-7.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0c212c49b6c10e6951362f7c6df3329f04c2b1c28499563d4035d964ab8e08c"}, + {file = "coverage-7.6.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e81d7a3e58882450ec4186ca59a3f20a5d4440f25b1cff6f0902ad890e6748a"}, + {file = "coverage-7.6.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78b260de9790fd81e69401c2dc8b17da47c8038176a79092a89cb2b7d945d060"}, + {file = "coverage-7.6.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a78d169acd38300060b28d600344a803628c3fd585c912cacc9ea8790fe96862"}, + {file = "coverage-7.6.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2c09f4ce52cb99dd7505cd0fc8e0e37c77b87f46bc9c1eb03fe3bc9991085388"}, + {file = "coverage-7.6.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6878ef48d4227aace338d88c48738a4258213cd7b74fd9a3d4d7582bb1d8a155"}, + {file = "coverage-7.6.1-cp313-cp313-win32.whl", hash = "sha256:44df346d5215a8c0e360307d46ffaabe0f5d3502c8a1cefd700b34baf31d411a"}, + {file = "coverage-7.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:8284cf8c0dd272a247bc154eb6c95548722dce90d098c17a883ed36e67cdb129"}, + {file = "coverage-7.6.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d3296782ca4eab572a1a4eca686d8bfb00226300dcefdf43faa25b5242ab8a3e"}, + {file = "coverage-7.6.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:502753043567491d3ff6d08629270127e0c31d4184c4c8d98f92c26f65019962"}, + {file = "coverage-7.6.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a89ecca80709d4076b95f89f308544ec8f7b4727e8a547913a35f16717856cb"}, + {file = "coverage-7.6.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:a318d68e92e80af8b00fa99609796fdbcdfef3629c77c6283566c6f02c6d6704"}, + {file = "coverage-7.6.1-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13b0a73a0896988f053e4fbb7de6d93388e6dd292b0d87ee51d106f2c11b465b"}, + {file = "coverage-7.6.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4421712dbfc5562150f7554f13dde997a2e932a6b5f352edcce948a815efee6f"}, + {file = "coverage-7.6.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:166811d20dfea725e2e4baa71fffd6c968a958577848d2131f39b60043400223"}, + {file = "coverage-7.6.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:225667980479a17db1048cb2bf8bfb39b8e5be8f164b8f6628b64f78a72cf9d3"}, + {file = "coverage-7.6.1-cp313-cp313t-win32.whl", hash = "sha256:170d444ab405852903b7d04ea9ae9b98f98ab6d7e63e1115e82620807519797f"}, + {file = "coverage-7.6.1-cp313-cp313t-win_amd64.whl", hash = "sha256:b9f222de8cded79c49bf184bdbc06630d4c58eec9459b939b4a690c82ed05657"}, + {file = "coverage-7.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6db04803b6c7291985a761004e9060b2bca08da6d04f26a7f2294b8623a0c1a0"}, + {file = "coverage-7.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f1adfc8ac319e1a348af294106bc6a8458a0f1633cc62a1446aebc30c5fa186a"}, + {file = "coverage-7.6.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a95324a9de9650a729239daea117df21f4b9868ce32e63f8b650ebe6cef5595b"}, + {file = "coverage-7.6.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b43c03669dc4618ec25270b06ecd3ee4fa94c7f9b3c14bae6571ca00ef98b0d3"}, + {file = "coverage-7.6.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8929543a7192c13d177b770008bc4e8119f2e1f881d563fc6b6305d2d0ebe9de"}, + {file = "coverage-7.6.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:a09ece4a69cf399510c8ab25e0950d9cf2b42f7b3cb0374f95d2e2ff594478a6"}, + {file = "coverage-7.6.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:9054a0754de38d9dbd01a46621636689124d666bad1936d76c0341f7d71bf569"}, + {file = "coverage-7.6.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0dbde0f4aa9a16fa4d754356a8f2e36296ff4d83994b2c9d8398aa32f222f989"}, + {file = "coverage-7.6.1-cp38-cp38-win32.whl", hash = "sha256:da511e6ad4f7323ee5702e6633085fb76c2f893aaf8ce4c51a0ba4fc07580ea7"}, + {file = "coverage-7.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:3f1156e3e8f2872197af3840d8ad307a9dd18e615dc64d9ee41696f287c57ad8"}, + {file = "coverage-7.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:abd5fd0db5f4dc9289408aaf34908072f805ff7792632250dcb36dc591d24255"}, + {file = "coverage-7.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:547f45fa1a93154bd82050a7f3cddbc1a7a4dd2a9bf5cb7d06f4ae29fe94eaf8"}, + {file = "coverage-7.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:645786266c8f18a931b65bfcefdbf6952dd0dea98feee39bd188607a9d307ed2"}, + {file = "coverage-7.6.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9e0b2df163b8ed01d515807af24f63de04bebcecbd6c3bfeff88385789fdf75a"}, + {file = "coverage-7.6.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:609b06f178fe8e9f89ef676532760ec0b4deea15e9969bf754b37f7c40326dbc"}, + {file = "coverage-7.6.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = 
"sha256:702855feff378050ae4f741045e19a32d57d19f3e0676d589df0575008ea5004"}, + {file = "coverage-7.6.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:2bdb062ea438f22d99cba0d7829c2ef0af1d768d1e4a4f528087224c90b132cb"}, + {file = "coverage-7.6.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9c56863d44bd1c4fe2abb8a4d6f5371d197f1ac0ebdee542f07f35895fc07f36"}, + {file = "coverage-7.6.1-cp39-cp39-win32.whl", hash = "sha256:6e2cd258d7d927d09493c8df1ce9174ad01b381d4729a9d8d4e38670ca24774c"}, + {file = "coverage-7.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:06a737c882bd26d0d6ee7269b20b12f14a8704807a01056c80bb881a4b2ce6ca"}, + {file = "coverage-7.6.1-pp38.pp39.pp310-none-any.whl", hash = "sha256:e9a6e0eb86070e8ccaedfbd9d38fec54864f3125ab95419970575b42af7541df"}, + {file = "coverage-7.6.1.tar.gz", hash = "sha256:953510dfb7b12ab69d20135a0662397f077c59b1e6379a768e97c59d852ee51d"}, +] + +[package.dependencies] +tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} + +[package.extras] +toml = ["tomli"] + [[package]] name = "cryptography" version = "41.0.4" @@ -1099,7 +1157,6 @@ gcsfs = {version = ">=2022.4.0", optional = true, markers = "extra == \"gcp\" or gitpython = ">=3.1.29" giturlparse = ">=0.10.0" google-cloud-bigquery = {version = ">=2.26.0", optional = true, markers = "extra == \"gcp\" or extra == \"bigquery\""} -graphlib-backport = {version = "*", markers = "python_version < \"3.9\""} grpcio = {version = ">=1.50.0", optional = true, markers = "extra == \"gcp\" or extra == \"bigquery\""} hexbytes = ">=0.2.2" humanize = ">=4.4.0" @@ -1185,7 +1242,6 @@ files = [ ] [package.dependencies] -importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.9\""} natsort = ">=7.0.1" typing-extensions = ">=3.7.4.1" @@ -1956,17 +2012,6 @@ protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4 [package.extras] grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] -[[package]] -name = "graphlib-backport" -version = "1.0.3" -description = "Backport of the Python 3.9 graphlib module for Python 3.6+" -optional = false -python-versions = ">=3.6,<4.0" -files = [ - {file = "graphlib_backport-1.0.3-py3-none-any.whl", hash = "sha256:24246967b9e7e6a91550bc770e6169585d35aa32790258579a8a3899a8c18fde"}, - {file = "graphlib_backport-1.0.3.tar.gz", hash = "sha256:7bb8fc7757b8ae4e6d8000a26cd49e9232aaa9a3aa57edb478474b8424bfaae2"}, -] - [[package]] name = "greenlet" version = "2.0.2" @@ -2333,24 +2378,6 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker perf = ["ipython"] testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)", "pytest-ruff"] -[[package]] -name = "importlib-resources" -version = "6.4.0" -description = "Read resources from Python packages" -optional = false -python-versions = ">=3.8" -files = [ - {file = "importlib_resources-6.4.0-py3-none-any.whl", hash = "sha256:50d10f043df931902d4194ea07ec57960f66a80449ff867bfe782b4c486ba78c"}, - {file = "importlib_resources-6.4.0.tar.gz", hash = "sha256:cdb2b453b8046ca4e3798eb1d84f3cce1446a0e8e7b5ef4efb600f19fc398145"}, -] - -[package.dependencies] -zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} - -[package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", 
"sphinx-lint"] -testing = ["jaraco.test (>=5.4)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)", "zipp (>=3.17)"] - [[package]] name = "incremental" version = "22.10.0" @@ -3031,6 +3058,21 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +[[package]] +name = "mypy-protobuf" +version = "3.6.0" +description = "Generate mypy stub files from protobuf specs" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mypy-protobuf-3.6.0.tar.gz", hash = "sha256:02f242eb3409f66889f2b1a3aa58356ec4d909cdd0f93115622e9e70366eca3c"}, + {file = "mypy_protobuf-3.6.0-py3-none-any.whl", hash = "sha256:56176e4d569070e7350ea620262478b49b7efceba4103d468448f1d21492fd6c"}, +] + +[package.dependencies] +protobuf = ">=4.25.3" +types-protobuf = ">=4.24" + [[package]] name = "natsort" version = "8.4.0" @@ -3612,8 +3654,6 @@ files = [ ] [package.dependencies] -"backports.zoneinfo" = {version = ">=0.2.1", markers = "python_version < \"3.9\""} -importlib-resources = {version = ">=5.9.0", markers = "python_version < \"3.9\""} python-dateutil = ">=2.6" tzdata = ">=2020.1" @@ -3812,24 +3852,22 @@ testing = ["google-api-core[grpc] (>=1.31.5)"] [[package]] name = "protobuf" -version = "4.24.4" +version = "4.25.5" description = "" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "protobuf-4.24.4-cp310-abi3-win32.whl", hash = "sha256:ec9912d5cb6714a5710e28e592ee1093d68c5ebfeda61983b3f40331da0b1ebb"}, - {file = "protobuf-4.24.4-cp310-abi3-win_amd64.whl", hash = "sha256:1badab72aa8a3a2b812eacfede5020472e16c6b2212d737cefd685884c191085"}, - {file = "protobuf-4.24.4-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8e61a27f362369c2f33248a0ff6896c20dcd47b5d48239cb9720134bef6082e4"}, - {file = "protobuf-4.24.4-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:bffa46ad9612e6779d0e51ae586fde768339b791a50610d85eb162daeb23661e"}, - {file = "protobuf-4.24.4-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:b493cb590960ff863743b9ff1452c413c2ee12b782f48beca77c8da3e2ffe9d9"}, - {file = "protobuf-4.24.4-cp37-cp37m-win32.whl", hash = "sha256:dbbed8a56e56cee8d9d522ce844a1379a72a70f453bde6243e3c86c30c2a3d46"}, - {file = "protobuf-4.24.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6b7d2e1c753715dcfe9d284a25a52d67818dd43c4932574307daf836f0071e37"}, - {file = "protobuf-4.24.4-cp38-cp38-win32.whl", hash = "sha256:02212557a76cd99574775a81fefeba8738d0f668d6abd0c6b1d3adcc75503dbe"}, - {file = "protobuf-4.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:2fa3886dfaae6b4c5ed2730d3bf47c7a38a72b3a1f0acb4d4caf68e6874b947b"}, - {file = "protobuf-4.24.4-cp39-cp39-win32.whl", hash = "sha256:b77272f3e28bb416e2071186cb39efd4abbf696d682cbb5dc731308ad37fa6dd"}, - {file = "protobuf-4.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:9fee5e8aa20ef1b84123bb9232b3f4a5114d9897ed89b4b8142d81924e05d79b"}, - {file = "protobuf-4.24.4-py3-none-any.whl", hash = "sha256:80797ce7424f8c8d2f2547e2d42bfbb6c08230ce5832d6c099a37335c9c90a92"}, - {file = "protobuf-4.24.4.tar.gz", hash = "sha256:5a70731910cd9104762161719c3d883c960151eea077134458503723b60e3667"}, + {file = "protobuf-4.25.5-cp310-abi3-win32.whl", hash = "sha256:5e61fd921603f58d2f5acb2806a929b4675f8874ff5f330b7d6f7e2e784bbcd8"}, + {file = "protobuf-4.25.5-cp310-abi3-win_amd64.whl", hash = "sha256:4be0571adcbe712b282a330c6e89eae24281344429ae95c6d85e79e84780f5ea"}, + {file = 
"protobuf-4.25.5-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:b2fde3d805354df675ea4c7c6338c1aecd254dfc9925e88c6d31a2bcb97eb173"}, + {file = "protobuf-4.25.5-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:919ad92d9b0310070f8356c24b855c98df2b8bd207ebc1c0c6fcc9ab1e007f3d"}, + {file = "protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:fe14e16c22be926d3abfcb500e60cab068baf10b542b8c858fa27e098123e331"}, + {file = "protobuf-4.25.5-cp38-cp38-win32.whl", hash = "sha256:98d8d8aa50de6a2747efd9cceba361c9034050ecce3e09136f90de37ddba66e1"}, + {file = "protobuf-4.25.5-cp38-cp38-win_amd64.whl", hash = "sha256:b0234dd5a03049e4ddd94b93400b67803c823cfc405689688f59b34e0742381a"}, + {file = "protobuf-4.25.5-cp39-cp39-win32.whl", hash = "sha256:abe32aad8561aa7cc94fc7ba4fdef646e576983edb94a73381b03c53728a626f"}, + {file = "protobuf-4.25.5-cp39-cp39-win_amd64.whl", hash = "sha256:7a183f592dc80aa7c8da7ad9e55091c4ffc9497b3054452d629bb85fa27c2a45"}, + {file = "protobuf-4.25.5-py3-none-any.whl", hash = "sha256:0aebecb809cae990f8129ada5ca273d9d670b76d9bfc9b1809f0a9c02b7dbf41"}, + {file = "protobuf-4.25.5.tar.gz", hash = "sha256:7f8249476b4a9473645db7f8ab42b02fe1488cbe5fb72fddd445e0665afd8584"}, ] [[package]] @@ -4489,6 +4527,24 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-cov" +version = "5.0.0" +description = "Pytest plugin for measuring coverage." +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-cov-5.0.0.tar.gz", hash = "sha256:5837b58e9f6ebd335b0f8060eecce69b662415b16dc503883a02f45dfeb14857"}, + {file = "pytest_cov-5.0.0-py3-none-any.whl", hash = "sha256:4f0764a1219df53214206bf1feea4633c3b558a2925c8b59f144f682861ce652"}, +] + +[package.dependencies] +coverage = {version = ">=5.2.1", extras = ["toml"]} +pytest = ">=4.6" + +[package.extras] +testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"] + [[package]] name = "pytest-forked" version = "1.6.0" @@ -4908,7 +4964,6 @@ files = [ [package.dependencies] markdown-it-py = ">=2.2.0" pygments = ">=2.13.0,<3.0.0" -typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.9\""} [package.extras] jupyter = ["ipywidgets (>=7.5.1,<9)"] @@ -5650,6 +5705,17 @@ files = [ {file = "twisted_iocpsupport-1.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:300437af17396a945a58dcfffd77863303a8b6d9e65c6e81f1d2eed55b50d444"}, ] +[[package]] +name = "types-protobuf" +version = "5.29.1.20241207" +description = "Typing stubs for protobuf" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types_protobuf-5.29.1.20241207-py3-none-any.whl", hash = "sha256:92893c42083e9b718c678badc0af7a9a1307b92afe1599e5cba5f3d35b668b2f"}, + {file = "types_protobuf-5.29.1.20241207.tar.gz", hash = "sha256:2ebcadb8ab3ef2e3e2f067e0882906d64ba0dc65fc5b0fd7a8b692315b4a0be9"}, +] + [[package]] name = "types-psycopg2" version = "2.9.21.20240218" @@ -6471,5 +6537,5 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" -python-versions = ">=3.8.1,<3.13" -content-hash = "e216234bd35e71ef0c8e5a498c2cc616df417c5b14658b00aed9d935ba5a782e" +python-versions = ">=3.9,<3.13" +content-hash = "6a657c817cec2ef5e110c455fd86ec73ce82e1e97dea77613ba4400238608594" diff --git a/pyproject.toml b/pyproject.toml index a1a431d54..13beebf21 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,9 +11,8 
@@ readme = "README.md" packages = [{include = "sources"}] [tool.poetry.dependencies] -python = ">=3.8.1,<3.13" +python = ">=3.9,<3.13" dlt = {version = "1.3.0", allow-prereleases = true, extras = ["redshift", "bigquery", "postgres", "duckdb"]} -graphlib-backport = {version = "*", python = "<3.9"} [tool.poetry.group.dltpure.dependencies] dlt = {version = "1.3.0", allow-prereleases = true} @@ -45,6 +44,9 @@ pytest-mock = "^3.12.0" twisted = "22.10.0" pytest-forked = "^1.6.0" pendulum = "^3.0.0" +types-protobuf = "^5.27.0.20240907" +pytest-cov = "^5.0.0" +mypy-protobuf = "^3.6.0" [tool.poetry.group.sql_database.dependencies] sqlalchemy = ">=1.4" @@ -54,6 +56,11 @@ connectorx = ">=0.3.1" [tool.poetry.group.pg_replication.dependencies] psycopg2-binary = ">=2.9.9" +[tool.poetry.group.pg_legacy_replication.dependencies] +protobuf = ">=4.25" +psycopg2-binary = ">=2.9.9" +sqlalchemy = ">=1.4" + [tool.poetry.group.google_sheets.dependencies] google-api-python-client = "^2.78.0" @@ -116,4 +123,4 @@ requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" [tool.black] -include = '.*py$' +include = '.*py$' \ No newline at end of file diff --git a/sources/.dlt/example.secrets.toml b/sources/.dlt/example.secrets.toml index a0e8963e0..4a9590cfe 100644 --- a/sources/.dlt/example.secrets.toml +++ b/sources/.dlt/example.secrets.toml @@ -16,7 +16,11 @@ location = "US" ### Sources [sources] +# local postgres +helpers.credentials="postgresql://loader:loader@localhost:5432/dlt_data" +pg_legacy_replication.credentials="postgresql://loader:loader@localhost:5432/dlt_data" + ## chess pipeline # the section below defines secrets for "chess_dlt_config_example" source in chess/__init__.py [sources.chess] -secret_str="secret string" # a string secret +secret_str="secret string" # a string secret \ No newline at end of file diff --git a/sources/pg_legacy_replication/README.md b/sources/pg_legacy_replication/README.md new file mode 100644 index 000000000..f6c9de239 --- /dev/null +++ b/sources/pg_legacy_replication/README.md @@ -0,0 +1,130 @@ +# Postgres legacy replication +[Postgres](https://www.postgresql.org/) is one of the most popular relational database management systems. This verified source uses Postgres' replication functionality to efficiently process changes +in tables (a process often referred to as _Change Data Capture_ or CDC). It uses [logical decoding](https://www.postgresql.org/docs/current/logicaldecoding.html) and the optional `decoderbufs` +[output plugin](https://github.com/debezium/postgres-decoderbufs), which is a shared library which must be built or enabled. + +| Source | Description | +|---------------------|-------------------------------------------------| +| replication_source | Load published messages from a replication slot | + +## Install decoderbufs + +Instructions can be found [here](https://github.com/debezium/postgres-decoderbufs?tab=readme-ov-file#building) + +Below is an example installation in a docker image: +```Dockerfile +FROM postgres:14 + +# Install dependencies required to build decoderbufs +RUN apt-get update +RUN apt-get install -f -y \ + software-properties-common \ + build-essential \ + pkg-config \ + git + +RUN apt-get install -f -y \ + postgresql-server-dev-14 \ + libprotobuf-c-dev && \ + rm -rf /var/lib/apt/lists/* + +ARG decoderbufs_version=v1.7.0.Final +RUN git clone https://github.com/debezium/postgres-decoderbufs -b $decoderbufs_version --single-branch && \ + cd postgres-decoderbufs && \ + make && make install && \ + cd .. 
&& \
+    rm -rf postgres-decoderbufs
+```
+
+## Initialize the pipeline
+
+```bash
+$ dlt init pg_legacy_replication duckdb
+```
+
+This uses `duckdb` as the destination, but you can choose any of the supported [destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/).
+
+## Set up user
+
+The Postgres user needs to have the `LOGIN` and `REPLICATION` attributes assigned:
+
+```sql
+CREATE ROLE replication_user WITH LOGIN REPLICATION;
+```
+
+It also needs read-only privileges on the database (granted after first connecting to the database):
+
+```sql
+\connect dlt_data
+GRANT USAGE ON SCHEMA schema_name TO replication_user;
+GRANT SELECT ON ALL TABLES IN SCHEMA public TO replication_user;
+ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO replication_user;
+```
+
+## Add credentials
+1. Open `.dlt/secrets.toml`.
+2. Enter your Postgres credentials:
+
+   ```toml
+   [sources.pg_legacy_replication]
+   credentials="postgresql://replication_user:<>@localhost:5432/dlt_data"
+   ```
+3. Enter credentials for your chosen destination as per the [docs](https://dlthub.com/docs/dlt-ecosystem/destinations/).
+
+## Run the pipeline
+
+1. Install the necessary dependencies by running the following command:
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+1. Now the pipeline can be run using the command:
+
+   ```bash
+   python pg_legacy_replication_pipeline.py
+   ```
+
+1. To make sure that everything is loaded as expected, use the command:
+
+   ```bash
+   dlt pipeline pg_replication_pipeline show
+   ```
+
+# Differences between `pg_legacy_replication` and `pg_replication`
+
+## Overview
+
+`pg_legacy_replication` is a fork of the verified `pg_replication` source. The primary goal of this fork is to provide logical replication capabilities for Postgres instances running versions
+earlier than 10, when the `pgoutput` plugin was not yet available. This fork draws inspiration from the original `pg_replication` source and the `decoderbufs` library,
+which is actively maintained by Debezium.
+
+## Key Differences from `pg_replication`
+
+### Replication User Ownership Requirements
+One of the limitations of native Postgres replication is that the replication user must **own** the tables in order to add them to a **publication**.
+Additionally, once a table is added to a publication, it cannot be removed, requiring the creation of a new replication slot, which results in the loss of any state tracking.
+
+### Limitations in `pg_replication`
+The current `pg_replication` implementation has several limitations:
+- It supports only a single initial snapshot of the data.
+- It requires `CREATE` access to the source database in order to perform the initial snapshot.
+- **Superuser** access is required to replicate entire Postgres schemas.
+  While the `pg_legacy_replication` source theoretically reads the entire WAL across all schemas, the current implementation using dlt transformers restricts this functionality.
+  In practice, this has not been a common use case.
+- The implementation is opinionated in its approach to data transfer. Specifically, when updates or deletes are required, it defaults to a `merge` write disposition,
+  which replicates live data without tracking changes over time.
+
+### Features of `pg_legacy_replication`
+
+This fork of `pg_replication` addresses the aforementioned limitations and introduces the following improvements:
+- Adheres to the dlt philosophy by treating the WAL as an upstream resource. This replication stream is then transformed into various DLT resources, with customizable options for write disposition,
+  file formats, type hints, etc., specified at the resource level rather than at the source level.
+- Supports an initial snapshot of all tables using the transaction snapshot exported by the replication slot. Additionally, ad-hoc snapshots can be performed using the serializable deferred isolation level,
+  similar to `pg_dump`.
+- Emphasizes the use of `pyarrow` and parquet formats for efficient data storage and transfer. A dedicated backend has been implemented to support these formats.
+- Replication messages are decoded using Protocol Buffers (protobufs) in C, rather than relying on native Python byte buffer parsing. This ensures greater efficiency and performance.
+
+## Next steps
+- Add support for the [wal2json](https://github.com/eulerto/wal2json) replication plugin. This is particularly important for environments such as **Amazon RDS**, which supports `wal2json`,
+  as opposed to on-premise or Google Cloud SQL instances that support `decoderbufs`.
\ No newline at end of file
diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py
new file mode 100644
index 000000000..1968e7883
--- /dev/null
+++ b/sources/pg_legacy_replication/__init__.py
@@ -0,0 +1,213 @@
+"""Replicates postgres tables in batch using logical decoding."""
+
+from collections import defaultdict
+from typing import Any, Callable, Iterable, Mapping, Optional, Sequence, Union
+
+import dlt
+from dlt.extract import DltResource
+from dlt.extract.items import TDataItem
+from dlt.sources.credentials import ConnectionStringCredentials
+from dlt.sources.sql_database import sql_table
+
+from .helpers import (
+    BackendHandler,
+    ItemGenerator,
+    ReplicationOptions,
+    SqlTableOptions,
+    advance_slot,
+    cleanup_snapshot_resources,
+    configure_engine,
+    create_replication_slot,
+    drop_replication_slot,
+    get_max_lsn,
+    get_rep_conn,
+)
+
+
+@dlt.source
+def replication_source(
+    slot_name: str,
+    schema: str,
+    table_names: Union[str, Sequence[str]],
+    credentials: ConnectionStringCredentials = dlt.secrets.value,
+    repl_options: Optional[Mapping[str, ReplicationOptions]] = None,
+    target_batch_size: int = 1000,
+    flush_slot: bool = True,
+) -> Iterable[DltResource]:
+    """
+    Defines a dlt source for replicating Postgres tables using logical replication.
+    This source reads from a replication slot and pipes the changes using transformers.
+
+    - Relies on a replication slot that publishes DML operations (i.e. `insert`, `update`, and `delete`).
+    - Maintains LSN of last consumed message in state to track progress.
+    - At start of the run, advances the slot up to the last consumed message in the previous run (for pg>10 only).
+    - Processes in batches to limit memory usage.
+
+    Args:
+        slot_name (str):
+            The name of the logical replication slot used to fetch WAL changes.
+        schema (str):
+            Name of the schema to replicate tables from.
+        table_names (Union[str, Sequence[str]]):
+            The name(s) of the tables to replicate. Can be a single table name or a list of table names.
+        credentials (ConnectionStringCredentials):
+            Database credentials for connecting to the Postgres instance.
+        repl_options (Optional[Mapping[str, ReplicationOptions]], optional):
+            A mapping of table names to `ReplicationOptions`, allowing for fine-grained control over
+            replication behavior for each table.
+ + Each `ReplicationOptions` dictionary can include the following keys: + - `backend` (Optional[TableBackend]): Specifies the backend to use for table replication. + - `backend_kwargs` (Optional[Mapping[str, Any]]): Additional configuration options for the backend. + - `column_hints` (Optional[TTableSchemaColumns]): A dictionary of hints for column types or properties. + - `include_lsn` (Optional[bool]): Whether to include the LSN (Log Sequence Number) + in the replicated data. Defaults to `True`. + - `include_deleted_ts` (Optional[bool]): Whether to include a timestamp for deleted rows. + Defaults to `True`. + - `include_commit_ts` (Optional[bool]): Whether to include the commit timestamp of each change. + - `include_tx_id` (Optional[bool]): Whether to include the transaction ID of each change. + - `included_columns` (Optional[Set[str]]): A set of specific columns to include in the replication. + If not specified, all columns are included. + target_batch_size (int, optional): + The target size of each batch of replicated data items. Defaults to `1000`. + flush_slot (bool, optional): + If `True`, advances the replication slot to the last processed LSN + to prevent replaying already replicated changes. Defaults to `True`. + + Yields: + Iterable[DltResource]: + A collection of `DltResource` objects, each corresponding to a table being replicated. + + Notes: + - The `repl_options` parameter allows fine-tuning of replication behavior, such as column filtering + or write disposition configuration, per table. + - The replication process is incremental, ensuring only new changes are processed after the last commit LSN. + """ + table_names = [table_names] if isinstance(table_names, str) else table_names or [] + repl_options = defaultdict(lambda: ReplicationOptions(), repl_options or {}) + + @dlt.resource(name=lambda args: args["slot_name"], standalone=True) + def replication_resource(slot_name: str) -> Iterable[TDataItem]: + # start where we left off in previous run + start_lsn = dlt.current.resource_state().get("last_commit_lsn", 0) + if flush_slot and start_lsn > 0: + advance_slot(start_lsn, slot_name, credentials) + + # continue until last message in replication slot + upto_lsn = get_max_lsn(credentials) + if upto_lsn is None: + return + + table_qnames = {f"{schema}.{table_name}" for table_name in table_names} + + # generate items in batches + while True: + gen = ItemGenerator( + credentials=credentials, + slot_name=slot_name, + table_qnames=table_qnames, + upto_lsn=upto_lsn, + start_lsn=start_lsn, + repl_options=repl_options, + target_batch_size=target_batch_size, + ) + yield from gen + if gen.generated_all: + dlt.current.resource_state()["last_commit_lsn"] = gen.last_commit_lsn + break + start_lsn = gen.last_commit_lsn + + wal_reader = replication_resource(slot_name) + + for table in table_names: + yield dlt.transformer( + _create_table_dispatch(table, repl_options=repl_options[table]), + data_from=wal_reader, + name=table, + ) + + +def _create_table_dispatch( + table: str, repl_options: ReplicationOptions +) -> Callable[[TDataItem], Any]: + """Creates a dispatch handler that processes data items based on a specified table and optional column hints.""" + handler = BackendHandler(table, repl_options) + # FIXME Uhhh.. why do I have to do this? 
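+    # Possible explanation (assumption, not verified against dlt internals): dlt inspects the
+    # callable's function metadata (e.g. __qualname__) when wrapping it as a transformer, and a
+    # callable class instance does not carry the metadata of a regular function, so a
+    # function-like qualname is assigned below to mimic a bound method.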
+ handler.__qualname__ = "BackendHandler.__call__" # type: ignore[attr-defined] + return handler + + +@dlt.source +def init_replication( + slot_name: str, + schema: str, + table_names: Optional[Union[str, Sequence[str]]] = None, + credentials: ConnectionStringCredentials = dlt.secrets.value, + take_snapshots: bool = False, + table_options: Optional[Mapping[str, SqlTableOptions]] = None, + reset: bool = False, +) -> Iterable[DltResource]: + """ + Initializes a replication session for Postgres using logical replication. + Optionally, snapshots of specified tables can be taken during initialization. + + Args: + slot_name (str): + The name of the logical replication slot to be used or created. + schema (str): + Name of the schema to replicate tables from. + table_names (Optional[Union[str, Sequence[str]]]): + The name(s) of the table(s) to replicate. Can be a single table name or a list of table names. + If not provided, no tables will be replicated unless `take_snapshots` is `True`. + credentials (ConnectionStringCredentials): + Database credentials for connecting to the Postgres instance. + take_snapshots (bool): + Whether to take initial snapshots of the specified tables. + Defaults to `False`. + table_options (Optional[Mapping[str, SqlTableOptions]]): + Additional options for configuring replication for specific tables. + These are the exact same parameters for the `dlt.sources.sql_database.sql_table` function. + Argument is only used if `take_snapshots` is `True`. + reset (bool, optional): + If `True`, drops the existing replication slot before creating a new one. + Use with caution, as this will clear existing replication state. + Defaults to `False`. + + Returns: + - None if `take_snapshots` is `False` + - a list of `DltResource` objects for the snapshot table(s) if `take_snapshots` is `True`. + + Notes: + - If `reset` is `True`, the existing replication slot will be dropped before creating a new one. + - When `take_snapshots` is `True`, the function configures a snapshot isolation level for consistent table snapshots. + """ + rep_conn = get_rep_conn(credentials) + rep_cur = rep_conn.cursor() + if reset: + drop_replication_slot(slot_name, rep_cur) + slot = create_replication_slot(slot_name, rep_cur) + + # Close connection if no snapshots are needed + if not take_snapshots: + rep_conn.close() + return + + assert table_names is not None + + engine = configure_engine( + credentials, rep_conn, slot.get("snapshot_name") if slot else None + ) + + table_names = [table_names] if isinstance(table_names, str) else table_names or [] + + for table in table_names: + table_args = (table_options or {}).get(table, {}).copy() + yield sql_table(credentials=engine, table=table, schema=schema, **table_args) + + +__all__ = [ + "ReplicationOptions", + "cleanup_snapshot_resources", + "init_replication", + "replication_source", +] diff --git a/sources/pg_legacy_replication/exceptions.py b/sources/pg_legacy_replication/exceptions.py new file mode 100644 index 000000000..99e3db420 --- /dev/null +++ b/sources/pg_legacy_replication/exceptions.py @@ -0,0 +1,6 @@ +# class SqlDatabaseSourceImportError(Exception): +# def __init__(self) -> None: +# super().__init__( +# "Could not import `sql_database` source. Run `dlt init sql_database `" +# " to download the source code." 
+# ) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py new file mode 100644 index 000000000..33f2ac2bc --- /dev/null +++ b/sources/pg_legacy_replication/helpers.py @@ -0,0 +1,645 @@ +import hashlib +from collections import defaultdict +from dataclasses import dataclass, field +from typing import ( + Any, + Callable, + DefaultDict, + Dict, + Iterable, + Iterator, + List, + Mapping, + NamedTuple, + Optional, + Sequence, + Set, + TypedDict, +) + +import dlt +import psycopg2 +from dlt.common import logger +from dlt.common.libs.sql_alchemy import Engine, MetaData, Table, sa +from dlt.common.pendulum import pendulum +from dlt.common.schema.typing import TColumnSchema, TTableSchema, TTableSchemaColumns +from dlt.common.schema.utils import merge_column +from dlt.common.typing import TDataItem +from dlt.extract import DltSource +from dlt.extract.items import DataItemWithMeta +from dlt.sources.credentials import ConnectionStringCredentials +from dlt.sources.sql_database import ( + ReflectionLevel, + TableBackend, + TQueryAdapter, + TTypeAdapter, + arrow_helpers as arrow, + engine_from_credentials, +) +from psycopg2.extensions import connection as ConnectionExt, cursor +from psycopg2.extras import ( + LogicalReplicationConnection, + ReplicationCursor, + ReplicationMessage, + StopReplication, +) + +from .pg_logicaldec_pb2 import DatumMessage, Op, RowMessage, TypeInfo +from .schema_types import _epoch_micros_to_datetime, _to_dlt_column_schema, _to_dlt_val + + +class ReplicationOptions(TypedDict, total=False): + backend: Optional[TableBackend] + backend_kwargs: Optional[Mapping[str, Any]] + column_hints: Optional[TTableSchemaColumns] + include_lsn: Optional[bool] # Default is true + include_deleted_ts: Optional[bool] # Default is true + include_commit_ts: Optional[bool] + include_tx_id: Optional[bool] + included_columns: Optional[Set[str]] + + +class SqlTableOptions(TypedDict, total=False): + backend: TableBackend + backend_kwargs: Optional[Dict[str, Any]] + chunk_size: int + defer_table_reflect: Optional[bool] + detect_precision_hints: Optional[bool] + included_columns: Optional[List[str]] + metadata: Optional[MetaData] + query_adapter_callback: Optional[TQueryAdapter] + reflection_level: Optional[ReflectionLevel] + table_adapter_callback: Optional[Callable[[Table], None]] + type_adapter_callback: Optional[TTypeAdapter] + + +def configure_engine( + credentials: ConnectionStringCredentials, + rep_conn: LogicalReplicationConnection, + snapshot_name: Optional[str], +) -> Engine: + """ + Configures the SQLAlchemy engine. + Also attaches the replication connection in order to prevent it being garbage collected and closed. + + Args: + snapshot_name (str, optional): This is used during the initial first table snapshot allowing + all transactions to run with the same consistent snapshot. 
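+
+    A minimal usage sketch (mirrors how `init_replication` calls it; the slot name is illustrative):
+
+        rep_conn = get_rep_conn(credentials)
+        slot = create_replication_slot("my_slot", rep_conn.cursor())
+        engine = configure_engine(
+            credentials, rep_conn, slot.get("snapshot_name") if slot else None
+        )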
+ """ + engine: Engine = engine_from_credentials(credentials) + engine.execution_options(stream_results=True, max_row_buffer=2 * 50000) + setattr(engine, "rep_conn", rep_conn) # noqa + + @sa.event.listens_for(engine, "begin") + def on_begin(conn: sa.Connection) -> None: + cur = conn.connection.cursor() + if snapshot_name is None: + # Using the same isolation level that pg_backup uses + cur.execute( + "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE, READ ONLY, DEFERRABLE" + ) + else: + cur.execute("SET TRANSACTION ISOLATION LEVEL REPEATABLE READ") + cur.execute(f"SET TRANSACTION SNAPSHOT '{snapshot_name}'") + + @sa.event.listens_for(engine, "engine_disposed") + def on_engine_disposed(e: Engine) -> None: + delattr(e, "rep_conn") + + return engine + + +def cleanup_snapshot_resources(snapshots: DltSource) -> None: + """FIXME Awful hack to release the underlying SQL engine when snapshotting tables""" + resources = snapshots.resources + if resources: + engine: Engine = next(iter(resources.values()))._explicit_args["credentials"] + engine.dispose() + + +def get_pg_version(cur: cursor) -> int: + """Returns Postgres server version as int.""" + return cur.connection.server_version + + +def create_replication_slot( # type: ignore[return] + name: str, cur: ReplicationCursor, output_plugin: str = "decoderbufs" +) -> Optional[Dict[str, str]]: + """Creates a replication slot if it doesn't exist yet.""" + try: + cur.create_replication_slot(name, output_plugin=output_plugin) + logger.info("Successfully created replication slot '%s'", name) + result = cur.fetchone() + return { + "slot_name": result[0], + "consistent_point": result[1], + "snapshot_name": result[2], + "output_plugin": result[3], + } + except psycopg2.errors.DuplicateObject: # the replication slot already exists + logger.info( + "Replication slot '%s' cannot be created because it already exists", name + ) + + +def drop_replication_slot(name: str, cur: ReplicationCursor) -> None: + """Drops a replication slot if it exists.""" + try: + cur.drop_replication_slot(name) + logger.info("Successfully dropped replication slot '%s'", name) + except psycopg2.errors.UndefinedObject: # the replication slot does not exist + logger.info( + "Replication slot '%s' cannot be dropped because it does not exist", name + ) + + +def get_max_lsn(credentials: ConnectionStringCredentials) -> Optional[int]: + """ + Returns maximum Log Sequence Number (LSN). + + Returns None if the replication slot is empty. + Does not consume the slot, i.e. messages are not flushed. + """ + cur = _get_conn(credentials).cursor() + try: + loc_fn = ( + "pg_current_xlog_location" + if get_pg_version(cur) < 100000 + else "pg_current_wal_lsn" + ) + # subtract '0/0' to convert pg_lsn type to int (https://stackoverflow.com/a/73738472) + cur.execute(f"SELECT {loc_fn}() - '0/0' as max_lsn;") + lsn: int = cur.fetchone()[0] + return lsn + finally: + cur.connection.close() + + +def lsn_int_to_hex(lsn: int) -> str: + """Convert integer LSN to postgres hexadecimal representation.""" + # https://stackoverflow.com/questions/66797767/lsn-external-representation. + return f"{lsn >> 32 & 4294967295:X}/{lsn & 4294967295:08X}" + + +def advance_slot( + upto_lsn: int, + slot_name: str, + credentials: ConnectionStringCredentials, +) -> None: + """ + Advances position in the replication slot. + + Flushes all messages upto (and including) the message with LSN = `upto_lsn`. 
+ This function is used as alternative to psycopg2's `send_feedback` method, because + the behavior of that method seems odd when used outside of `consume_stream`. + """ + assert upto_lsn > 0 + cur = _get_conn(credentials).cursor() + try: + # There is unfortunately no way in pg9.6 to manually advance the replication slot + if get_pg_version(cur) > 100000: + cur.execute( + f"SELECT * FROM pg_replication_slot_advance('{slot_name}', '{lsn_int_to_hex(upto_lsn)}');" + ) + finally: + cur.connection.close() + + +def _get_conn( + credentials: ConnectionStringCredentials, + connection_factory: Optional[Any] = None, +) -> ConnectionExt: + """Returns a psycopg2 connection to interact with postgres.""" + return psycopg2.connect( # type: ignore[no-any-return] + database=credentials.database, + user=credentials.username, + password=credentials.password, + host=credentials.host, + port=credentials.port, + connection_factory=connection_factory, + **({} if credentials.query is None else credentials.query), + ) + + +def get_rep_conn( + credentials: ConnectionStringCredentials, +) -> LogicalReplicationConnection: + """ + Returns a psycopg2 LogicalReplicationConnection to interact with postgres replication functionality. + + Raises error if the user does not have the REPLICATION attribute assigned. + """ + return _get_conn(credentials, LogicalReplicationConnection) # type: ignore[return-value] + + +class MessageConsumer: + """ + Consumes messages from a ReplicationCursor sequentially. + + Generates data item for each `insert`, `update`, and `delete` message. + Processes in batches to limit memory usage. + Maintains message data needed by subsequent messages in internal state. + """ + + def __init__( + self, + upto_lsn: int, + table_qnames: Set[str], + repl_options: DefaultDict[str, ReplicationOptions], + target_batch_size: int = 1000, + ) -> None: + self.upto_lsn = upto_lsn + self.table_qnames = table_qnames + self.target_batch_size = target_batch_size + self.repl_options = repl_options + + self.consumed_all: bool = False + # maps table names to list of data items + self.data_items: Dict[str, List[TDataItem]] = defaultdict(list) + # maps table name to table schema + self.last_table_schema: Dict[str, TTableSchema] = {} + # maps table names to new_typeinfo hashes + self.last_table_hashes: Dict[str, int] = {} + self.last_commit_ts: pendulum.DateTime + self.last_commit_lsn: int + + def __call__(self, msg: ReplicationMessage) -> None: + """Processes message received from stream.""" + self.process_msg(msg) + + def process_msg(self, msg: ReplicationMessage) -> None: + """Processes encoded replication message. + + Identifies message type and decodes accordingly. + Message treatment is different for various message types. 
+ Breaks out of stream with StopReplication exception when + - `upto_lsn` is reached + - `target_batch_size` is reached + - a table's schema has changed + """ + row_msg = RowMessage() + try: + row_msg.ParseFromString(msg.payload) + assert row_msg.op != Op.UNKNOWN, f"Unsupported operation : {row_msg}" + + if row_msg.op == Op.BEGIN: + # self.last_commit_ts = _epoch_micros_to_datetime(row_msg.commit_time) + pass + elif row_msg.op == Op.COMMIT: + self.process_commit(lsn=msg.data_start) + else: # INSERT, UPDATE or DELETE + self.process_change(row_msg, lsn=msg.data_start) + except StopReplication: + raise + except Exception: + logger.error( + "A fatal error occurred while processing a message: %s", row_msg + ) + raise + + def process_commit(self, lsn: int) -> None: + """ + Updates object state when Commit message is observed. + + Raises StopReplication when `upto_lsn` or `target_batch_size` is reached. + """ + self.last_commit_lsn = lsn + if lsn >= self.upto_lsn: + self.consumed_all = True + n_items = sum( + [len(items) for items in self.data_items.values()] + ) # combine items for all tables + if self.consumed_all or n_items >= self.target_batch_size: + raise StopReplication + + def process_change(self, msg: RowMessage, lsn: int) -> None: + """Processes replication message of type Insert, Update or Delete""" + if msg.table not in self.table_qnames: + return + table_name = msg.table.split(".")[1] + table_schema = self.get_table_schema(msg, table_name) + data_item = gen_data_item( + msg, table_schema["columns"], lsn, **self.repl_options[table_name] + ) + self.data_items[table_name].append(data_item) + + def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: + """Given a row message, calculates or fetches a table schema.""" + last_schema = self.last_table_schema.get(table_name) + + # Used cached schema if the operation is a DELETE since the inferred one will always be less precise + if msg.op == Op.DELETE and last_schema: + return last_schema + + # Return cached schema if hash matches + current_hash = hash_typeinfo(msg.new_typeinfo) + if current_hash == self.last_table_hashes.get(table_name): + return self.last_table_schema[table_name] + + new_schema = infer_table_schema(msg, **self.repl_options[table_name]) + if last_schema is None: + # Cache the inferred schema and hash if it is not already cached + self.last_table_schema[table_name] = new_schema + self.last_table_hashes[table_name] = current_hash + else: + try: + retained_schema = compare_schemas(last_schema, new_schema) + self.last_table_schema[table_name] = retained_schema + except AssertionError as e: + logger.debug(str(e)) + raise StopReplication + + return new_schema + + +def hash_typeinfo(new_typeinfo: Sequence[TypeInfo]) -> int: + """Generate a hash for the entire new_typeinfo list by hashing each TypeInfo message.""" + typeinfo_tuple = tuple( + (info.modifier, info.value_optional) for info in new_typeinfo + ) + hash_obj = hashlib.blake2b(repr(typeinfo_tuple).encode(), digest_size=8) + return int(hash_obj.hexdigest(), 16) + + +class TableItems(NamedTuple): + schema: TTableSchema + items: List[TDataItem] + + +@dataclass +class ItemGenerator: + credentials: ConnectionStringCredentials + slot_name: str + table_qnames: Set[str] + upto_lsn: int + start_lsn: int + repl_options: DefaultDict[str, ReplicationOptions] + target_batch_size: int = 1000 + last_commit_lsn: Optional[int] = field(default=None, init=False) + generated_all: bool = False + + def __iter__(self) -> Iterator[TableItems]: + """ + Yields data items/schema 
from MessageConsumer. + + Starts replication of messages from the replication slot. + Maintains LSN of last consumed commit message in object state. + Advances the slot only when all messages have been consumed. + """ + cur = get_rep_conn(self.credentials).cursor() + consumer = MessageConsumer( + upto_lsn=self.upto_lsn, + table_qnames=self.table_qnames, + repl_options=self.repl_options, + target_batch_size=self.target_batch_size, + ) + try: + cur.start_replication(slot_name=self.slot_name, start_lsn=self.start_lsn) + cur.consume_stream(consumer) + except StopReplication: # completed batch or reached `upto_lsn` + yield from self.flush_batch(cur, consumer) + finally: + cur.connection.close() + + def flush_batch( + self, cur: ReplicationCursor, consumer: MessageConsumer + ) -> Iterator[TableItems]: + last_commit_lsn = consumer.last_commit_lsn + consumed_all = consumer.consumed_all + for table, data_items in consumer.data_items.items(): + logger.info("Flushing %s events for table '%s'", len(data_items), table) + yield TableItems(consumer.last_table_schema[table], data_items) + if consumed_all: + cur.send_feedback( + write_lsn=last_commit_lsn, + flush_lsn=last_commit_lsn, + reply=True, + force=True, + ) + else: + cur.send_feedback(write_lsn=last_commit_lsn, reply=True, force=True) + self.last_commit_lsn = last_commit_lsn + self.generated_all = consumed_all + + +@dataclass +class BackendHandler: + """ + Consumes messages from ItemGenerator once a batch is ready for emitting. + + It is mainly responsible for emitting schema and dict data times or transforming + into arrow tables. + """ + + table: str + repl_options: ReplicationOptions + + def __call__(self, table_items: TableItems) -> Iterable[DataItemWithMeta]: + if table_items.schema["name"] != self.table: + return + + # Apply column hints if provided + columns = table_items.schema["columns"] + if column_hints := self.repl_options.get("column_hints"): + for col_name, col_hint in column_hints.items(): + if col_name in columns: + columns[col_name] = merge_column(columns[col_name], col_hint) + + # Process based on backend + data = table_items.items + backend = self.repl_options.get("backend", "sqlalchemy") + try: + if backend == "sqlalchemy": + yield from self.emit_schema_and_items(columns, data) + elif backend == "pyarrow": + yield from self.emit_arrow_table(columns, data) + else: + raise NotImplementedError(f"Unsupported backend: {backend}") + except Exception: + logger.error( + "A fatal error occurred while processing batch for '%s' (columns=%s, data=%s)", + self.table, + columns, + data, + ) + raise + + def emit_schema_and_items( + self, columns: TTableSchemaColumns, items: List[TDataItem] + ) -> Iterator[DataItemWithMeta]: + yield dlt.mark.with_hints( + [], + dlt.mark.make_hints(table_name=self.table, columns=columns), + create_table_variant=True, + ) + yield dlt.mark.with_table_name(items, self.table) + + def emit_arrow_table( + self, columns: TTableSchemaColumns, items: List[TDataItem] + ) -> Iterator[DataItemWithMeta]: + # Create rows for pyarrow using ordered column keys + rows = [ + tuple(item.get(column, None) for column in list(columns.keys())) + for item in items + ] + tz = self.repl_options.get("backend_kwargs", {}).get("tz", "UTC") + yield dlt.mark.with_table_name( + arrow.row_tuples_to_arrow(rows, columns=columns, tz=tz), + self.table, + ) + + +def infer_table_schema( + msg: RowMessage, + include_lsn: bool = True, + include_deleted_ts: bool = True, + include_commit_ts: bool = False, + include_tx_id: bool = False, + 
included_columns: Optional[Set[str]] = None, + **_: Any, +) -> TTableSchema: + """Infers the table schema from the replication message and optional hints.""" + # Choose the correct source based on operation type + is_change = msg.op != Op.DELETE + tuples = msg.new_tuple if is_change else msg.old_tuple + schema = TTableSchema(name=msg.table.split(".")[1]) + + # Filter and map columns, conditionally using `new_typeinfo` when available + schema["columns"] = { + col_name: _to_dlt_column_schema( + col_name, datum=col, type_info=msg.new_typeinfo[i] if is_change else None + ) + for i, col in enumerate(tuples) + if (col_name := _actual_column_name(col)) + and (not included_columns or col_name in included_columns) + } + + # Add replication columns + if include_lsn: + schema["columns"]["_pg_lsn"] = { + "data_type": "bigint", + "name": "_pg_lsn", + "nullable": True, + } + if include_deleted_ts: + schema["columns"]["_pg_deleted_ts"] = { + "data_type": "timestamp", + "name": "_pg_deleted_ts", + "nullable": True, + } + if include_commit_ts: + schema["columns"]["_pg_commit_ts"] = { + "data_type": "timestamp", + "name": "_pg_commit_ts", + "nullable": True, + } + if include_tx_id: + schema["columns"]["_pg_tx_id"] = { + "data_type": "bigint", + "name": "_pg_tx_id", + "nullable": True, + "precision": 32, + } + + return schema + + +def gen_data_item( + msg: RowMessage, + column_schema: TTableSchemaColumns, + lsn: int, + include_lsn: bool = True, + include_deleted_ts: bool = True, + include_commit_ts: bool = False, + include_tx_id: bool = False, + included_columns: Optional[Set[str]] = None, + **_: Any, +) -> TDataItem: + """Generates data item from a row message and corresponding metadata.""" + data_item: TDataItem = {} + if include_lsn: + data_item["_pg_lsn"] = lsn + if include_commit_ts: + data_item["_pg_commit_ts"] = _epoch_micros_to_datetime(msg.commit_time) + if include_tx_id: + data_item["_pg_tx_id"] = msg.transaction_id + + # Select the relevant row tuple based on operation type + is_delete = msg.op == Op.DELETE + row = msg.old_tuple if is_delete else msg.new_tuple + if is_delete and include_deleted_ts: + data_item["_pg_deleted_ts"] = _epoch_micros_to_datetime(msg.commit_time) + + for data in row: + col_name = _actual_column_name(data) + if not included_columns or col_name in included_columns: + data_item[col_name] = _to_dlt_val( + data, column_schema[col_name], for_delete=is_delete + ) + + return data_item + + +def _actual_column_name(column: DatumMessage) -> str: + """ + Certain column names are quoted since they are reserved keywords, + however let the destination decide on how to normalize them + """ + col_name = column.column_name + if col_name.startswith('"') and col_name.endswith('"'): + col_name = col_name[1:-1] + return col_name + + +ALLOWED_COL_SCHEMA_FIELDS: Set[str] = { + "name", + "data_type", + "nullable", + "precision", + "scale", +} + + +def compare_schemas(last: TTableSchema, new: TTableSchema) -> TTableSchema: + """ + Compares the last schema with the new one and chooses the more + precise one if they are relatively equal or else raises a + AssertionError due to an incompatible schema change + """ + assert last["name"] == new["name"], "Table names do not match" + + table_schema = TTableSchema(name=last["name"], columns={}) + last_cols, new_cols = last["columns"], new["columns"] + assert len(last_cols) == len( + new_cols + ), f"Columns mismatch last:{last_cols} new:{new_cols}" + + for name, s1 in last_cols.items(): + s2 = new_cols.get(name) + assert ( + s2 and s1["data_type"] == 
s2["data_type"] + ), f"Incompatible schema for column '{name}'" + + # Ensure new has no fields outside allowed fields + extra_fields = set(s2.keys()) - ALLOWED_COL_SCHEMA_FIELDS + assert not extra_fields, f"Unexpected fields {extra_fields} in column '{name}'" + + # Select the more precise schema by comparing nullable, precision, and scale + col_schema = TColumnSchema(name=name, data_type=s1["data_type"]) + if "nullable" in s1 or "nullable" in s2: + # Get nullable values (could be True, False, or None) + s1_null = s1.get("nullable") + s2_null = s2.get("nullable") + if s1_null is not None and s2_null is not None: + col_schema["nullable"] = s1_null or s2_null # Default is True + else: + col_schema["nullable"] = s1_null if s1_null is not None else s2_null + if "precision" in s1 or "precision" in s2: + col_schema["precision"] = s1.get("precision", s2.get("precision")) + if "scale" in s1 or "scale" in s2: + col_schema["scale"] = s1.get("scale", s2.get("scale")) + + # Update with the more detailed schema per column + table_schema["columns"][name] = col_schema + + return table_schema diff --git a/sources/pg_legacy_replication/pg_logicaldec.proto b/sources/pg_legacy_replication/pg_logicaldec.proto new file mode 100644 index 000000000..43371f5a8 --- /dev/null +++ b/sources/pg_legacy_replication/pg_logicaldec.proto @@ -0,0 +1,50 @@ +package decoderbufs; + +option java_package="io.debezium.connector.postgresql.proto"; +option java_outer_classname = "PgProto"; +option optimize_for = SPEED; + +enum Op { + UNKNOWN = -1; + INSERT = 0; + UPDATE = 1; + DELETE = 2; + BEGIN = 3; + COMMIT = 4; +} + +message Point { + required double x = 1; + required double y = 2; +} + +message DatumMessage { + optional string column_name = 1; + optional int64 column_type = 2; + oneof datum { + int32 datum_int32 = 3; + int64 datum_int64 = 4; + float datum_float = 5; + double datum_double = 6; + bool datum_bool = 7; + string datum_string = 8; + bytes datum_bytes = 9; + Point datum_point = 10; + bool datum_missing = 11; + } +} + +message TypeInfo { + required string modifier = 1; + required bool value_optional = 2; +} + +message RowMessage { + optional uint32 transaction_id = 1; + optional uint64 commit_time = 2; + optional string table = 3; + optional Op op = 4; + repeated DatumMessage new_tuple = 5; + repeated DatumMessage old_tuple = 6; + repeated TypeInfo new_typeinfo = 7; +} diff --git a/sources/pg_legacy_replication/pg_logicaldec_pb2.py b/sources/pg_legacy_replication/pg_logicaldec_pb2.py new file mode 100644 index 000000000..08fa960a1 --- /dev/null +++ b/sources/pg_legacy_replication/pg_logicaldec_pb2.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: pg_logicaldec.proto +# Protobuf Python Version: 5.26.1 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder + +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x13pg_logicaldec.proto\x12\x0b\x64\x65\x63oderbufs"\x1d\n\x05Point\x12\t\n\x01x\x18\x01 \x02(\x01\x12\t\n\x01y\x18\x02 \x02(\x01"\xa7\x02\n\x0c\x44\x61tumMessage\x12\x13\n\x0b\x63olumn_name\x18\x01 \x01(\t\x12\x13\n\x0b\x63olumn_type\x18\x02 \x01(\x03\x12\x15\n\x0b\x64\x61tum_int32\x18\x03 \x01(\x05H\x00\x12\x15\n\x0b\x64\x61tum_int64\x18\x04 \x01(\x03H\x00\x12\x15\n\x0b\x64\x61tum_float\x18\x05 \x01(\x02H\x00\x12\x16\n\x0c\x64\x61tum_double\x18\x06 \x01(\x01H\x00\x12\x14\n\ndatum_bool\x18\x07 \x01(\x08H\x00\x12\x16\n\x0c\x64\x61tum_string\x18\x08 \x01(\tH\x00\x12\x15\n\x0b\x64\x61tum_bytes\x18\t \x01(\x0cH\x00\x12)\n\x0b\x64\x61tum_point\x18\n \x01(\x0b\x32\x12.decoderbufs.PointH\x00\x12\x17\n\rdatum_missing\x18\x0b \x01(\x08H\x00\x42\x07\n\x05\x64\x61tum"4\n\x08TypeInfo\x12\x10\n\x08modifier\x18\x01 \x02(\t\x12\x16\n\x0evalue_optional\x18\x02 \x02(\x08"\xee\x01\n\nRowMessage\x12\x16\n\x0etransaction_id\x18\x01 \x01(\r\x12\x13\n\x0b\x63ommit_time\x18\x02 \x01(\x04\x12\r\n\x05table\x18\x03 \x01(\t\x12\x1b\n\x02op\x18\x04 \x01(\x0e\x32\x0f.decoderbufs.Op\x12,\n\tnew_tuple\x18\x05 \x03(\x0b\x32\x19.decoderbufs.DatumMessage\x12,\n\told_tuple\x18\x06 \x03(\x0b\x32\x19.decoderbufs.DatumMessage\x12+\n\x0cnew_typeinfo\x18\x07 \x03(\x0b\x32\x15.decoderbufs.TypeInfo*U\n\x02Op\x12\x14\n\x07UNKNOWN\x10\xff\xff\xff\xff\xff\xff\xff\xff\xff\x01\x12\n\n\x06INSERT\x10\x00\x12\n\n\x06UPDATE\x10\x01\x12\n\n\x06\x44\x45LETE\x10\x02\x12\t\n\x05\x42\x45GIN\x10\x03\x12\n\n\x06\x43OMMIT\x10\x04\x42\x33\n&io.debezium.connector.postgresql.protoB\x07PgProtoH\x01' +) + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "pg_logicaldec_pb2", _globals) +if not _descriptor._USE_C_DESCRIPTORS: + _globals["DESCRIPTOR"]._loaded_options = None + _globals[ + "DESCRIPTOR" + ]._serialized_options = ( + b"\n&io.debezium.connector.postgresql.protoB\007PgProtoH\001" + ) + _globals["_OP"]._serialized_start = 660 + _globals["_OP"]._serialized_end = 745 + _globals["_POINT"]._serialized_start = 36 + _globals["_POINT"]._serialized_end = 65 + _globals["_DATUMMESSAGE"]._serialized_start = 68 + _globals["_DATUMMESSAGE"]._serialized_end = 363 + _globals["_TYPEINFO"]._serialized_start = 365 + _globals["_TYPEINFO"]._serialized_end = 417 + _globals["_ROWMESSAGE"]._serialized_start = 420 + _globals["_ROWMESSAGE"]._serialized_end = 658 +# @@protoc_insertion_point(module_scope) diff --git a/sources/pg_legacy_replication/pg_logicaldec_pb2.pyi b/sources/pg_legacy_replication/pg_logicaldec_pb2.pyi new file mode 100644 index 000000000..abd25bf22 --- /dev/null +++ b/sources/pg_legacy_replication/pg_logicaldec_pb2.pyi @@ -0,0 +1,166 @@ +""" +@generated by mypy-protobuf. Do not edit manually! 
+isort:skip_file +""" + +import builtins +import collections.abc +import google.protobuf.descriptor +import google.protobuf.internal.containers +import google.protobuf.internal.enum_type_wrapper +import google.protobuf.message +import sys +import typing + +if sys.version_info >= (3, 10): + import typing as typing_extensions +else: + import typing_extensions + +DESCRIPTOR: google.protobuf.descriptor.FileDescriptor + +class _Op: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + +class _OpEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[_Op.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + UNKNOWN: _Op.ValueType # -1 + INSERT: _Op.ValueType # 0 + UPDATE: _Op.ValueType # 1 + DELETE: _Op.ValueType # 2 + BEGIN: _Op.ValueType # 3 + COMMIT: _Op.ValueType # 4 + +class Op(_Op, metaclass=_OpEnumTypeWrapper): ... + +UNKNOWN: Op.ValueType # -1 +INSERT: Op.ValueType # 0 +UPDATE: Op.ValueType # 1 +DELETE: Op.ValueType # 2 +BEGIN: Op.ValueType # 3 +COMMIT: Op.ValueType # 4 +global___Op = Op + +@typing.final +class Point(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + X_FIELD_NUMBER: builtins.int + Y_FIELD_NUMBER: builtins.int + x: builtins.float + y: builtins.float + def __init__( + self, + *, + x: builtins.float | None = ..., + y: builtins.float | None = ..., + ) -> None: ... + def HasField(self, field_name: typing.Literal["x", b"x", "y", b"y"]) -> builtins.bool: ... + def ClearField(self, field_name: typing.Literal["x", b"x", "y", b"y"]) -> None: ... + +global___Point = Point + +@typing.final +class DatumMessage(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + COLUMN_NAME_FIELD_NUMBER: builtins.int + COLUMN_TYPE_FIELD_NUMBER: builtins.int + DATUM_INT32_FIELD_NUMBER: builtins.int + DATUM_INT64_FIELD_NUMBER: builtins.int + DATUM_FLOAT_FIELD_NUMBER: builtins.int + DATUM_DOUBLE_FIELD_NUMBER: builtins.int + DATUM_BOOL_FIELD_NUMBER: builtins.int + DATUM_STRING_FIELD_NUMBER: builtins.int + DATUM_BYTES_FIELD_NUMBER: builtins.int + DATUM_POINT_FIELD_NUMBER: builtins.int + DATUM_MISSING_FIELD_NUMBER: builtins.int + column_name: builtins.str + column_type: builtins.int + datum_int32: builtins.int + datum_int64: builtins.int + datum_float: builtins.float + datum_double: builtins.float + datum_bool: builtins.bool + datum_string: builtins.str + datum_bytes: builtins.bytes + datum_missing: builtins.bool + @property + def datum_point(self) -> global___Point: ... + def __init__( + self, + *, + column_name: builtins.str | None = ..., + column_type: builtins.int | None = ..., + datum_int32: builtins.int | None = ..., + datum_int64: builtins.int | None = ..., + datum_float: builtins.float | None = ..., + datum_double: builtins.float | None = ..., + datum_bool: builtins.bool | None = ..., + datum_string: builtins.str | None = ..., + datum_bytes: builtins.bytes | None = ..., + datum_point: global___Point | None = ..., + datum_missing: builtins.bool | None = ..., + ) -> None: ... + def HasField(self, field_name: typing.Literal["column_name", b"column_name", "column_type", b"column_type", "datum", b"datum", "datum_bool", b"datum_bool", "datum_bytes", b"datum_bytes", "datum_double", b"datum_double", "datum_float", b"datum_float", "datum_int32", b"datum_int32", "datum_int64", b"datum_int64", "datum_missing", b"datum_missing", "datum_point", b"datum_point", "datum_string", b"datum_string"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing.Literal["column_name", b"column_name", "column_type", b"column_type", "datum", b"datum", "datum_bool", b"datum_bool", "datum_bytes", b"datum_bytes", "datum_double", b"datum_double", "datum_float", b"datum_float", "datum_int32", b"datum_int32", "datum_int64", b"datum_int64", "datum_missing", b"datum_missing", "datum_point", b"datum_point", "datum_string", b"datum_string"]) -> None: ... + def WhichOneof(self, oneof_group: typing.Literal["datum", b"datum"]) -> typing.Literal["datum_int32", "datum_int64", "datum_float", "datum_double", "datum_bool", "datum_string", "datum_bytes", "datum_point", "datum_missing"] | None: ... + +global___DatumMessage = DatumMessage + +@typing.final +class TypeInfo(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + MODIFIER_FIELD_NUMBER: builtins.int + VALUE_OPTIONAL_FIELD_NUMBER: builtins.int + modifier: builtins.str + value_optional: builtins.bool + def __init__( + self, + *, + modifier: builtins.str | None = ..., + value_optional: builtins.bool | None = ..., + ) -> None: ... + def HasField(self, field_name: typing.Literal["modifier", b"modifier", "value_optional", b"value_optional"]) -> builtins.bool: ... + def ClearField(self, field_name: typing.Literal["modifier", b"modifier", "value_optional", b"value_optional"]) -> None: ... + +global___TypeInfo = TypeInfo + +@typing.final +class RowMessage(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + TRANSACTION_ID_FIELD_NUMBER: builtins.int + COMMIT_TIME_FIELD_NUMBER: builtins.int + TABLE_FIELD_NUMBER: builtins.int + OP_FIELD_NUMBER: builtins.int + NEW_TUPLE_FIELD_NUMBER: builtins.int + OLD_TUPLE_FIELD_NUMBER: builtins.int + NEW_TYPEINFO_FIELD_NUMBER: builtins.int + transaction_id: builtins.int + commit_time: builtins.int + table: builtins.str + op: global___Op.ValueType + @property + def new_tuple(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___DatumMessage]: ... + @property + def old_tuple(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___DatumMessage]: ... + @property + def new_typeinfo(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___TypeInfo]: ... + def __init__( + self, + *, + transaction_id: builtins.int | None = ..., + commit_time: builtins.int | None = ..., + table: builtins.str | None = ..., + op: global___Op.ValueType | None = ..., + new_tuple: collections.abc.Iterable[global___DatumMessage] | None = ..., + old_tuple: collections.abc.Iterable[global___DatumMessage] | None = ..., + new_typeinfo: collections.abc.Iterable[global___TypeInfo] | None = ..., + ) -> None: ... + def HasField(self, field_name: typing.Literal["commit_time", b"commit_time", "op", b"op", "table", b"table", "transaction_id", b"transaction_id"]) -> builtins.bool: ... + def ClearField(self, field_name: typing.Literal["commit_time", b"commit_time", "new_tuple", b"new_tuple", "new_typeinfo", b"new_typeinfo", "old_tuple", b"old_tuple", "op", b"op", "table", b"table", "transaction_id", b"transaction_id"]) -> None: ... 
+ +global___RowMessage = RowMessage diff --git a/sources/pg_legacy_replication/requirements.txt b/sources/pg_legacy_replication/requirements.txt new file mode 100644 index 000000000..85f40b3e5 --- /dev/null +++ b/sources/pg_legacy_replication/requirements.txt @@ -0,0 +1,4 @@ +dlt>=1.3.0 +psycopg2-binary>=2.9.9 +protobuf>=5 +sqlalchemy>=1.4 \ No newline at end of file diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py new file mode 100644 index 000000000..4f1b3477a --- /dev/null +++ b/sources/pg_legacy_replication/schema_types.py @@ -0,0 +1,222 @@ +import json +import re +from functools import lru_cache +from typing import Any, Callable, Dict, List, Optional, Tuple + +import pendulum +from dlt.common import Decimal, logger +from dlt.common.data_types.type_helpers import coerce_value +from dlt.common.data_types.typing import TDataType +from dlt.common.schema.typing import TColumnSchema, TColumnType +from dlt.destinations.impl.postgres.factory import PostgresTypeMapper + +from .pg_logicaldec_pb2 import DatumMessage, TypeInfo + +_DUMMY_VALS: Dict[TDataType, Any] = { + "bigint": 0, + "binary": b" ", + "bool": True, + "json": [0], + "date": pendulum.Date(1970, 1, 1), + "decimal": Decimal(0), + "double": 0.0, + "text": "", + "time": pendulum.Time(), + "timestamp": pendulum.from_timestamp(0), + "wei": 0, +} +"""Dummy values used to replace NULLs in NOT NULL columns in key-only delete records.""" + +_PG_TYPES: Dict[int, str] = { + 16: "boolean", + 17: "bytea", + 20: "bigint", + 21: "smallint", + 23: "integer", + 25: "text", + 700: "real", + 701: "double precision", + 1043: "character varying", + 1082: "date", + 1083: "time without time zone", + 1114: "timestamp without time zone", + 1184: "timestamp with time zone", + 1700: "numeric", + 3802: "jsonb", +} +"""Maps postgres type OID to type string.""" + +_MISSING_TYPES: Dict[str, TDataType] = { + "real": "double", + "text": "text", + "timestamp without time zone": "timestamp", +} +# FIXME Missing types for old postgres versions + +_DATUM_RAW_TYPES: Dict[str, TDataType] = { + "datum_int32": "bigint", + "datum_int64": "bigint", + "datum_float": "double", + "datum_double": "double", + "datum_bool": "bool", + "datum_string": "text", + "datum_bytes": "binary", +} +"""Maps decoderbuf's datum msg type to dlt type.""" + +_FIXED_PRECISION_TYPES: Dict[int, Tuple[int, Optional[int]]] = { + 21: (16, None), # smallint + 23: (32, None), # integer + 20: (64, None), # bigint + 700: (32, None), # real +} +"""Dict for fixed precision types""" + +_VARYING_PRECISION_PATTERNS: Dict[int, str] = { + 1043: r"character varying\((\d+)\)", + 1700: r"numeric\((\d+),(\d+)\)", + 1184: r"timestamp\((\d+)\) with time zone", + 1083: r"time\((\d+)\) without time zone", +} +"""Regex patterns for precision/scale types""" + + +def _get_precision_and_scale( + type_id: int, modifier: Optional[str] +) -> Tuple[Optional[int], Optional[int]]: + """Get precision from postgres type attributes and modifiers.""" + if type_id in _FIXED_PRECISION_TYPES: + return _FIXED_PRECISION_TYPES[type_id] + + # If modifier or pattern is missing, return defaults + if not modifier or (pattern := _VARYING_PRECISION_PATTERNS.get(type_id)) is None: + return None, None + + if match := re.search(pattern, modifier): + groups = match.groups() + precision = int(groups[0]) + scale = int(groups[1]) if len(groups) > 1 else None + return precision, scale + + return None, None + + +@lru_cache(maxsize=None) +def _type_mapper() -> PostgresTypeMapper: + from 
dlt.destinations import postgres + + return PostgresTypeMapper(postgres().capabilities()) + + +def _to_dlt_column_type(type_id: int, modifier: Optional[str]) -> TColumnType: + """ + Converts postgres type OID to dlt column type. + + Type OIDs not in _PG_TYPES mapping default to "text" type. + """ + pg_type = _PG_TYPES.get(type_id) + if pg_type in _MISSING_TYPES: + return {"data_type": _MISSING_TYPES[pg_type]} + if modifier and modifier.endswith("[]"): + return {"data_type": "json"} + if pg_type is None: + logger.warning( + "No type found for type_id '%s' and modifier '%s'", type_id, modifier + ) + pg_type = "character varying" + + precision, scale = _get_precision_and_scale(type_id, modifier) + return _type_mapper().from_destination_type(pg_type, precision, scale) + + +def _to_dlt_column_schema( + col_name: str, datum: DatumMessage, type_info: Optional[TypeInfo] +) -> TColumnSchema: + """Converts decoderbuf's datum value/typeinfo to dlt column schema.""" + column_schema: TColumnSchema = { + "name": col_name, + **_to_dlt_column_type( + datum.column_type, type_info.modifier if type_info else None + ), + } + + # Set nullable attribute if type_info is available + if type_info: + column_schema["nullable"] = type_info.value_optional + + return column_schema + + +def _epoch_micros_to_datetime(microseconds_since_1970: int) -> pendulum.DateTime: + return pendulum.from_timestamp(microseconds_since_1970 / 1_000_000) + + +def _microseconds_to_time(microseconds: int) -> pendulum.Time: + return pendulum.Time().add(microseconds=microseconds) + + +def _epoch_days_to_date(epoch_days: int) -> pendulum.Date: + return pendulum.Date(1970, 1, 1).add(days=epoch_days) + + +data_type_handlers: Dict[TDataType, Callable[[Any], Any]] = { + "date": _epoch_days_to_date, + "time": _microseconds_to_time, + "timestamp": _epoch_micros_to_datetime, +} +"""Dispatch table for type conversions""" + + +def _to_dlt_val( + val: DatumMessage, col_schema: TColumnSchema, *, for_delete: bool = False +) -> Any: + """Converts decoderbuf's datum value into dlt-compatible data value.""" + data_type = col_schema["data_type"] + assert data_type is not None + datum = _get_datum_attr(val) + if datum is None: + nullable = col_schema.get("nullable", False) + if for_delete and not nullable: + return _DUMMY_VALS[data_type] + return None + + raw_value = getattr(val, datum) + if data_type in data_type_handlers: + return data_type_handlers[data_type](raw_value) + + raw_type = _DATUM_RAW_TYPES[datum] + if raw_type == "binary" and _is_scalar_pg_array(data_type, raw_value): + return _pg_array_to_json_array(raw_value) + + return coerce_value(data_type, raw_type, raw_value) + + +def _is_scalar_pg_array(data_type: TDataType, raw_value: bytes) -> bool: + return ( + len(raw_value) > 1 + and data_type == "json" + and raw_value[0] == ord("{") + and raw_value[-1] == ord("}") + ) + + +def _pg_array_to_json_array(raw_value: bytes) -> List[Any]: + """ + Decode the byte string into a scalar array + """ + without_braces = raw_value[1:-1].decode() + + def safe_load(x: str) -> Any: + try: + return json.loads(x) + except json.JSONDecodeError: + return x + + return [safe_load(x) for x in without_braces.split(",")] + + +def _get_datum_attr(val: DatumMessage) -> Optional[str]: + datum = val.WhichOneof("datum") + if datum is None or datum == "datum_missing": + return None + return datum diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py new file mode 100644 index 000000000..be38414bc --- /dev/null +++ 
b/sources/pg_legacy_replication_pipeline.py
@@ -0,0 +1,239 @@
+import dlt
+from dlt.common.destination import Destination
+from dlt.destinations.impl.postgres.configuration import PostgresCredentials
+
+from pg_legacy_replication import init_replication, replication_source
+
+PG_CREDS = dlt.secrets.get("sources.pg_replication.credentials", PostgresCredentials)
+
+
+def replicate_single_table() -> None:
+    """Sets up replication for a single Postgres table and loads changes into a destination.
+
+    Demonstrates basic usage of the `init_replication` helper and the `replication_source` source.
+    Uses `src_pl` to create and change the replicated Postgres table; this
+    is only for demonstration purposes, since in production another process
+    will typically be feeding your Postgres instance.
+    """
+    # create source and destination pipelines
+    src_pl = get_postgres_pipeline()
+    dest_pl = dlt.pipeline(
+        pipeline_name="pg_replication_pipeline",
+        destination="duckdb",
+        dataset_name="replicate_single_table",
+        dev_mode=True,
+    )
+
+    # create table "my_source_table" in source to demonstrate replication
+    create_source_table(
+        src_pl, "CREATE TABLE {table_name} (id integer PRIMARY KEY, val bool);"
+    )
+
+    # initialize replication for the source table; this creates a replication slot
+    slot_name = "example_slot"
+    init_replication(  # requires the Postgres user to have the REPLICATION attribute assigned
+        slot_name=slot_name,
+        schema=src_pl.dataset_name,
+        table_names="my_source_table",
+        reset=True,
+    )
+
+    # create a replication source that generates items for each change in the table
+    changes = replication_source(
+        slot_name=slot_name,
+        schema=src_pl.dataset_name,
+        table_names="my_source_table",
+    )
+    changes.my_source_table.apply_hints(
+        write_disposition="merge",
+        primary_key="id",
+        columns={
+            "_pg_deleted_ts": {"hard_delete": True},
+            "_pg_lsn": {"dedup_sort": "desc"},
+        },
+    )
+
+    # insert two records in source table and propagate changes to destination
+    change_source_table(
+        src_pl, "INSERT INTO {table_name} VALUES (1, true), (2, false);"
+    )
+    dest_pl.run(changes)
+    show_destination_table(dest_pl)
+
+    # update record in source table and propagate change to destination
+    change_source_table(src_pl, "UPDATE {table_name} SET val = true WHERE id = 2;")
+    dest_pl.run(changes)
+    show_destination_table(dest_pl)
+
+    # delete record from source table and propagate change to destination
+    change_source_table(src_pl, "DELETE FROM {table_name} WHERE id = 2;")
+    dest_pl.run(changes)
+    show_destination_table(dest_pl)
+
+
+def replicate_with_initial_load() -> None:
+    """Sets up replication with initial load.
+
+    Demonstrates usage of `take_snapshots` argument and snapshot resource
+    returned by `init_replication` helper.
+    """
+    # create source and destination pipelines
+    src_pl = get_postgres_pipeline()
+    dest_pl = dlt.pipeline(
+        pipeline_name="pg_replication_pipeline",
+        destination="duckdb",
+        dataset_name="replicate_with_initial_load",
+        dev_mode=True,
+    )
+
+    # create table "my_source_table" in source to demonstrate replication
+    create_source_table(
+        src_pl, "CREATE TABLE {table_name} (id integer PRIMARY KEY, val bool);"
+    )
+
+    # insert records before initializing replication
+    change_source_table(
+        src_pl, "INSERT INTO {table_name} VALUES (1, true), (2, false);"
+    )
+
+    # initialize replication for the source table
+    slot_name = "example_slot"
+    snapshot = init_replication(  # requires the Postgres user to have the REPLICATION attribute assigned
+        slot_name=slot_name,
+        schema=src_pl.dataset_name,
+        table_names="my_source_table",
+        take_snapshots=True,  # let function return resource(s) for initial load
+        reset=True,
+    )
+
+    # perform initial load to capture all records present in source table prior to replication initialization
+    dest_pl.run(snapshot)
+    show_destination_table(dest_pl)
+
+    # insert record in source table and propagate change to destination
+    change_source_table(src_pl, "INSERT INTO {table_name} VALUES (3, true);")
+    changes = replication_source(
+        slot_name=slot_name,
+        schema=src_pl.dataset_name,
+        table_names="my_source_table",
+    )
+    dest_pl.run(changes)
+    show_destination_table(dest_pl)
+
+
+def replicate_with_column_selection() -> None:
+    """Sets up replication with column selection.
+
+    Demonstrates usage of the `included_columns` option in `repl_options`.
+    """
+    # create source and destination pipelines
+    src_pl = get_postgres_pipeline()
+    dest_pl = dlt.pipeline(
+        pipeline_name="pg_replication_pipeline",
+        destination="duckdb",
+        dataset_name="replicate_with_column_selection",
+        dev_mode=True,
+    )
+
+    # create two source tables to demonstrate per-table column selection
+    create_source_table(
+        src_pl,
+        "CREATE TABLE {table_name} (c1 integer PRIMARY KEY, c2 bool, c3 varchar);",
+        "tbl_x",
+    )
+    create_source_table(
+        src_pl,
+        "CREATE TABLE {table_name} (c1 integer PRIMARY KEY, c2 bool, c3 varchar);",
+        "tbl_y",
+    )
+
+    # initialize replication for both source tables
+    slot_name = "example_slot"
+    init_replication(  # requires the Postgres user to have the REPLICATION attribute assigned
+        slot_name=slot_name,
+        schema=src_pl.dataset_name,
+        table_names=("tbl_x", "tbl_y"),
+        reset=True,
+    )
+
+    # create a replication source that generates items for each change in the two tables
+    changes = replication_source(
+        slot_name=slot_name,
+        schema=src_pl.dataset_name,
+        table_names=("tbl_x", "tbl_y"),
+        repl_options={
+            "tbl_x": {"included_columns": {"c1", "c2"}}
+        },  # columns not specified here are excluded from generated data items
+    )
+
+    # insert records in source tables and propagate changes to destination
+    change_source_table(
+        src_pl, "INSERT INTO {table_name} VALUES (1, true, 'foo');", "tbl_x"
+    )
+    change_source_table(
+        src_pl, "INSERT INTO {table_name} VALUES (1, false, 'bar');", "tbl_y"
+    )
+    dest_pl.run(changes)
+
+    # show columns in schema for both tables
+    # column c3 is not in the schema for tbl_x because we did not include it
+    # tbl_y does have column c3 because we did not specify `included_columns` for this table, so all columns are included by default
+    print("tbl_x", ":", list(dest_pl.default_schema.get_table_columns("tbl_x").keys()))
+    print("tbl_y", ":", list(dest_pl.default_schema.get_table_columns("tbl_y").keys()))
+
+
+# define some helper methods to make examples
more readable + + +def get_postgres_pipeline() -> dlt.Pipeline: + """Returns a pipeline loading into `postgres` destination. + + Uses workaround to fix destination to `postgres`, so it does not get replaced + during `dlt init`. + """ + # this trick prevents dlt init command from replacing "destination" argument to "pipeline" + p_call = dlt.pipeline + pipe = p_call( + pipeline_name="source_pipeline", + destination=Destination.from_reference("postgres", credentials=PG_CREDS), + dataset_name="source_dataset", + dev_mode=True, + ) + return pipe + + +def create_source_table( + src_pl: dlt.Pipeline, sql: str, table_name: str = "my_source_table" +) -> None: + with src_pl.sql_client() as c: + try: + c.create_dataset() + except dlt.destinations.exceptions.DatabaseTerminalException: + pass + qual_name = c.make_qualified_table_name(table_name) + c.execute_sql(sql.format(table_name=qual_name)) + + +def change_source_table( + src_pl: dlt.Pipeline, sql: str, table_name: str = "my_source_table" +) -> None: + with src_pl.sql_client() as c: + qual_name = c.make_qualified_table_name(table_name) + c.execute_sql(sql.format(table_name=qual_name)) + + +def show_destination_table( + dest_pl: dlt.Pipeline, + table_name: str = "my_source_table", + column_names: str = "id, val", +) -> None: + with dest_pl.sql_client() as c: + dest_qual_name = c.make_qualified_table_name(table_name) + with c.execute_query(f"SELECT {column_names} FROM {dest_qual_name}") as curr: + print(table_name, ":\n", curr.df()) + + +if __name__ == "__main__": + replicate_single_table() + # replicate_with_initial_load() + # replicate_with_column_selection() diff --git a/tests/pg_legacy_replication/__init__.py b/tests/pg_legacy_replication/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py new file mode 100644 index 000000000..e078dcb6c --- /dev/null +++ b/tests/pg_legacy_replication/cases.py @@ -0,0 +1,1005 @@ +from base64 import b64encode +from enum import IntEnum +from typing import List, Tuple + +import pendulum +from dlt.common import Decimal +from dlt.common.schema import TColumnSchema, TTableSchema, TTableSchemaColumns +from dlt.common.typing import TDataItem + +TABLE_ROW_ALL_DATA_TYPES = { + "col1": 989127831, + "col2": 898912.821982, + "col3": True, + "col4": "2022-05-23T13:26:45.176451+00:00", + "col5": "string data \n \r \x8e 🦆", + "col6": Decimal("2323.34"), + "col7": b"binary data \n \r \x8e", + # "col8": 2**56 + 92093890840, # TODO: uncommment and make it work + "col9": { + "json": [1, 2, 3, "a"], + "link": ( + "?commen\ntU\nrn=urn%3Ali%3Acomment%3A%28acti\012 \6" + " \\vity%3A69'08444473\n\n551163392%2C6n \r \x8e9085" + ), + }, + "col10": "2023-02-27", + "col11": "13:26:45.176451", + "col1_null": None, + "col2_null": None, + "col3_null": None, + "col4_null": None, + "col5_null": None, + "col6_null": None, + "col7_null": None, + # "col8_null": None, + "col9_null": None, + "col10_null": None, + "col11_null": None, + "col1_precision": 22324, + "col4_precision": "2022-05-23T13:26:46.167231+00:00", + "col5_precision": "string data 2 \n \r \x8e 🦆", + "col6_precision": Decimal("2323.34"), + "col7_precision": b"binary data 2 \n \r \x8e", + "col11_precision": "13:26:45.176451", +} +TABLE_UPDATE: List[TColumnSchema] = [ + {"name": "col1", "data_type": "bigint", "nullable": False}, + {"name": "col2", "data_type": "double", "nullable": False}, + {"name": "col3", "data_type": "bool", "nullable": False}, + {"name": "col4", "data_type": "timestamp", 
"nullable": False}, + {"name": "col5", "data_type": "text", "nullable": False}, + {"name": "col6", "data_type": "decimal", "nullable": False}, + {"name": "col7", "data_type": "binary", "nullable": False}, + # {"name": "col8", "data_type": "wei", "nullable": False}, + {"name": "col9", "data_type": "json", "nullable": False, "variant": True}, + {"name": "col10", "data_type": "date", "nullable": False}, + {"name": "col11", "data_type": "time", "nullable": False}, + {"name": "col1_null", "data_type": "bigint", "nullable": True}, + {"name": "col2_null", "data_type": "double", "nullable": True}, + {"name": "col3_null", "data_type": "bool", "nullable": True}, + {"name": "col4_null", "data_type": "timestamp", "nullable": True}, + {"name": "col5_null", "data_type": "text", "nullable": True}, + {"name": "col6_null", "data_type": "decimal", "nullable": True}, + {"name": "col7_null", "data_type": "binary", "nullable": True}, + # {"name": "col8_null", "data_type": "wei", "nullable": True}, + {"name": "col9_null", "data_type": "json", "nullable": True, "variant": True}, + {"name": "col10_null", "data_type": "date", "nullable": True}, + {"name": "col11_null", "data_type": "time", "nullable": True}, + { + "name": "col1_precision", + "data_type": "bigint", + "precision": 16, + "nullable": False, + }, + { + "name": "col4_precision", + "data_type": "timestamp", + "precision": 6, + "nullable": False, + }, + {"name": "col5_precision", "data_type": "text", "precision": 25, "nullable": False}, + { + "name": "col6_precision", + "data_type": "decimal", + "precision": 6, + "scale": 2, + "nullable": False, + }, + { + "name": "col7_precision", + "data_type": "binary", + "precision": 19, + "nullable": False, + }, + {"name": "col11_precision", "data_type": "time", "precision": 6, "nullable": False}, +] + +TABLE_UPDATE_COLUMNS_SCHEMA: TTableSchemaColumns = {t["name"]: t for t in TABLE_UPDATE} + +ROW_MESSAGES: List[dict] = [ + { + "transactionId": 969, + "commitTime": "1728662646949062", + "table": "src_pl_dataset_202410110404048747_staging.tbl_y", + "op": "INSERT", + "newTuple": [ + { + "columnName": "id_y", + "columnType": 20, + "datumInt64": 2, + }, + { + "columnName": "val_y", + "columnType": 16, + "datumBool": False, + }, + { + "columnName": '"primary"', + "columnType": 16, + "datumBool": True, + }, + { + "columnName": "_dlt_load_id", + "columnType": 1043, + "datumString": "1728662646.2657657", + }, + { + "columnName": "_dlt_id", + "columnType": 1043, + "datumString": "gGjifTMTAUs5ag", + }, + ], + "newTypeinfo": [ + { + "modifier": "bigint", + "valueOptional": False, + }, + { + "modifier": "boolean", + "valueOptional": True, + }, + { + "modifier": "boolean", + "valueOptional": True, + }, + { + "modifier": "character varying", + "valueOptional": False, + }, + { + "modifier": "character varying", + "valueOptional": False, + }, + ], + "oldTuple": [], + }, + { + "transactionId": 2018, + "commitTime": "1729503423666542", + "table": "src_pl_dataset_202410210936594956.items", + "op": "INSERT", + "newTuple": [ + { + "columnName": "col4", + "columnType": 1184, + "datumInt64": 1653312405176451, + }, + { + "columnName": "col9", + "columnType": 3802, + "datumString": ( + '{"link": "?commen\\ntU\\nrn=urn%3Ali%3Acomment%3A%28acti\\n \\u0006 \\\\vity%3A69\'08444473\\n\\n551163392' + '%2C6n \\r \x8e9085", "json": [1, 2, 3, "a"]}' + ), + }, + { + "columnName": "col10", + "columnType": 1082, + "datumInt32": 19415, + }, + { + "columnName": "col11", + "columnType": 1083, + "datumInt64": 48405176451, + }, + {"columnName": "col12", 
"columnType": 1114}, + {"columnName": "col13", "columnType": 700}, + {"columnName": "col14", "columnType": 1043, "datum_missing": True}, + ], + "newTypeinfo": [ + {"modifier": "timestamp with time zone", "valueOptional": False}, + {"modifier": "jsonb", "valueOptional": False}, + {"modifier": "date", "valueOptional": False}, + {"modifier": "time without time zone", "valueOptional": False}, + {"modifier": "timestamp without time zone", "valueOptional": True}, + {"modifier": "real", "valueOptional": True}, + {"modifier": "character varying", "valueOptional": True}, + ], + }, + { + "transactionId": 932, + "commitTime": "1729299383354856", + "table": "src_pl_dataset_202410191256122080.tbl_x", + "op": "DELETE", + "oldTuple": [ + { + "columnName": "id_x", + "columnType": 20, + "datumInt64": 1, + }, + { + "columnName": "val_x", + "columnType": 1043, + }, + { + "columnName": "col_bool", + "columnType": 16, + }, + { + "columnName": "col_bytea", + "columnType": 17, + }, + { + "columnName": "col_int4", + "columnType": 21, + }, + { + "columnName": "col_int", + "columnType": 23, + }, + { + "columnName": "col_real", + "columnType": 700, + }, + { + "columnName": "col_double", + "columnType": 701, + }, + { + "columnName": "col_date", + "columnType": 1082, + }, + { + "columnName": "col_time", + "columnType": 1083, + }, + { + "columnName": "col_ts", + "columnType": 1114, + }, + { + "columnName": "col_tstz", + "columnType": 1184, + }, + { + "columnName": "col_num", + "columnType": 1700, + }, + { + "columnName": "col_json", + "columnType": 3802, + }, + ], + }, + { + "transactionId": 754, + "commitTime": "1736873892023448", + "table": "src_pl_dataset_202501140458116348.data_types", + "op": "INSERT", + "newTuple": [ + {"columnName": "bit_col", "columnType": 1560, "datumString": "1"}, + { + "columnName": "box_col", + "columnType": 603, + "datumBytes": b64encode(b"(1,1),(0,0)").decode(), + }, + { + "columnName": "uuid_col", + "columnType": 2950, + "datumString": "6e1f5de1-1093-4bfe-98e4-62ac56b2db54", + }, + { + "columnName": "text_a", + "columnType": 1009, + "datumBytes": b64encode( + b'{"Network administration",GNS3,BGP}' + ).decode(), + }, + ], + "newTypeinfo": [ + { + "modifier": "bit(1)", + "valueOptional": True, + }, + { + "modifier": "box", + "valueOptional": True, + }, + { + "modifier": "uuid", + "valueOptional": True, + }, + { + "modifier": "text[]", + "valueOptional": True, + }, + ], + "oldTuple": [], + }, +] + +DATA_ITEMS: List[TDataItem] = [ + { + "id_y": 2, + "val_y": False, + "primary": True, + "_dlt_id": "gGjifTMTAUs5ag", + "_dlt_load_id": "1728662646.2657657", + "_pg_lsn": 1, + "_pg_commit_ts": pendulum.parse("2024-10-11T16:04:06.949062+00:00"), + "_pg_tx_id": 969, + }, + { + "col4": pendulum.parse("2022-05-23T13:26:45.176451+00:00"), + "col9": { + "json": [1, 2, 3, "a"], + "link": ( + "?commen\ntU\nrn=urn%3Ali%3Acomment%3A%28acti\012 \6" + " \\vity%3A69'08444473\n\n551163392%2C6n \r \x8e9085" + ), + }, + "col10": pendulum.parse("2023-02-27", strict=False).date(), + "col11": pendulum.parse("13:26:45.176451", strict=False).time(), + "col12": None, + "col13": None, + "col14": None, + "_pg_lsn": 1, + "_pg_commit_ts": pendulum.parse("2024-10-21T09:37:03.666542+00:00"), + "_pg_tx_id": 2018, + }, + { + "id_x": 1, + "val_x": "", + "col_bool": True, + "col_bytea": b" ", + "col_int4": 0, + "col_int": 0, + "col_real": 0.0, + "col_double": 0.0, + "col_time": pendulum.parse("00:00:00", strict=False).time(), + "col_date": pendulum.parse("1970-01-01", strict=False).date(), + "col_ts": 
pendulum.parse("1970-01-01T00:00:00+00:00"), + "col_tstz": pendulum.parse("1970-01-01T00:00:00+00:00"), + "col_num": Decimal(0), + "col_json": [0], + "_pg_lsn": 1, + "_pg_deleted_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), + "_pg_commit_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), + "_pg_tx_id": 932, + }, + { + "bit_col": "1", + "box_col": "KDEsMSksKDAsMCk=", + "uuid_col": "6e1f5de1-1093-4bfe-98e4-62ac56b2db54", + "text_a": ["Network administration", "GNS3", "BGP"], + "_pg_lsn": 1, + "_pg_commit_ts": pendulum.parse("2025-01-14T16:58:12.023448+00:00"), + "_pg_tx_id": 754, + }, +] + +TABLE_SCHEMAS: List[TTableSchema] = [ + { + "name": "tbl_y", + "columns": { + "id_y": { + "data_type": "bigint", + "name": "id_y", + "nullable": False, + "precision": 64, + }, + "val_y": {"data_type": "bool", "name": "val_y", "nullable": True}, + "primary": {"data_type": "bool", "name": "primary", "nullable": True}, + "_dlt_id": {"data_type": "text", "name": "_dlt_id", "nullable": False}, + "_dlt_load_id": { + "data_type": "text", + "name": "_dlt_load_id", + "nullable": False, + }, + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + "_pg_deleted_ts": { + "data_type": "timestamp", + "name": "_pg_deleted_ts", + "nullable": True, + }, + "_pg_commit_ts": { + "data_type": "timestamp", + "name": "_pg_commit_ts", + "nullable": True, + }, + "_pg_tx_id": { + "data_type": "bigint", + "name": "_pg_tx_id", + "nullable": True, + "precision": 32, + }, + }, + }, + { + "name": "items", + "columns": { + "col4": {"data_type": "timestamp", "name": "col4", "nullable": False}, + "col9": {"data_type": "json", "name": "col9", "nullable": False}, + "col10": {"data_type": "date", "name": "col10", "nullable": False}, + "col11": {"data_type": "time", "name": "col11", "nullable": False}, + "col12": {"data_type": "timestamp", "name": "col12", "nullable": True}, + "col13": {"data_type": "double", "name": "col13", "nullable": True}, + "col14": {"data_type": "text", "name": "col14", "nullable": True}, + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + "_pg_deleted_ts": { + "data_type": "timestamp", + "name": "_pg_deleted_ts", + "nullable": True, + }, + "_pg_commit_ts": { + "data_type": "timestamp", + "name": "_pg_commit_ts", + "nullable": True, + }, + "_pg_tx_id": { + "data_type": "bigint", + "name": "_pg_tx_id", + "nullable": True, + "precision": 32, + }, + }, + }, + { + "name": "tbl_x", + "columns": { + "id_x": {"data_type": "bigint", "name": "id_x", "precision": 64}, + "val_x": {"data_type": "text", "name": "val_x"}, + "col_bool": {"data_type": "bool", "name": "col_bool"}, + "col_bytea": {"data_type": "binary", "name": "col_bytea"}, + "col_int4": {"data_type": "bigint", "name": "col_int4", "precision": 16}, + "col_int": {"data_type": "bigint", "name": "col_int", "precision": 32}, + "col_real": {"data_type": "double", "name": "col_real"}, + "col_double": {"data_type": "double", "name": "col_double"}, + "col_date": {"data_type": "date", "name": "col_date"}, + "col_time": {"data_type": "time", "name": "col_time"}, + "col_ts": {"data_type": "timestamp", "name": "col_ts"}, + "col_tstz": {"data_type": "timestamp", "name": "col_tstz"}, + "col_num": {"data_type": "decimal", "name": "col_num"}, + "col_json": {"data_type": "json", "name": "col_json"}, + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + "_pg_deleted_ts": { + "data_type": "timestamp", + "name": "_pg_deleted_ts", + "nullable": True, + }, + "_pg_commit_ts": { + "data_type": "timestamp", + 
"name": "_pg_commit_ts", + "nullable": True, + }, + "_pg_tx_id": { + "data_type": "bigint", + "name": "_pg_tx_id", + "nullable": True, + "precision": 32, + }, + }, + }, + { + "name": "data_types", + "columns": { + "bit_col": {"data_type": "text", "name": "bit_col", "nullable": True}, + "box_col": {"data_type": "text", "name": "box_col", "nullable": True}, + "uuid_col": {"data_type": "text", "name": "uuid_col", "nullable": True}, + "text_a": {"data_type": "json", "name": "text_a", "nullable": True}, + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + "_pg_deleted_ts": { + "data_type": "timestamp", + "name": "_pg_deleted_ts", + "nullable": True, + }, + "_pg_commit_ts": { + "data_type": "timestamp", + "name": "_pg_commit_ts", + "nullable": True, + }, + "_pg_tx_id": { + "data_type": "bigint", + "name": "_pg_tx_id", + "nullable": True, + "precision": 32, + }, + }, + }, +] + + +class SchemaChoice(IntEnum): + first = 0 + second = 1 + error = -1 + + +SIMILAR_SCHEMAS: List[Tuple[TTableSchema, TTableSchema, SchemaChoice]] = [ + ( + { + "name": "items", + "columns": { + "col1": { + "name": "col1", + "data_type": "bigint", + "precision": 64, + "nullable": False, + }, + "col2": {"name": "col2", "data_type": "double", "nullable": False}, + "col3": {"name": "col3", "data_type": "bool", "nullable": False}, + "col4": {"name": "col4", "data_type": "timestamp", "nullable": False}, + "col5": {"name": "col5", "data_type": "text", "nullable": False}, + "col6": { + "name": "col6", + "data_type": "decimal", + "precision": 38, + "scale": 9, + "nullable": False, + }, + "col7": {"name": "col7", "data_type": "binary", "nullable": False}, + "col9": {"name": "col9", "data_type": "json", "nullable": False}, + "col10": {"name": "col10", "data_type": "date", "nullable": False}, + "col11": {"name": "col11", "data_type": "time", "nullable": False}, + "col1_null": { + "name": "col1_null", + "data_type": "bigint", + "precision": 64, + "nullable": True, + }, + "col2_null": { + "name": "col2_null", + "data_type": "double", + "nullable": True, + }, + "col3_null": { + "name": "col3_null", + "data_type": "bool", + "nullable": True, + }, + "col4_null": { + "name": "col4_null", + "data_type": "timestamp", + "nullable": True, + }, + "col5_null": { + "name": "col5_null", + "data_type": "text", + "nullable": True, + }, + "col6_null": { + "name": "col6_null", + "data_type": "decimal", + "precision": 38, + "scale": 9, + "nullable": True, + }, + "col7_null": { + "name": "col7_null", + "data_type": "binary", + "nullable": True, + }, + "col9_null": { + "name": "col9_null", + "data_type": "json", + "nullable": True, + }, + "col10_null": { + "name": "col10_null", + "data_type": "date", + "nullable": True, + }, + "col11_null": { + "name": "col11_null", + "data_type": "time", + "nullable": True, + }, + "col1_precision": { + "name": "col1_precision", + "data_type": "bigint", + "precision": 16, + "nullable": False, + }, + "col4_precision": { + "name": "col4_precision", + "data_type": "timestamp", + "precision": 3, + "nullable": False, + }, + "col5_precision": { + "name": "col5_precision", + "data_type": "text", + "precision": 25, + "nullable": False, + }, + "col6_precision": { + "name": "col6_precision", + "data_type": "decimal", + "precision": 6, + "scale": 2, + "nullable": False, + }, + "col7_precision": { + "name": "col7_precision", + "data_type": "binary", + "nullable": False, + }, + "col11_precision": { + "name": "col11_precision", + "data_type": "time", + "precision": 3, + "nullable": False, + }, + "_dlt_load_id": { 
+ "name": "_dlt_load_id", + "data_type": "text", + "nullable": False, + }, + "_dlt_id": {"name": "_dlt_id", "data_type": "text", "nullable": False}, + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + "_pg_deleted_ts": { + "data_type": "timestamp", + "name": "_pg_deleted_ts", + "nullable": True, + }, + }, + }, + { + "name": "items", + "columns": { + "col1": { + "name": "col1", + "data_type": "bigint", + "precision": 64, + "nullable": False, + }, + "col2": {"name": "col2", "data_type": "double"}, + "col3": {"name": "col3", "data_type": "bool"}, + "col4": {"name": "col4", "data_type": "timestamp"}, + "col5": {"name": "col5", "data_type": "text"}, + "col6": {"name": "col6", "data_type": "decimal"}, + "col7": {"name": "col7", "data_type": "binary"}, + "col9": {"name": "col9", "data_type": "json"}, + "col10": {"name": "col10", "data_type": "date"}, + "col11": {"name": "col11", "data_type": "time"}, + "col1_null": { + "name": "col1_null", + "data_type": "bigint", + "precision": 64, + }, + "col2_null": {"name": "col2_null", "data_type": "double"}, + "col3_null": {"name": "col3_null", "data_type": "bool"}, + "col4_null": {"name": "col4_null", "data_type": "timestamp"}, + "col5_null": {"name": "col5_null", "data_type": "text"}, + "col6_null": {"name": "col6_null", "data_type": "decimal"}, + "col7_null": {"name": "col7_null", "data_type": "binary"}, + "col9_null": {"name": "col9_null", "data_type": "json"}, + "col10_null": {"name": "col10_null", "data_type": "date"}, + "col11_null": {"name": "col11_null", "data_type": "time"}, + "col1_precision": { + "name": "col1_precision", + "data_type": "bigint", + "precision": 16, + }, + "col4_precision": {"name": "col4_precision", "data_type": "timestamp"}, + "col5_precision": {"name": "col5_precision", "data_type": "text"}, + "col6_precision": {"name": "col6_precision", "data_type": "decimal"}, + "col7_precision": {"name": "col7_precision", "data_type": "binary"}, + "col11_precision": {"name": "col11_precision", "data_type": "time"}, + "_dlt_load_id": {"name": "_dlt_load_id", "data_type": "text"}, + "_dlt_id": {"name": "_dlt_id", "data_type": "text"}, + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + "_pg_deleted_ts": { + "data_type": "timestamp", + "name": "_pg_deleted_ts", + "nullable": True, + }, + }, + }, + SchemaChoice.first, + ), + ( + { + "name": "items", + "columns": { + "_dlt_id": {"data_type": "text", "name": "_dlt_id", "nullable": False}, + "_dlt_load_id": { + "data_type": "text", + "name": "_dlt_load_id", + "nullable": False, + }, + "c1": { + "data_type": "bigint", + "name": "c1", + "nullable": True, + "precision": 64, + }, + "c2": { + "data_type": "bigint", + "name": "c2", + "nullable": True, + "precision": 64, + }, + "c3": { + "data_type": "bigint", + "name": "c3", + "nullable": True, + "precision": 64, + }, + "_pg_deleted_ts": { + "data_type": "timestamp", + "name": "_pg_deleted_ts", + "nullable": True, + }, + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + }, + }, + { + "name": "items", + "columns": { + "_dlt_id": {"data_type": "text", "name": "_dlt_id", "nullable": False}, + "_dlt_load_id": { + "data_type": "text", + "name": "_dlt_load_id", + "nullable": False, + }, + "c1": { + "data_type": "bigint", + "name": "c1", + "nullable": True, + "precision": 64, + }, + "c2": { + "data_type": "bigint", + "name": "c2", + "nullable": True, + "precision": 64, + }, + "c3": { + "data_type": "bigint", + "name": "c3", + "nullable": True, + "precision": 64, + }, + # Added c4 column 
+ "c4": { + "data_type": "bigint", + "name": "c4", + "nullable": True, + "precision": 64, + }, + "_pg_deleted_ts": { + "data_type": "timestamp", + "name": "_pg_deleted_ts", + "nullable": True, + }, + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + }, + }, + SchemaChoice.error, + ), + ( + { + "name": "scale_teams", + "columns": { + "id": { + "name": "id", + "nullable": False, + "data_type": "bigint", + "precision": 32, + }, + "user_id": { + "name": "user_id", + "nullable": False, + "data_type": "bigint", + "precision": 32, + }, + "begin_at": { + "name": "begin_at", + "nullable": False, + "data_type": "timestamp", + "precision": 6, + }, + "created_at": { + "name": "created_at", + "nullable": False, + "data_type": "timestamp", + "precision": 6, + }, + "updated_at": { + "name": "updated_at", + "nullable": False, + "data_type": "timestamp", + "precision": 6, + }, + "scale_id": { + "name": "scale_id", + "nullable": False, + "data_type": "bigint", + "precision": 32, + }, + "team_id": { + "name": "team_id", + "nullable": False, + "data_type": "bigint", + "precision": 32, + }, + "comment": {"name": "comment", "nullable": True, "data_type": "text"}, + "old_feedback": { + "name": "old_feedback", + "nullable": True, + "data_type": "text", + }, + "feedback_rating": { + "name": "feedback_rating", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "final_mark": { + "name": "final_mark", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "truant_id": { + "name": "truant_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "flag_id": { + "name": "flag_id", + "nullable": False, + "data_type": "bigint", + "precision": 32, + }, + "token": {"name": "token", "nullable": True, "data_type": "text"}, + "ip": {"name": "ip", "nullable": True, "data_type": "text"}, + "internship_id": { + "name": "internship_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "filled_at": { + "name": "filled_at", + "nullable": True, + "data_type": "timestamp", + "precision": 6, + }, + "_pg_lsn": {"name": "_pg_lsn", "nullable": True, "data_type": "bigint"}, + "_pg_deleted_ts": { + "name": "_pg_deleted_ts", + "nullable": True, + "data_type": "timestamp", + "precision": 6, + }, + "_pg_commit_ts": { + "name": "_pg_commit_ts", + "nullable": True, + "data_type": "timestamp", + "precision": 6, + }, + "_pg_tx_id": { + "name": "_pg_tx_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "_dlt_load_id": { + "name": "_dlt_load_id", + "data_type": "text", + "nullable": False, + }, + }, + }, + { + "name": "scale_teams", + "columns": { + "id": { + "name": "id", + "nullable": False, + "data_type": "bigint", + "precision": 32, + }, + "user_id": { + "name": "user_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "begin_at": { + "name": "begin_at", + "nullable": True, + "data_type": "timestamp", + "precision": 6, + }, + "created_at": { + "name": "created_at", + "nullable": False, + "data_type": "timestamp", + "precision": 6, + }, + "updated_at": { + "name": "updated_at", + "nullable": False, + "data_type": "timestamp", + "precision": 6, + }, + "scale_id": { + "name": "scale_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "team_id": { + "name": "team_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "comment": {"name": "comment", "nullable": True, "data_type": "text"}, + "old_feedback": { + "name": "old_feedback", + "nullable": True, + 
"data_type": "text", + }, + "feedback_rating": { + "name": "feedback_rating", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "final_mark": { + "name": "final_mark", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "truant_id": { + "name": "truant_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "flag_id": { + "name": "flag_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "token": {"name": "token", "nullable": True, "data_type": "text"}, + "ip": {"name": "ip", "nullable": True, "data_type": "text"}, + "internship_id": { + "name": "internship_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "filled_at": { + "name": "filled_at", + "nullable": True, + "data_type": "timestamp", + "precision": 6, + }, + "_pg_lsn": {"name": "_pg_lsn", "nullable": True, "data_type": "bigint"}, + "_pg_deleted_ts": { + "name": "_pg_deleted_ts", + "nullable": True, + "data_type": "timestamp", + "precision": 6, + }, + "_pg_commit_ts": { + "name": "_pg_commit_ts", + "nullable": True, + "data_type": "timestamp", + "precision": 6, + }, + "_pg_tx_id": { + "name": "_pg_tx_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "_dlt_load_id": { + "name": "_dlt_load_id", + "data_type": "text", + "nullable": False, + }, + }, + }, + SchemaChoice.second, + ), +] diff --git a/tests/pg_legacy_replication/conftest.py b/tests/pg_legacy_replication/conftest.py new file mode 100644 index 000000000..dcd1a0f16 --- /dev/null +++ b/tests/pg_legacy_replication/conftest.py @@ -0,0 +1,43 @@ +import faulthandler +import pytest + +from typing import Iterator, Tuple + +import dlt +from dlt.common.utils import uniq_id + + +def pytest_configure(): + faulthandler.enable() + + +@pytest.fixture() +def src_config() -> Iterator[Tuple[dlt.Pipeline, str]]: + # random slot to enable parallel runs + slot = "test_slot_" + uniq_id(4) + # setup + src_pl = dlt.pipeline( + pipeline_name="src_pl", + destination=dlt.destinations.postgres( + credentials=dlt.secrets.get("sources.pg_replication.credentials") + ), + dev_mode=True, + ) + yield src_pl, slot + # teardown + with src_pl.sql_client() as c: + # drop tables + try: + c.drop_dataset() + except Exception as e: + print(e) + with c.with_staging_dataset(): + try: + c.drop_dataset() + except Exception as e: + print(e) + # drop replication slot + try: + c.execute_sql(f"SELECT pg_drop_replication_slot('{slot}');") + except Exception as e: + print(e) diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py new file mode 100644 index 000000000..b93d2486d --- /dev/null +++ b/tests/pg_legacy_replication/test_helpers.py @@ -0,0 +1,64 @@ +import pytest +from dlt.common.schema.typing import TTableSchema +from dlt.common.typing import TDataItem +from google.protobuf.json_format import ParseDict as parse_dict +from sources.pg_legacy_replication.helpers import ( + compare_schemas, + gen_data_item, + infer_table_schema, +) +from sources.pg_legacy_replication.pg_logicaldec_pb2 import RowMessage +from .cases import ( + DATA_ITEMS, + ROW_MESSAGES, + SIMILAR_SCHEMAS, + TABLE_SCHEMAS, + SchemaChoice, +) + + +@pytest.mark.parametrize("data, expected_schema", zip(ROW_MESSAGES, TABLE_SCHEMAS)) +def test_infer_table_schema( + data, + expected_schema: TTableSchema, +): + row_msg = RowMessage() + parse_dict(data, row_msg) + assert ( + infer_table_schema( + row_msg, + include_commit_ts=True, + include_tx_id=True, + ) + == expected_schema + ) + + 
+@pytest.mark.parametrize("data, data_item", zip(ROW_MESSAGES, DATA_ITEMS)) +def test_gen_data_item(data, data_item: TDataItem): + row_msg = RowMessage() + parse_dict(data, row_msg) + column_schema = infer_table_schema(row_msg)["columns"] + assert ( + gen_data_item( + row_msg, + column_schema, + lsn=1, + include_commit_ts=True, + include_tx_id=True, + ) + == data_item + ) + + +@pytest.mark.parametrize("s1, s2, choice", SIMILAR_SCHEMAS) +def test_compare_schemas(s1: TTableSchema, s2: TTableSchema, choice: SchemaChoice): + if choice == SchemaChoice.error: + with pytest.raises(AssertionError): + compare_schemas(s1, s2) + with pytest.raises(AssertionError): + compare_schemas(s2, s1) + else: + expected_schema = (s1, s2)[choice] + assert compare_schemas(s1, s2) == expected_schema + assert compare_schemas(s2, s1) == expected_schema diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py new file mode 100644 index 000000000..ca103dd1c --- /dev/null +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -0,0 +1,746 @@ +from copy import deepcopy +from typing import Dict, Tuple + +import dlt +import pytest +from dlt.common.schema.typing import TTableSchemaColumns +from dlt.destinations.job_client_impl import SqlJobClientBase + +from sources.pg_legacy_replication import ( + init_replication, + cleanup_snapshot_resources, + replication_source, + ReplicationOptions, +) +from sources.pg_legacy_replication.helpers import SqlTableOptions, TableBackend +from tests.utils import ( + ALL_DESTINATIONS, + assert_load_info, + load_table_counts, +) +from .cases import TABLE_ROW_ALL_DATA_TYPES, TABLE_UPDATE_COLUMNS_SCHEMA +from .utils import add_pk, assert_loaded_data + +merge_hints: TTableSchemaColumns = { + "_pg_deleted_ts": {"hard_delete": True}, + "_pg_lsn": {"dedup_sort": "desc"}, +} + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) +def test_core_functionality( + src_config: Tuple[dlt.Pipeline, str], destination_name: str, backend: TableBackend +) -> None: + @dlt.resource(write_disposition="merge", primary_key="id_x") + def tbl_x(data): + yield data + + @dlt.resource(write_disposition="merge", primary_key="id_y") + def tbl_y(data): + yield data + + src_pl, slot_name = src_config + + src_pl.run( + [ + tbl_x({"id_x": 1, "val_x": "foo"}), + tbl_y({"id_y": 1, "val_y": True}), + ] + ) + add_pk(src_pl.sql_client, "tbl_x", "id_x") + add_pk(src_pl.sql_client, "tbl_y", "id_y") + + snapshots = init_replication( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y"), + take_snapshots=True, + table_options={ + "tbl_x": {"backend": backend}, + "tbl_y": {"backend": backend}, + }, + ) + + changes = replication_source( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y"), + repl_options={ + "tbl_x": {"backend": backend}, + "tbl_y": {"backend": backend}, + }, + ) + changes.tbl_x.apply_hints( + write_disposition="merge", primary_key="id_x", columns=merge_hints + ) + changes.tbl_y.apply_hints( + write_disposition="merge", primary_key="id_y", columns=merge_hints + ) + + src_pl.run( + [ + tbl_x([{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}]), + tbl_y({"id_y": 2, "val_y": False}), + ] + ) + + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, dev_mode=True + ) + + # initial load + info = dest_pl.run(snapshots) + cleanup_snapshot_resources(snapshots) + assert_load_info(info) + 
assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 1, "tbl_y": 1} + exp_tbl_x = [{"id_x": 1, "val_x": "foo"}] + exp_tbl_y = [{"id_y": 1, "val_y": True}] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + # process changes + info = dest_pl.run(changes) + assert_load_info(info, expected_load_packages=2) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 2} + exp_tbl_x = [ + {"id_x": 1, "val_x": "foo"}, + {"id_x": 2, "val_x": "bar"}, + {"id_x": 3, "val_x": "baz"}, + ] + exp_tbl_y = [{"id_y": 1, "val_y": True}, {"id_y": 2, "val_y": False}] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + # change single table + src_pl.run(tbl_y({"id_y": 3, "val_y": True})) + + # process changes + info = dest_pl.run(changes) + assert_load_info(info, expected_load_packages=2) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} + exp_tbl_y = [ + {"id_y": 1, "val_y": True}, + {"id_y": 2, "val_y": False}, + {"id_y": 3, "val_y": True}, + ] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + # update tables + with src_pl.sql_client() as c: + qual_name = src_pl.sql_client().make_qualified_table_name("tbl_x") + c.execute_sql(f"UPDATE {qual_name} SET val_x = 'foo_updated' WHERE id_x = 1;") + qual_name = src_pl.sql_client().make_qualified_table_name("tbl_y") + c.execute_sql(f"UPDATE {qual_name} SET val_y = false WHERE id_y = 1;") + + # process changes + info = dest_pl.run(changes) + assert_load_info(info, expected_load_packages=2) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} + exp_tbl_x = [ + {"id_x": 1, "val_x": "foo_updated"}, + {"id_x": 2, "val_x": "bar"}, + {"id_x": 3, "val_x": "baz"}, + ] + exp_tbl_y = [ + {"id_y": 1, "val_y": False}, + {"id_y": 2, "val_y": False}, + {"id_y": 3, "val_y": True}, + ] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + # delete from table + with src_pl.sql_client() as c: + qual_name = src_pl.sql_client().make_qualified_table_name("tbl_x") + c.execute_sql(f"DELETE FROM {qual_name} WHERE id_x = 1;") + + # process changes + info = dest_pl.run(changes) + assert_load_info(info, expected_load_packages=2) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 2, "tbl_y": 3} + exp_tbl_x = [{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}] + exp_tbl_y = [ + {"id_y": 1, "val_y": False}, + {"id_y": 2, "val_y": False}, + {"id_y": 3, "val_y": True}, + ] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) +def test_without_init_load( + src_config: Tuple[dlt.Pipeline, str], destination_name: str, backend: TableBackend +) -> None: + @dlt.resource(write_disposition="merge", primary_key="id_x") + def tbl_x(data): + yield data + + @dlt.resource(write_disposition="merge", primary_key="id_y") + def tbl_y(data): + yield data + + src_pl, slot_name = src_config + + # create postgres table + # since we're skipping initial load, these 
records should not be in the replicated table + src_pl.run( + [ + tbl_x({"id_x": 1, "val_x": "foo"}), + tbl_y({"id_y": 1, "val_y": True}), + ] + ) + add_pk(src_pl.sql_client, "tbl_x", "id_x") + add_pk(src_pl.sql_client, "tbl_y", "id_y") + + # initialize replication and create resource for changes + init_replication( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y"), + ) + + changes = replication_source( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y"), + repl_options={ + "tbl_x": {"backend": backend}, + "tbl_y": {"backend": backend}, + }, + ) + changes.tbl_x.apply_hints( + write_disposition="merge", primary_key="id_x", columns=merge_hints + ) + changes.tbl_y.apply_hints( + write_disposition="merge", primary_key="id_y", columns=merge_hints + ) + + # change postgres table after replication has been initialized + # these records should be in the replicated table + src_pl.run( + [ + tbl_x([{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}]), + tbl_y({"id_y": 2, "val_y": False}), + ] + ) + + # load changes to destination and assert expectations + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, dev_mode=True + ) + info = dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 2, "tbl_y": 1} + exp_tbl_x = [{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}] + exp_tbl_y = [{"id_y": 2, "val_y": False}] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + # delete from table + with src_pl.sql_client() as c: + qual_name = src_pl.sql_client().make_qualified_table_name("tbl_x") + c.execute_sql(f"DELETE FROM {qual_name} WHERE id_x = 2;") + + # process change and assert expectations + info = dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 1, "tbl_y": 1} + exp_tbl_x = [{"id_x": 3, "val_x": "baz"}] + exp_tbl_y = [{"id_y": 2, "val_y": False}] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +@pytest.mark.parametrize("give_hints", [True, False]) +@pytest.mark.parametrize("init_load", [True, False]) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) +def test_mapped_data_types( + src_config: Tuple[dlt.Pipeline, str], + destination_name: str, + give_hints: bool, + init_load: bool, + backend: TableBackend, +) -> None: + """Assert common data types (the ones mapped in PostgresTypeMapper) are properly handled.""" + + data = deepcopy(TABLE_ROW_ALL_DATA_TYPES) + column_schema = deepcopy(TABLE_UPDATE_COLUMNS_SCHEMA) + + # FIXME Need to figure out why when creating a snapshot my schema get loaded in another job + expected_load_packages = 1 + if init_load: + expected_load_packages = 2 + + # resource to load data into postgres source table + @dlt.resource(primary_key="col1", write_disposition="merge", columns=column_schema) + def items(data): + yield data + + src_pl, slot_name = src_config + + # create postgres table with single record containing all data types + src_pl.run(items(data)) + add_pk(src_pl.sql_client, "items", "col1") + + # initialize replication and create resources + snapshot = init_replication( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names="items", + 
take_snapshots=init_load, + table_options={"items": {"backend": backend}}, + ) + if init_load and give_hints: + snapshot.items.apply_hints(columns=column_schema) + + changes = replication_source( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names="items", + repl_options={"items": {"backend": backend}}, + ) + changes.items.apply_hints( + write_disposition="merge", primary_key="col1", columns=merge_hints + ) + if give_hints: + changes.items.apply_hints(columns=column_schema) + + # initial load + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, dev_mode=True + ) + if init_load: + info = dest_pl.run(snapshot) + cleanup_snapshot_resources(snapshot) + assert_load_info(info) + assert load_table_counts(dest_pl, "items")["items"] == 1 + + # insert two records in postgres table + r1 = deepcopy(data) + r2 = deepcopy(data) + r1["col1"] = 1 + r2["col1"] = 2 + src_pl.run(items([r1, r2])) + + info = dest_pl.run(changes) + assert_load_info(info, expected_load_packages=expected_load_packages) + assert load_table_counts(dest_pl, "items")["items"] == (3 if init_load else 2) + + if give_hints: + # compare observed with expected column types + observed = dest_pl.default_schema.get_table("items")["columns"] + for name, expected in column_schema.items(): + assert observed[name]["data_type"] == expected["data_type"] + # postgres bytea does not have precision + if ( + expected.get("precision") is not None + and expected["data_type"] != "binary" + ): + assert observed[name]["precision"] == expected["precision"] + + # update two records in postgres table + # this does two deletes and two inserts because dlt implements "merge" as "delete-and-insert" + # as such, postgres will create four replication messages: two of type Delete and two of type Insert + r1["col2"] = 1.5 + r2["col3"] = False + src_pl.run(items([r1, r2])) + + # process changes and assert expectations + info = dest_pl.run(changes) + assert_load_info(info, expected_load_packages=expected_load_packages) + assert load_table_counts(dest_pl, "items")["items"] == (3 if init_load else 2) + exp = [ + {"col1": 1, "col2": 1.5, "col3": True}, + {"col1": 2, "col2": 898912.821982, "col3": False}, + { + "col1": 989127831, + "col2": 898912.821982, + "col3": True, + }, # only present with init load + ] + if not init_load: + del exp[-1] + assert_loaded_data(dest_pl, "items", ["col1", "col2", "col3"], exp, "col1") + + # now do an actual update, so postgres will create a replication message of type Update + with src_pl.sql_client() as c: + qual_name = src_pl.sql_client().make_qualified_table_name("items") + c.execute_sql(f"UPDATE {qual_name} SET col2 = 2.5 WHERE col1 = 2;") + + # process change and assert expectation + info = dest_pl.run(changes) + assert_load_info(info, expected_load_packages=expected_load_packages) + assert load_table_counts(dest_pl, "items")["items"] == (3 if init_load else 2) + exp = [{"col1": 2, "col2": 2.5, "col3": False}] + assert_loaded_data( + dest_pl, "items", ["col1", "col2", "col3"], exp, "col1", "col1 = 2" + ) + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) +def test_unmapped_data_types( + src_config: Tuple[dlt.Pipeline, str], destination_name: str, backend: TableBackend +) -> None: + """Assert postgres data types that aren't explicitly mapped default to "text" type.""" + src_pl, slot_name = src_config + + # create postgres table with some unmapped types + with src_pl.sql_client() as c: + c.create_dataset() + 
c.execute_sql( + "CREATE TABLE data_types (bit_col bit(1), box_col box, uuid_col uuid);" + ) + + # initialize replication and create resource + init_replication( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names="data_types", + ) + changes = replication_source( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names="data_types", + repl_options={"data_types": {"backend": backend}}, + ) + + # insert record in source table to create replication item + with src_pl.sql_client() as c: + c.execute_sql( + "INSERT INTO data_types VALUES (B'1', box '((1,1), (0,0))', gen_random_uuid());" + ) + + # run destination pipeline and assert resulting data types + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, dev_mode=True + ) + dest_pl.extract(changes) + dest_pl.normalize() + columns = dest_pl.default_schema.get_table_columns("data_types") + assert columns["bit_col"]["data_type"] == "text" + assert columns["box_col"]["data_type"] == "text" + assert columns["uuid_col"]["data_type"] == "text" + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +@pytest.mark.parametrize("init_load", [True, False]) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) +def test_included_columns( + src_config: Tuple[dlt.Pipeline, str], + destination_name: str, + init_load: bool, + backend: TableBackend, +) -> None: + def get_cols(pipeline: dlt.Pipeline, table_name: str) -> set: + with pipeline.destination_client(pipeline.default_schema_name) as client: + assert isinstance(client, SqlJobClientBase) + return { + k + for k in client.get_storage_table(table_name)[1].keys() + if not k.startswith("_dlt_") + } + + @dlt.resource + def tbl_x(data): + yield data + + @dlt.resource + def tbl_y(data): + yield data + + @dlt.resource + def tbl_z(data): + yield data + + src_pl, slot_name = src_config + + # create three postgres tables + src_pl.run( + [ + tbl_x({"id_x": 1, "val_x": "foo", "another_col_x": 1}), + tbl_y({"id_y": 1, "val_y": "foo", "another_col_y": 1}), + tbl_z({"id_z": 1, "val_z": "foo", "another_col_z": 1}), + ] + ) + + # initialize replication and create resources + table_options = { + "tbl_x": {"backend": backend, "included_columns": {"id_x", "val_x"}}, + "tbl_y": {"backend": backend, "included_columns": {"id_y", "val_y"}}, + "tbl_z": {"backend": backend}, + # tbl_z is not specified, hence all columns should be included + } + snapshots = init_replication( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y", "tbl_z"), + take_snapshots=init_load, + table_options=table_options, + ) + changes = replication_source( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y", "tbl_z"), + repl_options=table_options, + ) + + # update three postgres tables + src_pl.run( + [ + tbl_x({"id_x": 2, "val_x": "foo", "another_col_x": 1}), + tbl_y({"id_y": 2, "val_y": "foo", "another_col_y": 1}), + tbl_z({"id_z": 2, "val_z": "foo", "another_col_z": 1}), + ] + ) + + # load to destination and assert column expectations + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, dev_mode=True + ) + if init_load: + dest_pl.run(snapshots) + cleanup_snapshot_resources(snapshots) + assert get_cols(dest_pl, "tbl_x") == {"id_x", "val_x"} + assert get_cols(dest_pl, "tbl_y") == {"id_y", "val_y"} + assert get_cols(dest_pl, "tbl_z") == {"id_z", "val_z", "another_col_z"} + + dest_pl.run(changes) + assert get_cols(dest_pl, "tbl_x") == {"id_x", "val_x", "_pg_lsn", "_pg_deleted_ts"} + assert 
get_cols(dest_pl, "tbl_y") == {"id_y", "val_y", "_pg_lsn", "_pg_deleted_ts"} + assert get_cols(dest_pl, "tbl_z") == { + "id_z", + "val_z", + "another_col_z", + "_pg_lsn", + "_pg_deleted_ts", + } + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +@pytest.mark.parametrize("init_load", [True, False]) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) +def test_column_hints( + src_config: Tuple[dlt.Pipeline, str], + destination_name: str, + init_load: bool, + backend: TableBackend, +) -> None: + @dlt.resource + def tbl_x(data): + yield data + + @dlt.resource + def tbl_y(data): + yield data + + @dlt.resource + def tbl_z(data): + yield data + + src_pl, slot_name = src_config + + # create three postgres tables + src_pl.run( + [ + tbl_x({"id_x": 1, "val_x": "foo", "another_col_x": 1}), + tbl_y({"id_y": 1, "val_y": "foo", "another_col_y": 1}), + tbl_z({"id_z": 1, "val_z": "foo", "another_col_z": 1}), + ] + ) + + # initialize replication and create resources + snapshots = init_replication( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y", "tbl_z"), + take_snapshots=init_load, + table_options={ + "tbl_x": {"backend": backend}, + "tbl_y": {"backend": backend}, + "tbl_z": {"backend": backend}, + }, + ) + if init_load: + snapshots.tbl_x.apply_hints(columns={"another_col_x": {"data_type": "double"}}) + snapshots.tbl_y.apply_hints(columns={"another_col_y": {"precision": 32}}) + + changes = replication_source( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y", "tbl_z"), + repl_options={ + "tbl_x": { + "backend": backend, + "column_hints": {"another_col_x": {"data_type": "double"}}, + }, + "tbl_y": { + "backend": backend, + "column_hints": {"another_col_y": {"precision": 32}}, + }, + "tbl_z": {"backend": backend}, + }, + ) + + # update three postgres tables + src_pl.run( + [ + tbl_x({"id_x": 2, "val_x": "foo", "another_col_x": 1}), + tbl_y({"id_y": 2, "val_y": "foo", "another_col_y": 1}), + tbl_z({"id_z": 2, "val_z": "foo", "another_col_z": 1}), + ] + ) + + # load to destination and assert column expectations + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, dev_mode=True + ) + if init_load: + dest_pl.run(snapshots) + cleanup_snapshot_resources(snapshots) + assert ( + dest_pl.default_schema.get_table_columns("tbl_x")["another_col_x"][ + "data_type" + ] + == "double" + ) + assert ( + dest_pl.default_schema.get_table_columns("tbl_y")["another_col_y"][ + "precision" + ] + == 32 + ) + assert ( + dest_pl.default_schema.get_table_columns("tbl_z")["another_col_z"][ + "data_type" + ] + == "bigint" + ) + dest_pl.run(changes) + assert ( + dest_pl.default_schema.get_table_columns("tbl_x")["another_col_x"]["data_type"] + == "double" + ) + assert ( + dest_pl.default_schema.get_table_columns("tbl_y")["another_col_y"]["precision"] + == 32 + ) + assert ( + dest_pl.default_schema.get_table_columns("tbl_z")["another_col_z"]["data_type"] + == "bigint" + ) + + # the tests below should pass, but they don't because of a bug that causes + # column hints to be added to other tables when dispatching to multiple tables + assert "another_col_x" not in dest_pl.default_schema.get_table_columns("tbl_y") + assert "another_col_x" not in dest_pl.default_schema.get_table_columns("tbl_z") + assert "another_col_y" not in dest_pl.default_schema.get_table_columns( + "tbl_x", include_incomplete=True + ) + assert "another_col_y" not in dest_pl.default_schema.get_table_columns( + "tbl_z", include_incomplete=True + 
) + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) +def test_table_schema_change( + src_config: Tuple[dlt.Pipeline, str], destination_name: str, backend: TableBackend +) -> None: + src_pl, slot_name = src_config + + # create postgres table + src_pl.run([{"c1": 1, "c2": 1}], table_name="items") + + # initialize replication + init_replication( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names="items", + ) + + # create resource and pipeline + changes = replication_source( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names="items", + repl_options={"items": {"backend": backend}}, + ) + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, dev_mode=True + ) + + # add a column in one commit, this will create one Relation message + src_pl.run([{"c1": 2, "c2": 1}, {"c1": 3, "c2": 1, "c3": 1}], table_name="items") + info = dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "items") == {"items": 2} + exp = [{"c1": 2, "c2": 1, "c3": None}, {"c1": 3, "c2": 1, "c3": 1}] + assert_loaded_data(dest_pl, "items", ["c1", "c2", "c3"], exp, "c1") + + # add a column in two commits, this will create two Relation messages + src_pl.run([{"c1": 4, "c2": 1, "c3": 1}], table_name="items") + src_pl.run([{"c1": 5, "c2": 1, "c3": 1, "c4": 1}], table_name="items") + info = dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "items") == {"items": 4} + exp = [ + {"c1": 4, "c2": 1, "c3": 1, "c4": None}, + {"c1": 5, "c2": 1, "c3": 1, "c4": 1}, + ] + assert_loaded_data( + dest_pl, "items", ["c1", "c2", "c3", "c4"], exp, "c1", "c1 IN (4, 5)" + ) + + +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) +def test_batching(src_config: Tuple[dlt.Pipeline, str], backend: TableBackend) -> None: + # this test asserts the number of data items yielded by the replication resource + # is not affected by `target_batch_size` and the number of replication messages per transaction + src_pl, slot_name = src_config + + # create postgres table with single record + data = {"id": 1000, "val": True} + src_pl.run([data], table_name="items") + + # initialize replication and create resource for changes + init_replication( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names="items", + ) + changes = replication_source( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names="items", + target_batch_size=50, + repl_options={"items": {"backend": backend}}, + ) + + # create destination pipeline and resource + dest_pl = dlt.pipeline(pipeline_name="dest_pl", dev_mode=True) + + # insert 100 records into source table in one transaction + batch = [{**r, **{"id": key}} for r in [data] for key in range(1, 101)] + src_pl.run(batch, table_name="items") + extract_info = dest_pl.extract(changes) + assert extract_info.asdict()["job_metrics"][0]["items_count"] == 100 + + # insert 100 records into source table in 5 transactions + batch = [{**r, **{"id": key}} for r in [data] for key in range(101, 121)] + src_pl.run(batch, table_name="items") + batch = [{**r, **{"id": key}} for r in [data] for key in range(121, 141)] + src_pl.run(batch, table_name="items") + batch = [{**r, **{"id": key}} for r in [data] for key in range(141, 161)] + src_pl.run(batch, table_name="items") + batch = [{**r, **{"id": key}} for r in [data] for key in range(161, 181)] + src_pl.run(batch, table_name="items") + batch = [{**r, **{"id": key}} for r in [data] for key in 
range(181, 201)] + src_pl.run(batch, table_name="items") + extract_info = dest_pl.extract(changes) + assert extract_info.asdict()["job_metrics"][0]["items_count"] == 100 diff --git a/tests/pg_legacy_replication/utils.py b/tests/pg_legacy_replication/utils.py new file mode 100644 index 000000000..5deb16af0 --- /dev/null +++ b/tests/pg_legacy_replication/utils.py @@ -0,0 +1,52 @@ +from typing import Sequence, List, Dict, Any, Optional + +import dlt +from dlt import Pipeline +from dlt.common.data_writers.escape import escape_postgres_identifier +from dlt.common.configuration.specs import ConnectionStringCredentials + +from tests.utils import select_data + + +def add_pk(sql_client, table_name: str, column_name: str) -> None: + """Adds primary key to postgres table. + + In the context of replication, the primary key serves as REPLICA IDENTITY. + A REPLICA IDENTITY is required when publishing UPDATEs and/or DELETEs. + """ + with sql_client() as c: + qual_name = c.make_qualified_table_name(table_name) + c.execute_sql(f"ALTER TABLE {qual_name} ADD PRIMARY KEY ({column_name});") + + +def assert_loaded_data( + pipeline: Pipeline, + table_name: str, + column_names: Sequence[str], + expectation: List[Dict[str, Any]], + sort_column_name: str, + where_clause: Optional[str] = None, +) -> None: + """Asserts loaded data meets expectation.""" + qual_name = pipeline.sql_client().make_qualified_table_name(table_name) + escape_id = pipeline.destination_client().capabilities.escape_identifier + column_str = ", ".join(map(escape_id, column_names)) + qry = f"SELECT {column_str} FROM {qual_name}" + if where_clause is not None: + qry += " WHERE " + where_clause + observation = [ + {column_name: row[idx] for idx, column_name in enumerate(column_names)} + for row in select_data(pipeline, qry) + ] + assert sorted(observation, key=lambda d: d[sort_column_name]) == expectation + + +def is_super_user(sql_client) -> bool: + """Returns True if Postgres user is superuser, False otherwise.""" + username = dlt.secrets.get( + "sources.pg_replication.credentials", ConnectionStringCredentials + ).username + with sql_client() as c: + return c.execute_sql( # type: ignore[no-any-return] + f"SELECT rolsuper FROM pg_roles WHERE rolname = '{username}';" + )[0][0] diff --git a/tests/postgres/docker-compose.yml b/tests/postgres/docker-compose.yml index aa0a2c5d7..3b901a5ca 100644 --- a/tests/postgres/docker-compose.yml +++ b/tests/postgres/docker-compose.yml @@ -1,4 +1,3 @@ -version: "3.7" services: db: env_file: postgres.env @@ -6,9 +5,14 @@ services: context: postgres dockerfile: Dockerfile container_name: dlt_postgres_db + command: + - postgres + - -c + - config_file=/etc/postgresql/postgresql.conf restart: unless-stopped volumes: - db_home:/var/lib/postgresql/data + - ./postgresql.conf:/etc/postgresql/postgresql.conf:ro ports: - 5432:5432 diff --git a/tests/postgres/postgres/Dockerfile b/tests/postgres/postgres/Dockerfile index 1dfd569b5..e7f9aa73c 100644 --- a/tests/postgres/postgres/Dockerfile +++ b/tests/postgres/postgres/Dockerfile @@ -1,2 +1,23 @@ FROM postgres:14 + +# Install dependencies required to build decoderbufs +RUN apt-get update +RUN apt-get install -f -y \ + software-properties-common \ + build-essential \ + pkg-config \ + git + +RUN apt-get install -f -y \ + postgresql-server-dev-14 \ + libprotobuf-c-dev && \ + rm -rf /var/lib/apt/lists/* + +ARG decoderbufs_version=v1.7.0.Final +RUN git clone https://github.com/debezium/postgres-decoderbufs -b $decoderbufs_version --single-branch && \ + cd postgres-decoderbufs && \ 
+ make && make install && \ + cd .. && \ + rm -rf postgres-decoderbufs + COPY 01_init.sql /docker-entrypoint-initdb.d/ \ No newline at end of file diff --git a/tests/postgres/postgresql.conf b/tests/postgres/postgresql.conf new file mode 100644 index 000000000..93a3dab5a --- /dev/null +++ b/tests/postgres/postgresql.conf @@ -0,0 +1,798 @@ +# ----------------------------- +# PostgreSQL configuration file +# ----------------------------- +# +# This file consists of lines of the form: +# +# name = value +# +# (The "=" is optional.) Whitespace may be used. Comments are introduced with +# "#" anywhere on a line. The complete list of parameter names and allowed +# values can be found in the PostgreSQL documentation. +# +# The commented-out settings shown in this file represent the default values. +# Re-commenting a setting is NOT sufficient to revert it to the default value; +# you need to reload the server. +# +# This file is read on server startup and when the server receives a SIGHUP +# signal. If you edit the file on a running system, you have to SIGHUP the +# server for the changes to take effect, run "pg_ctl reload", or execute +# "SELECT pg_reload_conf()". Some parameters, which are marked below, +# require a server shutdown and restart to take effect. +# +# Any parameter can also be given as a command-line option to the server, e.g., +# "postgres -c log_connections=on". Some parameters can be changed at run time +# with the "SET" SQL command. +# +# Memory units: B = bytes Time units: us = microseconds +# kB = kilobytes ms = milliseconds +# MB = megabytes s = seconds +# GB = gigabytes min = minutes +# TB = terabytes h = hours +# d = days + + +#------------------------------------------------------------------------------ +# FILE LOCATIONS +#------------------------------------------------------------------------------ + +# The default values of these variables are driven from the -D command-line +# option or PGDATA environment variable, represented here as ConfigDir. + +#data_directory = 'ConfigDir' # use data in another directory + # (change requires restart) +#hba_file = 'ConfigDir/pg_hba.conf' # host-based authentication file + # (change requires restart) +#ident_file = 'ConfigDir/pg_ident.conf' # ident configuration file + # (change requires restart) + +# If external_pid_file is not explicitly set, no extra PID file is written. 
+#external_pid_file = '' # write an extra PID file + # (change requires restart) + + +#------------------------------------------------------------------------------ +# CONNECTIONS AND AUTHENTICATION +#------------------------------------------------------------------------------ + +# - Connection Settings - + +listen_addresses = '*' + # comma-separated list of addresses; + # defaults to 'localhost'; use '*' for all + # (change requires restart) +#port = 5432 # (change requires restart) +#max_connections = 100 # (change requires restart) +#superuser_reserved_connections = 3 # (change requires restart) +#unix_socket_directories = '/tmp' # comma-separated list of directories + # (change requires restart) +#unix_socket_group = '' # (change requires restart) +#unix_socket_permissions = 0777 # begin with 0 to use octal notation + # (change requires restart) +#bonjour = off # advertise server via Bonjour + # (change requires restart) +#bonjour_name = '' # defaults to the computer name + # (change requires restart) + +# - TCP settings - +# see "man tcp" for details + +#tcp_keepalives_idle = 0 # TCP_KEEPIDLE, in seconds; + # 0 selects the system default +#tcp_keepalives_interval = 0 # TCP_KEEPINTVL, in seconds; + # 0 selects the system default +#tcp_keepalives_count = 0 # TCP_KEEPCNT; + # 0 selects the system default +#tcp_user_timeout = 0 # TCP_USER_TIMEOUT, in milliseconds; + # 0 selects the system default + +#client_connection_check_interval = 0 # time between checks for client + # disconnection while running queries; + # 0 for never + +# - Authentication - + +#authentication_timeout = 1min # 1s-600s +#password_encryption = scram-sha-256 # scram-sha-256 or md5 +#db_user_namespace = off + +# GSSAPI using Kerberos +#krb_server_keyfile = 'FILE:${sysconfdir}/krb5.keytab' +#krb_caseins_users = off + +# - SSL - + +#ssl = off +#ssl_ca_file = '' +#ssl_cert_file = 'server.crt' +#ssl_crl_file = '' +#ssl_crl_dir = '' +#ssl_key_file = 'server.key' +#ssl_ciphers = 'HIGH:MEDIUM:+3DES:!aNULL' # allowed SSL ciphers +#ssl_prefer_server_ciphers = on +#ssl_ecdh_curve = 'prime256v1' +#ssl_min_protocol_version = 'TLSv1.2' +#ssl_max_protocol_version = '' +#ssl_dh_params_file = '' +#ssl_passphrase_command = '' +#ssl_passphrase_command_supports_reload = off + + +#------------------------------------------------------------------------------ +# RESOURCE USAGE (except WAL) +#------------------------------------------------------------------------------ + +# - Memory - + +#shared_buffers = 32MB # min 128kB + # (change requires restart) +#huge_pages = try # on, off, or try + # (change requires restart) +#huge_page_size = 0 # zero for system default + # (change requires restart) +#temp_buffers = 8MB # min 800kB +#max_prepared_transactions = 0 # zero disables the feature + # (change requires restart) +# Caution: it is not advisable to set max_prepared_transactions nonzero unless +# you actively intend to use prepared transactions. 
+#work_mem = 4MB # min 64kB +#hash_mem_multiplier = 1.0 # 1-1000.0 multiplier on hash table work_mem +#maintenance_work_mem = 64MB # min 1MB +#autovacuum_work_mem = -1 # min 1MB, or -1 to use maintenance_work_mem +#logical_decoding_work_mem = 64MB # min 64kB +#max_stack_depth = 2MB # min 100kB +#shared_memory_type = mmap # the default is the first option + # supported by the operating system: + # mmap + # sysv + # windows + # (change requires restart) +#dynamic_shared_memory_type = posix # the default is the first option + # supported by the operating system: + # posix + # sysv + # windows + # mmap + # (change requires restart) +#min_dynamic_shared_memory = 0MB # (change requires restart) + +# - Disk - + +#temp_file_limit = -1 # limits per-process temp file space + # in kilobytes, or -1 for no limit + +# - Kernel Resources - + +#max_files_per_process = 1000 # min 64 + # (change requires restart) + +# - Cost-Based Vacuum Delay - + +#vacuum_cost_delay = 0 # 0-100 milliseconds (0 disables) +#vacuum_cost_page_hit = 1 # 0-10000 credits +#vacuum_cost_page_miss = 2 # 0-10000 credits +#vacuum_cost_page_dirty = 20 # 0-10000 credits +#vacuum_cost_limit = 200 # 1-10000 credits + +# - Background Writer - + +#bgwriter_delay = 200ms # 10-10000ms between rounds +#bgwriter_lru_maxpages = 100 # max buffers written/round, 0 disables +#bgwriter_lru_multiplier = 2.0 # 0-10.0 multiplier on buffers scanned/round +#bgwriter_flush_after = 0 # measured in pages, 0 disables + +# - Asynchronous Behavior - + +#backend_flush_after = 0 # measured in pages, 0 disables +#effective_io_concurrency = 1 # 1-1000; 0 disables prefetching +#maintenance_io_concurrency = 10 # 1-1000; 0 disables prefetching +#max_worker_processes = 8 # (change requires restart) +#max_parallel_workers_per_gather = 2 # limited by max_parallel_workers +#max_parallel_maintenance_workers = 2 # limited by max_parallel_workers +#max_parallel_workers = 8 # number of max_worker_processes that + # can be used in parallel operations +#parallel_leader_participation = on +#old_snapshot_threshold = -1 # 1min-60d; -1 disables; 0 is immediate + # (change requires restart) + + +#------------------------------------------------------------------------------ +# WRITE-AHEAD LOG +#------------------------------------------------------------------------------ + +# - Settings - + +wal_level = logical # minimal, replica, or logical + # (change requires restart) +#fsync = on # flush data to disk for crash safety + # (turning this off can cause + # unrecoverable data corruption) +#synchronous_commit = on # synchronization level; + # off, local, remote_write, remote_apply, or on +#wal_sync_method = fsync # the default is the first option + # supported by the operating system: + # open_datasync + # fdatasync (default on Linux and FreeBSD) + # fsync + # fsync_writethrough + # open_sync +#full_page_writes = on # recover from partial page writes +#wal_log_hints = off # also do full page writes of non-critical updates + # (change requires restart) +#wal_compression = off # enable compression of full-page writes +#wal_init_zero = on # zero-fill new WAL files +#wal_recycle = on # recycle WAL files +#wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers + # (change requires restart) +#wal_writer_delay = 200ms # 1-10000 milliseconds +#wal_writer_flush_after = 1MB # measured in pages, 0 disables +#wal_skip_threshold = 2MB + +#commit_delay = 0 # range 0-100000, in microseconds +#commit_siblings = 5 # range 1-1000 + +# - Checkpoints - + +#checkpoint_timeout = 5min # range 30s-1d 
+#checkpoint_completion_target = 0.9 # checkpoint target duration, 0.0 - 1.0 +#checkpoint_flush_after = 0 # measured in pages, 0 disables +#checkpoint_warning = 30s # 0 disables +#max_wal_size = 1GB +#min_wal_size = 80MB + +# - Archiving - + +#archive_mode = off # enables archiving; off, on, or always + # (change requires restart) +#archive_command = '' # command to use to archive a logfile segment + # placeholders: %p = path of file to archive + # %f = file name only + # e.g. 'test ! -f /mnt/server/archivedir/%f && cp %p /mnt/server/archivedir/%f' +#archive_timeout = 0 # force a logfile segment switch after this + # number of seconds; 0 disables + +# - Archive Recovery - + +# These are only used in recovery mode. + +#restore_command = '' # command to use to restore an archived logfile segment + # placeholders: %p = path of file to restore + # %f = file name only + # e.g. 'cp /mnt/server/archivedir/%f %p' +#archive_cleanup_command = '' # command to execute at every restartpoint +#recovery_end_command = '' # command to execute at completion of recovery + +# - Recovery Target - + +# Set these only when performing a targeted recovery. + +#recovery_target = '' # 'immediate' to end recovery as soon as a + # consistent state is reached + # (change requires restart) +#recovery_target_name = '' # the named restore point to which recovery will proceed + # (change requires restart) +#recovery_target_time = '' # the time stamp up to which recovery will proceed + # (change requires restart) +#recovery_target_xid = '' # the transaction ID up to which recovery will proceed + # (change requires restart) +#recovery_target_lsn = '' # the WAL LSN up to which recovery will proceed + # (change requires restart) +#recovery_target_inclusive = on # Specifies whether to stop: + # just after the specified recovery target (on) + # just before the recovery target (off) + # (change requires restart) +#recovery_target_timeline = 'latest' # 'current', 'latest', or timeline ID + # (change requires restart) +#recovery_target_action = 'pause' # 'pause', 'promote', 'shutdown' + # (change requires restart) + + +#------------------------------------------------------------------------------ +# REPLICATION +#------------------------------------------------------------------------------ + +# - Sending Servers - + +# Set these on the primary and on any standby that will send replication data. + +#max_wal_senders = 10 # max number of walsender processes + # (change requires restart) +#max_replication_slots = 10 # max number of replication slots + # (change requires restart) +#wal_keep_size = 0 # in megabytes; 0 disables +#max_slot_wal_keep_size = -1 # in megabytes; -1 disables +#wal_sender_timeout = 60s # in milliseconds; 0 disables +#track_commit_timestamp = off # collect timestamp of transaction commit + # (change requires restart) + +# - Primary Server - + +# These settings are ignored on a standby server. + +#synchronous_standby_names = '' # standby servers that provide sync rep + # method to choose sync standbys, number of sync standbys, + # and comma-separated list of application_name + # from standby(s); '*' = all +#vacuum_defer_cleanup_age = 0 # number of xacts by which cleanup is delayed + +# - Standby Servers - + +# These settings are ignored on a primary server. 
+ +#primary_conninfo = '' # connection string to sending server +#primary_slot_name = '' # replication slot on sending server +#promote_trigger_file = '' # file name whose presence ends recovery +#hot_standby = on # "off" disallows queries during recovery + # (change requires restart) +#max_standby_archive_delay = 30s # max delay before canceling queries + # when reading WAL from archive; + # -1 allows indefinite delay +#max_standby_streaming_delay = 30s # max delay before canceling queries + # when reading streaming WAL; + # -1 allows indefinite delay +#wal_receiver_create_temp_slot = off # create temp slot if primary_slot_name + # is not set +#wal_receiver_status_interval = 10s # send replies at least this often + # 0 disables +#hot_standby_feedback = off # send info from standby to prevent + # query conflicts +#wal_receiver_timeout = 60s # time that receiver waits for + # communication from primary + # in milliseconds; 0 disables +#wal_retrieve_retry_interval = 5s # time to wait before retrying to + # retrieve WAL after a failed attempt +#recovery_min_apply_delay = 0 # minimum delay for applying changes during recovery + +# - Subscribers - + +# These settings are ignored on a publisher. + +#max_logical_replication_workers = 4 # taken from max_worker_processes + # (change requires restart) +#max_sync_workers_per_subscription = 2 # taken from max_logical_replication_workers + + +#------------------------------------------------------------------------------ +# QUERY TUNING +#------------------------------------------------------------------------------ + +# - Planner Method Configuration - + +#enable_async_append = on +#enable_bitmapscan = on +#enable_gathermerge = on +#enable_hashagg = on +#enable_hashjoin = on +#enable_incremental_sort = on +#enable_indexscan = on +#enable_indexonlyscan = on +#enable_material = on +#enable_memoize = on +#enable_mergejoin = on +#enable_nestloop = on +#enable_parallel_append = on +#enable_parallel_hash = on +#enable_partition_pruning = on +#enable_partitionwise_join = off +#enable_partitionwise_aggregate = off +#enable_seqscan = on +#enable_sort = on +#enable_tidscan = on + +# - Planner Cost Constants - + +#seq_page_cost = 1.0 # measured on an arbitrary scale +#random_page_cost = 4.0 # same scale as above +#cpu_tuple_cost = 0.01 # same scale as above +#cpu_index_tuple_cost = 0.005 # same scale as above +#cpu_operator_cost = 0.0025 # same scale as above +#parallel_setup_cost = 1000.0 # same scale as above +#parallel_tuple_cost = 0.1 # same scale as above +#min_parallel_table_scan_size = 8MB +#min_parallel_index_scan_size = 512kB +#effective_cache_size = 4GB + +#jit_above_cost = 100000 # perform JIT compilation if available + # and query more expensive than this; + # -1 disables +#jit_inline_above_cost = 500000 # inline small functions if query is + # more expensive than this; -1 disables +#jit_optimize_above_cost = 500000 # use expensive JIT optimizations if + # query is more expensive than this; + # -1 disables + +# - Genetic Query Optimizer - + +#geqo = on +#geqo_threshold = 12 +#geqo_effort = 5 # range 1-10 +#geqo_pool_size = 0 # selects default based on effort +#geqo_generations = 0 # selects default based on effort +#geqo_selection_bias = 2.0 # range 1.5-2.0 +#geqo_seed = 0.0 # range 0.0-1.0 + +# - Other Planner Options - + +#default_statistics_target = 100 # range 1-10000 +#constraint_exclusion = partition # on, off, or partition +#cursor_tuple_fraction = 0.1 # range 0.0-1.0 +#from_collapse_limit = 8 +#jit = on # allow JIT compilation 
+#join_collapse_limit = 8 # 1 disables collapsing of explicit + # JOIN clauses +#plan_cache_mode = auto # auto, force_generic_plan or + # force_custom_plan + + +#------------------------------------------------------------------------------ +# REPORTING AND LOGGING +#------------------------------------------------------------------------------ + +# - Where to Log - + +#log_destination = 'stderr' # Valid values are combinations of + # stderr, csvlog, syslog, and eventlog, + # depending on platform. csvlog + # requires logging_collector to be on. + +# This is used when logging to stderr: +#logging_collector = off # Enable capturing of stderr and csvlog + # into log files. Required to be on for + # csvlogs. + # (change requires restart) + +# These are only used if logging_collector is on: +#log_directory = 'log' # directory where log files are written, + # can be absolute or relative to PGDATA +#log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log' # log file name pattern, + # can include strftime() escapes +#log_file_mode = 0600 # creation mode for log files, + # begin with 0 to use octal notation +#log_rotation_age = 1d # Automatic rotation of logfiles will + # happen after that time. 0 disables. +#log_rotation_size = 10MB # Automatic rotation of logfiles will + # happen after that much log output. + # 0 disables. +#log_truncate_on_rotation = off # If on, an existing log file with the + # same name as the new log file will be + # truncated rather than appended to. + # But such truncation only occurs on + # time-driven rotation, not on restarts + # or size-driven rotation. Default is + # off, meaning append to existing files + # in all cases. + +# These are relevant when logging to syslog: +#syslog_facility = 'LOCAL0' +#syslog_ident = 'postgres' +#syslog_sequence_numbers = on +#syslog_split_messages = on + +# This is only relevant when logging to eventlog (Windows): +# (change requires restart) +#event_source = 'PostgreSQL' + +# - When to Log - + +#log_min_messages = warning # values in order of decreasing detail: + # debug5 + # debug4 + # debug3 + # debug2 + # debug1 + # info + # notice + # warning + # error + # log + # fatal + # panic + +#log_min_error_statement = error # values in order of decreasing detail: + # debug5 + # debug4 + # debug3 + # debug2 + # debug1 + # info + # notice + # warning + # error + # log + # fatal + # panic (effectively off) + +#log_min_duration_statement = -1 # -1 is disabled, 0 logs all statements + # and their durations, > 0 logs only + # statements running at least this number + # of milliseconds + +#log_min_duration_sample = -1 # -1 is disabled, 0 logs a sample of statements + # and their durations, > 0 logs only a sample of + # statements running at least this number + # of milliseconds; + # sample fraction is determined by log_statement_sample_rate + +#log_statement_sample_rate = 1.0 # fraction of logged statements exceeding + # log_min_duration_sample to be logged; + # 1.0 logs all such statements, 0.0 never logs + + +#log_transaction_sample_rate = 0.0 # fraction of transactions whose statements + # are logged regardless of their duration; 1.0 logs all + # statements from all transactions, 0.0 never logs + +# - What to Log - + +#debug_print_parse = off +#debug_print_rewritten = off +#debug_print_plan = off +#debug_pretty_print = on +#log_autovacuum_min_duration = -1 # log autovacuum activity; + # -1 disables, 0 logs all actions and + # their durations, > 0 logs only + # actions running at least this number + # of milliseconds. 
+#log_checkpoints = off +log_connections = on +log_disconnections = on +#log_duration = off +#log_error_verbosity = default # terse, default, or verbose messages +#log_hostname = off +#log_line_prefix = '%m [%p] ' # special values: + # %a = application name + # %u = user name + # %d = database name + # %r = remote host and port + # %h = remote host + # %b = backend type + # %p = process ID + # %P = process ID of parallel group leader + # %t = timestamp without milliseconds + # %m = timestamp with milliseconds + # %n = timestamp with milliseconds (as a Unix epoch) + # %Q = query ID (0 if none or not computed) + # %i = command tag + # %e = SQL state + # %c = session ID + # %l = session line number + # %s = session start timestamp + # %v = virtual transaction ID + # %x = transaction ID (0 if none) + # %q = stop here in non-session + # processes + # %% = '%' + # e.g. '<%u%%%d> ' +#log_lock_waits = off # log lock waits >= deadlock_timeout +#log_recovery_conflict_waits = off # log standby recovery conflict waits + # >= deadlock_timeout +#log_parameter_max_length = -1 # when logging statements, limit logged + # bind-parameter values to N bytes; + # -1 means print in full, 0 disables +#log_parameter_max_length_on_error = 0 # when logging an error, limit logged + # bind-parameter values to N bytes; + # -1 means print in full, 0 disables +log_statement = 'all' # none, ddl, mod, all +#log_replication_commands = off +#log_temp_files = -1 # log temporary files equal or larger + # than the specified size in kilobytes; + # -1 disables, 0 logs all temp files +#log_timezone = 'GMT' + + +#------------------------------------------------------------------------------ +# PROCESS TITLE +#------------------------------------------------------------------------------ + +#cluster_name = '' # added to process titles if nonempty + # (change requires restart) +#update_process_title = on + + +#------------------------------------------------------------------------------ +# STATISTICS +#------------------------------------------------------------------------------ + +# - Query and Index Statistics Collector - + +#track_activities = on +#track_activity_query_size = 1024 # (change requires restart) +#track_counts = on +#track_io_timing = off +#track_wal_io_timing = off +#track_functions = none # none, pl, all +#stats_temp_directory = 'pg_stat_tmp' + + +# - Monitoring - + +#compute_query_id = auto +#log_statement_stats = off +#log_parser_stats = off +#log_planner_stats = off +#log_executor_stats = off + + +#------------------------------------------------------------------------------ +# AUTOVACUUM +#------------------------------------------------------------------------------ + +#autovacuum = on # Enable autovacuum subprocess? 'on' + # requires track_counts to also be on. 
+#autovacuum_max_workers = 3 # max number of autovacuum subprocesses + # (change requires restart) +#autovacuum_naptime = 1min # time between autovacuum runs +#autovacuum_vacuum_threshold = 50 # min number of row updates before + # vacuum +#autovacuum_vacuum_insert_threshold = 1000 # min number of row inserts + # before vacuum; -1 disables insert + # vacuums +#autovacuum_analyze_threshold = 50 # min number of row updates before + # analyze +#autovacuum_vacuum_scale_factor = 0.2 # fraction of table size before vacuum +#autovacuum_vacuum_insert_scale_factor = 0.2 # fraction of inserts over table + # size before insert vacuum +#autovacuum_analyze_scale_factor = 0.1 # fraction of table size before analyze +#autovacuum_freeze_max_age = 200000000 # maximum XID age before forced vacuum + # (change requires restart) +#autovacuum_multixact_freeze_max_age = 400000000 # maximum multixact age + # before forced vacuum + # (change requires restart) +#autovacuum_vacuum_cost_delay = 2ms # default vacuum cost delay for + # autovacuum, in milliseconds; + # -1 means use vacuum_cost_delay +#autovacuum_vacuum_cost_limit = -1 # default vacuum cost limit for + # autovacuum, -1 means use + # vacuum_cost_limit + + +#------------------------------------------------------------------------------ +# CLIENT CONNECTION DEFAULTS +#------------------------------------------------------------------------------ + +# - Statement Behavior - + +#client_min_messages = notice # values in order of decreasing detail: + # debug5 + # debug4 + # debug3 + # debug2 + # debug1 + # log + # notice + # warning + # error +#search_path = '"$user", public' # schema names +#row_security = on +#default_table_access_method = 'heap' +#default_tablespace = '' # a tablespace name, '' uses the default +#default_toast_compression = 'pglz' # 'pglz' or 'lz4' +#temp_tablespaces = '' # a list of tablespace names, '' uses + # only default tablespace +#check_function_bodies = on +#default_transaction_isolation = 'read committed' +#default_transaction_read_only = off +#default_transaction_deferrable = off +#session_replication_role = 'origin' +#statement_timeout = 0 # in milliseconds, 0 is disabled +#lock_timeout = 0 # in milliseconds, 0 is disabled +#idle_in_transaction_session_timeout = 0 # in milliseconds, 0 is disabled +#idle_session_timeout = 0 # in milliseconds, 0 is disabled +#vacuum_freeze_table_age = 150000000 +#vacuum_freeze_min_age = 50000000 +#vacuum_failsafe_age = 1600000000 +#vacuum_multixact_freeze_table_age = 150000000 +#vacuum_multixact_freeze_min_age = 5000000 +#vacuum_multixact_failsafe_age = 1600000000 +#bytea_output = 'hex' # hex, escape +#xmlbinary = 'base64' +#xmloption = 'content' +#gin_pending_list_limit = 4MB + +# - Locale and Formatting - + +#datestyle = 'iso, mdy' +#intervalstyle = 'postgres' +#timezone = 'GMT' +#timezone_abbreviations = 'Default' # Select the set of available time zone + # abbreviations. Currently, there are + # Default + # Australia (historical usage) + # India + # You can create your own file in + # share/timezonesets/. +#extra_float_digits = 1 # min -15, max 3; any value >0 actually + # selects precise output mode +#client_encoding = sql_ascii # actually, defaults to database + # encoding + +# These settings are initialized by initdb, but they can be changed. 
+#lc_messages = 'C' # locale for system error message + # strings +#lc_monetary = 'C' # locale for monetary formatting +#lc_numeric = 'C' # locale for number formatting +#lc_time = 'C' # locale for time formatting + +# default configuration for text search +#default_text_search_config = 'pg_catalog.simple' + +# - Shared Library Preloading - + +#local_preload_libraries = '' +#session_preload_libraries = '' +shared_preload_libraries = 'decoderbufs' # (change requires restart) +#jit_provider = 'llvmjit' # JIT library to use + +# - Other Defaults - + +#dynamic_library_path = '$libdir' +#extension_destdir = '' # prepend path when loading extensions + # and shared objects (added by Debian) +#gin_fuzzy_search_limit = 0 + + +#------------------------------------------------------------------------------ +# LOCK MANAGEMENT +#------------------------------------------------------------------------------ + +#deadlock_timeout = 1s +#max_locks_per_transaction = 64 # min 10 + # (change requires restart) +#max_pred_locks_per_transaction = 64 # min 10 + # (change requires restart) +#max_pred_locks_per_relation = -2 # negative values mean + # (max_pred_locks_per_transaction + # / -max_pred_locks_per_relation) - 1 +#max_pred_locks_per_page = 2 # min 0 + + +#------------------------------------------------------------------------------ +# VERSION AND PLATFORM COMPATIBILITY +#------------------------------------------------------------------------------ + +# - Previous PostgreSQL Versions - + +#array_nulls = on +#backslash_quote = safe_encoding # on, off, or safe_encoding +#escape_string_warning = on +#lo_compat_privileges = off +#quote_all_identifiers = off +#standard_conforming_strings = on +#synchronize_seqscans = on + +# - Other Platforms and Clients - + +#transform_null_equals = off + + +#------------------------------------------------------------------------------ +# ERROR HANDLING +#------------------------------------------------------------------------------ + +#exit_on_error = off # terminate session on any error? +#restart_after_crash = on # reinitialize after backend crash? +#data_sync_retry = off # retry or panic on failure to fsync + # data? + # (change requires restart) +#recovery_init_sync_method = fsync # fsync, syncfs (Linux 5.8+) + + +#------------------------------------------------------------------------------ +# CONFIG FILE INCLUDES +#------------------------------------------------------------------------------ + +# These options allow settings to be loaded from files other than the +# default postgresql.conf. Note that these are directives, not variable +# assignments, so they can usefully be given more than once. + +#include_dir = '...' # include files ending in '.conf' from + # a directory, e.g., 'conf.d' +#include_if_exists = '...' # include file only if it exists +#include = '...' # include file + + +#------------------------------------------------------------------------------ +# CUSTOMIZED OPTIONS +#------------------------------------------------------------------------------ + +# Add settings for extensions here \ No newline at end of file