diff --git a/Cargo.lock b/Cargo.lock index 75c149a..81903d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,20 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "getrandom", + "once_cell", + "serde", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.3" @@ -138,6 +152,21 @@ version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51" +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitflags" version = "1.3.2" @@ -159,6 +188,12 @@ dependencies = [ "generic-array", ] +[[package]] +name = "borrow-or-share" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3eeab4423108c5d7c744f4d234de88d18d636100093ae04caf4825134b9c3a32" + [[package]] name = "bstr" version = "1.9.1" @@ -176,6 +211,12 @@ version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +[[package]] +name = "bytecount" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" + [[package]] name = "byteorder" version = "1.5.0" @@ -386,6 +427,17 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "doc-comment" version = "0.3.3" @@ -398,6 +450,15 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "email_address" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e079f19b08ca6239f47f8ba8509c11cf3ea30095831f7fed61441475edd8c449" +dependencies = [ + "serde", +] + [[package]] name = "encoding_rs" version = "0.8.34" @@ -436,12 +497,34 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "fancy-regex" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + [[package]] name = "fastrand" version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +[[package]] +name = "fluent-uri" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1918b65d96df47d3591bed19c5cca17e3fa5d0707318e4b5ef2eae01764df7e5" +dependencies = [ + "borrow-or-share", + "ref-cast", + "serde", +] + [[package]] name = "fnv" version = "1.0.7" @@ -472,6 +555,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fraction" +version = "0.15.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f158e3ff0a1b334408dc9fb811cd99b446986f4d8b741bb08f9df1604085ae7" +dependencies = [ + "lazy_static", + "num", +] + [[package]] name = "futures-channel" version = "0.3.30" @@ -756,6 +849,124 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "ident_case" version = "1.0.1" @@ -772,6 +983,27 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "idna" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + [[package]] name = "indexmap" version = "1.9.3" @@ -839,6 +1071,30 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "jsonschema" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26a960f0c34d5423581d858ce94815cc11f0171b09939409097969ed269ede1b" +dependencies = [ + "ahash", + "base64", + "bytecount", + "email_address", + "fancy-regex", + "fraction", + "idna 1.0.3", + "itoa", + "num-cmp", + "once_cell", + "percent-encoding", + "referencing", + "regex-syntax", + "serde", + "serde_json", + "uuid-simd", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -863,6 +1119,12 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +[[package]] +name = "litemap" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" + [[package]] name = "lock_api" version = "0.4.12" @@ -888,6 +1150,7 @@ dependencies = [ "colored", "convert_case", "gray_matter", + "jsonschema", "lazy_static", "log", "minijinja", @@ -993,17 +1256,87 @@ dependencies = [ "tempfile", ] +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-cmp" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63335b2e2c34fae2fb0aa2cecfd9f0832a1e24b3b32ecec612c3426d46dc8aaa" + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + [[package]] name = "num-conv" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] @@ -1019,9 +1352,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.19.0" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "openai-api-rs" @@ -1092,6 +1425,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "outref" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" + [[package]] name = "parking_lot" version = "0.12.3" @@ -1366,6 +1705,39 @@ dependencies = [ "bitflags 2.5.0", ] +[[package]] +name = "ref-cast" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf0a6f84d5f1d581da8b41b47ec8600871962f2a528115b542b362d4b744931" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "referencing" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb8e15af8558cb157432dd3d88c1d1e982d0a5755cf80ce593b6499260aebc49" +dependencies = [ + "ahash", + "fluent-uri", + "once_cell", + "percent-encoding", + "serde_json", +] + [[package]] name = "regex" version = "1.10.4" @@ -1391,9 +1763,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "reqwest" @@ -1664,6 +2036,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "strsim" version = "0.10.0" @@ -1693,6 +2071,17 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "system-configuration" version = "0.5.1" @@ -1809,6 +2198,16 @@ dependencies = [ "time-core", ] +[[package]] +name = "tinystr" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tinyvec" version = "1.6.0" @@ -2083,7 +2482,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" dependencies = [ "form_urlencoded", - "idna", + "idna 0.5.0", "percent-encoding", ] @@ -2093,12 +2492,41 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "utf8parse" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +[[package]] +name = "uuid" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" + +[[package]] +name = "uuid-simd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b082222b4f6619906941c17eb2297fff4c2fb96cb60164170522942a200bd8" +dependencies = [ + "outref", + "uuid", + "vsimd", +] + [[package]] name = "vcpkg" version = "0.2.15" @@ -2111,6 +2539,12 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + [[package]] name = "wait-timeout" version = "0.2.0" @@ -2388,6 +2822,18 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + [[package]] name = "yaml-rust" version = "0.4.5" @@ -2403,6 +2849,30 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" +[[package]] +name = "yoke" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.7.35" @@ -2423,3 +2893,46 @@ dependencies = [ "quote", "syn", ] + +[[package]] +name = "zerofrom" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index f22389f..df67fd7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,7 @@ wasm-bindgen = { version = "0.2.95", optional = true } serde-wasm-bindgen = { version = "0.6.5", optional = true } tokio = { version = "1.42.0", features = ["rt"] } openai-api-rs = { version = "5.2.3", optional = true } +jsonschema = { version = "0.26.2", default-features = false } [features] default = ["openai"] diff --git a/src/bin/cli.rs b/src/bin/cli.rs index 22d5b8d..d08d4c6 100644 --- a/src/bin/cli.rs +++ b/src/bin/cli.rs @@ -23,9 +23,11 @@ use clap::{Parser, Subcommand}; use colored::Colorize; +use log::error; use mdmodels::{ datamodel::DataModel, exporters::{render_jinja_template, Templates}, + json::validation::validate_json, llm::extraction::query_openai, pipeline::process_pipeline, }; @@ -53,6 +55,8 @@ enum Commands { Pipeline(PipelineArgs), /// Large Language Model Extraction Extract(ExtractArgs), + /// Validate a dataset against a markdown model. + Dataset(DatasetArgs), } /// Arguments for the validate subcommand. @@ -141,6 +145,33 @@ struct ExtractArgs { multiple: bool, } +/// Arguments for the dataset subcommand. +#[derive(Parser, Debug)] +struct DatasetArgs { + #[command(subcommand)] + command: DatasetCommands, +} + +/// Subcommands for dataset operations +#[derive(Subcommand, Debug)] +enum DatasetCommands { + /// Validate a dataset against a markdown model. + Validate(ValidateDatasetArgs), + // Add more dataset subcommands here as needed +} + +/// Arguments for the validate dataset subcommand. +#[derive(Parser, Debug)] +struct ValidateDatasetArgs { + /// Path to the dataset file. + #[arg(short, long, help = "Path to the dataset file")] + input: InputType, + + /// Path to the markdown model. + #[arg(short, long, help = "Path to the markdown model")] + model: InputType, +} + /// Represents the input type, either remote URL or local file path. #[derive(Deserialize, Serialize, Clone, Debug)] enum InputType { @@ -186,6 +217,9 @@ fn main() -> Result<(), Box> { Commands::Convert(args) => convert(args), Commands::Pipeline(args) => process_pipeline(&args.input), Commands::Extract(args) => query_llm(args), + Commands::Dataset(args) => match args.command { + DatasetCommands::Validate(args) => validate_ds(args), + }, } } @@ -355,6 +389,20 @@ fn render_all_json_schemes( Ok(()) } +/// Validates a dataset against a markdown model. +fn validate_ds(args: ValidateDatasetArgs) -> Result<(), Box> { + let model_path = resolve_input_path(&args.model); + let model = DataModel::from_markdown(&model_path)?; + let dataset_path = resolve_input_path(&args.input); + let result = validate_json(dataset_path, &model, None)?; + + for error in result { + error!("{}", error); + } + + Ok(()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/bindings/python.rs b/src/bindings/python.rs index 91ff6f4..00e9065 100644 --- a/src/bindings/python.rs +++ b/src/bindings/python.rs @@ -82,7 +82,7 @@ impl DataModel { /// /// A string that represents the `DataModel` instance. fn __repr__(&self) -> String { - self.model.sdrdm_schema() + self.model.internal_schema() } /// Converts the `DataModel` instance to a specified template format. diff --git a/src/datamodel.rs b/src/datamodel.rs index 92a2caa..26058a2 100644 --- a/src/datamodel.rs +++ b/src/datamodel.rs @@ -30,6 +30,7 @@ use serde::{Deserialize, Serialize}; use crate::exporters::{render_jinja_template, Templates}; use crate::json::export::to_json_schema; +use crate::json::validation::{validate_json, ValidationError}; use crate::markdown::frontmatter::FrontMatter; use crate::markdown::parser::parse_markdown; use crate::object::{Enumeration, Object}; @@ -60,7 +61,7 @@ use pyo3::pyclass; // * `parse` - Parse a markdown file and create a data model // * `json_schema` - Generate a JSON schema from the data model // * `json_schema_all` - Generate JSON schemas for all objects in the data model -// * `sdrdm_schema` - Generate a SDRDM schema from the data model +// * `internal_schema` - Generate an internal schema from the data model #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[cfg_attr(feature = "python", pyclass(get_all))] pub struct DataModel { @@ -82,6 +83,28 @@ impl DataModel { } } + /// Validates a dataset against the data model. + /// + /// This function takes the path to a dataset and validates it against the + /// current data model. It returns a vector of validation errors if any + /// validation issues are found, or an empty vector if the validation is successful. + /// + /// # Arguments + /// + /// * `path` - A reference to the path of the dataset to validate. + /// * `root` - An optional root path for the schema. Will use the first object if not provided. + /// + /// # Returns + /// A Result containing a vector of `ValidationError` if validation fails, + /// or an empty vector if successful. + pub fn validate_json( + &self, + path: &Path, + root: Option, + ) -> Result, Box> { + validate_json(path.to_path_buf(), self, root) + } + // Get the JSON schema for an object // // * `obj_name` - Name of the object @@ -166,7 +189,7 @@ impl DataModel { Ok(()) } - // Get the SDRDM schema for the markdown file + // Get the internal schema for the markdown file // // # Panics // @@ -177,18 +200,18 @@ impl DataModel { // ``` // let model = DataModel::new(); // model.parse("path/to/file.md".to_string()); - // let schema = model.sdrdm_schema(); + // let schema = model.internal_schema(); // ``` // // # Returns // - // A SDRDM schema string - pub fn sdrdm_schema(&self) -> String { + // An internal schema string + pub fn internal_schema(&self) -> String { if self.objects.is_empty() { panic!("No objects found in the markdown file"); } - serde_json::to_string_pretty(&self).expect("Could not serialize to sdRDM schema") + serde_json::to_string_pretty(&self).expect("Could not serialize to internal schema") } // Parse a markdown file and create a data model @@ -199,14 +222,14 @@ impl DataModel { // // ``` // let path = Path::new("path/to/file.md"); - // let model = DataModel::from_sdrdm_schema(path); + // let model = DataModel::from_internal_schema(path); // ``` // // # Returns // // A data model // - pub fn from_sdrdm_schema(path: &Path) -> Result> { + pub fn from_internal_schema(path: &Path) -> Result> { if !path.exists() { return Err("File does not exist".into()); } @@ -457,12 +480,12 @@ mod tests { } #[test] - fn test_from_sdrdm_schema() { + fn test_from_internal_schema() { // Arrange - let path = Path::new("tests/data/expected_sdrdm_schema.json"); + let path = Path::new("tests/data/expected_internal_schema.json"); // Act - let model = DataModel::from_sdrdm_schema(path).expect("Failed to parse SDRDM schema"); + let model = DataModel::from_internal_schema(path).expect("Failed to parse internal schema"); // Assert assert_eq!(model.objects.len(), 2); diff --git a/src/json/validation.rs b/src/json/validation.rs new file mode 100644 index 0000000..bda494d --- /dev/null +++ b/src/json/validation.rs @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2024 Jan Range + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +use std::error::Error; +use std::path::PathBuf; + +use colored::Colorize; +use jsonschema::error::ValidationErrorKind; +use serde_json::Value; +use std::convert::TryFrom; + +use crate::datamodel::DataModel; +use jsonschema::validator_for; + +/// Represents a validation error that occurs during dataset validation. +#[derive(Debug)] +pub struct ValidationError { + pub instance_path: String, + pub schema_path: String, + pub message: String, + pub kind: ValidationErrorKind, +} + +impl std::fmt::Display for ValidationError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "Validation Error: Instance {} violates schema at {}: {}", + self.instance_path.red().bold(), + self.schema_path.green().bold(), + self.message.yellow().bold() + ) + } +} + +impl From> for ValidationError { + fn from(err: jsonschema::ValidationError) -> Self { + ValidationError { + instance_path: err.instance_path.to_string(), + schema_path: err.schema_path.to_string(), + message: err.to_string(), + kind: err.kind, + } + } +} + +/// Validates a dataset against a given DataModel. +/// +/// # Arguments +/// +/// * `dataset` - The dataset to validate, which can be provided in various forms. +/// * `model` - A reference to the DataModel against which the dataset will be validated. +/// * `root` - An optional root path for the schema. +/// +/// # Returns +/// +/// A Result containing a vector of ValidationErrors if validation fails, or an empty vector if successful. +pub fn validate_json>( + dataset: T, + model: &DataModel, + root: Option, +) -> Result, Box> { + // Convert the dataset input to a Value + let dataset_input: DatasetInput = dataset.into(); + let value: Value = dataset_input.try_into()?; + + // Get the JSON Schema from the model + let schema = model.json_schema(root, false)?; + let schema_value: Value = serde_json::from_str(&schema)?; + + // Create a validator for the schema + let validator = validator_for(&schema_value)?; + + // Validate the dataset against the schema + let result = validator.iter_errors(&value); + let mut errors: Vec = Vec::new(); + + for err in result { + errors.push(ValidationError::from(err)); + } + + Ok(errors) +} + +/// Enum representing the different types of dataset inputs. +pub enum DatasetInput { + Path(PathBuf), + Value(Value), + String(String), +} + +impl From for DatasetInput { + /// Converts a PathBuf into a DatasetInput. + fn from(path: PathBuf) -> Self { + DatasetInput::Path(path) + } +} + +impl From for DatasetInput { + /// Converts a Value into a DatasetInput. + fn from(value: Value) -> Self { + DatasetInput::Value(value) + } +} + +impl From for DatasetInput { + /// Converts a String into a DatasetInput. + fn from(string: String) -> Self { + DatasetInput::String(string) + } +} + +impl TryFrom for Value { + type Error = Box; + + fn try_from(input: DatasetInput) -> Result { + match input { + DatasetInput::Path(path) => { + // Logic to read from the path and convert to Value + let content = std::fs::read_to_string(path)?; + let value: Value = serde_json::from_str(&content)?; + Ok(value) + } + DatasetInput::Value(value) => Ok(value), + DatasetInput::String(string) => { + let value: Value = serde_json::from_str(&string)?; + Ok(value) + } + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 9cee236..9283d78 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -41,6 +41,7 @@ pub mod json { mod datatype; pub mod export; pub mod schema; + pub mod validation; } pub(crate) mod markdown { diff --git a/src/pipeline.rs b/src/pipeline.rs index c656555..36ec090 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -340,7 +340,7 @@ fn serialize_to_internal_schema( ) -> Result<(), Box> { match merge_state { MergeState::Merge => { - let schema = model.sdrdm_schema(); + let schema = model.internal_schema(); save_to_file(out, &schema)?; print_render_msg(out, &Templates::Internal); Ok(()) diff --git a/tests/data/expected_sdrdm_full_schema.json b/tests/data/expected_internal_full_schema.json similarity index 100% rename from tests/data/expected_sdrdm_full_schema.json rename to tests/data/expected_internal_full_schema.json diff --git a/tests/data/expected_sdrdm_schema.json b/tests/data/expected_internal_schema.json similarity index 100% rename from tests/data/expected_sdrdm_schema.json rename to tests/data/expected_internal_schema.json diff --git a/tests/data/expected_sdrdm_schema_inheritance.json b/tests/data/expected_internal_schema_inheritance.json similarity index 100% rename from tests/data/expected_sdrdm_schema_inheritance.json rename to tests/data/expected_internal_schema_inheritance.json diff --git a/tests/data/invalid_dataset.json b/tests/data/invalid_dataset.json new file mode 100644 index 0000000..069659e --- /dev/null +++ b/tests/data/invalid_dataset.json @@ -0,0 +1,28 @@ +{ + "floating": "point", + "integer": "1", + "boolean": "true", + "string": 10.0, + "primitive_number_array": [ + "1", + "2", + "3" + ], + "complex": { + "name": 20.0, + "age": "20" + }, + "complex_array": [ + { + "name": 20.0, + "age": "20" + } + ], + "mixed_array": [ + "1", + { + "name": 20.0, + "age": "20" + } + ] +} \ No newline at end of file diff --git a/tests/data/model_json_validation.md b/tests/data/model_json_validation.md new file mode 100644 index 0000000..2e1e85d --- /dev/null +++ b/tests/data/model_json_validation.md @@ -0,0 +1,27 @@ +### Root + +- floating + - Type: float +- integer + - Type: integer +- boolean + - Type: boolean +- string + - Type: string +- primitive_number_array + - Type: float[] +- complex + - Type: Nested +- complex_array + - Type: Nested[] +- mixed_array + - Type: integer, Nested + - Multiple: True + + +### Nested + +- name + - Type: string +- age + - Type: integer diff --git a/tests/data/valid_dataset.json b/tests/data/valid_dataset.json new file mode 100644 index 0000000..b630ba8 --- /dev/null +++ b/tests/data/valid_dataset.json @@ -0,0 +1,28 @@ +{ + "floating": 10.0, + "integer": 1, + "boolean": true, + "string": "hello", + "primitive_number_array": [ + 1, + 2, + 3 + ], + "complex": { + "name": "Test", + "age": 20 + }, + "complex_array": [ + { + "name": "Test", + "age": 20 + } + ], + "mixed_array": [ + 1, + { + "name": "Test", + "age": 20 + } + ] +} \ No newline at end of file diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index ebdc925..bf14d9c 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -92,10 +92,10 @@ mod tests { // Assert let expected = - std::fs::read_to_string("tests/data/expected_sdrdm_full_schema.json").unwrap(); + std::fs::read_to_string("tests/data/expected_internal_full_schema.json").unwrap(); let expected: serde_json::Value = serde_json::from_str(&expected).unwrap(); - let schema = model.sdrdm_schema(); + let schema = model.internal_schema(); let schema: serde_json::Value = serde_json::from_str(&schema).unwrap(); assert_eq!(schema, expected); @@ -192,18 +192,18 @@ mod tests { } #[test] - fn test_sdrdm_schema() { + fn test_internal_schema() { // Arrange let path = Path::new("tests/data/model.md"); let model = DataModel::from_markdown(path).expect("Could not parse markdown"); // Act - let schema = model.sdrdm_schema(); + let schema = model.internal_schema(); let schema: serde_json::Value = serde_json::from_str(&schema).unwrap(); // Assert let expected_schema = - std::fs::read_to_string("tests/data/expected_sdrdm_schema.json").unwrap(); + std::fs::read_to_string("tests/data/expected_internal_schema.json").unwrap(); let expected_schema: serde_json::Value = serde_json::from_str(&expected_schema).unwrap(); assert_eq!(schema, expected_schema); @@ -211,12 +211,12 @@ mod tests { #[test] #[should_panic] - fn test_sdrdm_schema_no_objects() { + fn test_internal_schema_no_objects() { // Arrange let model = mdmodels::datamodel::DataModel::new(None, None); // Act - model.sdrdm_schema(); + model.internal_schema(); } #[test] @@ -278,11 +278,12 @@ mod tests { let model = DataModel::from_markdown(path).expect("Could not parse markdown"); // Assert - let schema = model.sdrdm_schema(); + let schema = model.internal_schema(); let schema: serde_json::Value = serde_json::from_str(&schema).unwrap(); let expected_schema = - std::fs::read_to_string("tests/data/expected_sdrdm_schema_inheritance.json").unwrap(); + std::fs::read_to_string("tests/data/expected_internal_schema_inheritance.json") + .unwrap(); assert_eq!(schema, expected_schema); } @@ -366,4 +367,28 @@ mod tests { assert_eq!(e, expected); } } + + #[test] + fn test_json_validation() { + let path = Path::new("tests/data/model_json_validation.md"); + let model = DataModel::from_markdown(path).expect("Could not parse markdown"); + + let validation = model + .validate_json(Path::new("tests/data/invalid_dataset.json"), None) + .expect("Could not validate JSON"); + + assert_eq!(validation.len(), 13); + } + + #[test] + fn test_json_validation_valid() { + let path = Path::new("tests/data/model_json_validation.md"); + let model = DataModel::from_markdown(path).expect("Could not parse markdown"); + + let validation = model + .validate_json(Path::new("tests/data/valid_dataset.json"), None) + .expect("Could not validate JSON"); + + assert_eq!(validation.len(), 0); + } }