diff --git a/.gitignore b/.gitignore index fc0e029..7448087 100644 --- a/.gitignore +++ b/.gitignore @@ -7,5 +7,6 @@ test.py rebalance_test.py split_comparison.py fp.py -*.npz -*.json \ No newline at end of file + +*.json +split_benchmark_process.py diff --git a/Data/CP/cp_test.txt b/Data/CP/cp_test.txt deleted file mode 100644 index 1df68e5..0000000 --- a/Data/CP/cp_test.txt +++ /dev/null @@ -1,228 +0,0 @@ -2024-08-29 14:21:03,255 - INFO - DROP CLASS RATIO 0.05 -2024-08-29 14:21:09,283 - INFO - Partition data using stratify reduction approach -2024-08-29 14:21:09,910 - INFO - Partition data using stratify approach -2024-08-29 14:21:10,173 - INFO - Train model... -2024-08-29 14:23:40,721 - INFO - Evaluation... -2024-08-29 14:23:46,369 - INFO - MCC_original: 0.9341941585174421 -2024-08-29 14:23:46,375 - INFO - MCC_calibrated: 0.8433538859247134 -2024-08-29 14:23:46,379 - INFO - MCC_certainty: 0.9896139503007725 -2024-08-29 14:23:46,465 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:23:46,467 - INFO - F1_calibrated_novelty: 0.84875 -2024-08-29 14:23:46,467 - INFO - Threshold: 0.1593790420706205 -2024-08-29 14:23:46,468 - INFO - Results for 0.05: {'Threshold': 0.1593790420706205, 'MCC_original': 0.9341941585174421, 'MCC_calibrated': 0.8433538859247134, 'MCC_certainty': 0.9896139503007725, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.84875} -2024-08-29 14:23:46,468 - INFO - DROP CLASS RATIO 0.1 -2024-08-29 14:23:52,261 - INFO - Partition data using stratify reduction approach -2024-08-29 14:23:52,922 - INFO - Partition data using stratify approach -2024-08-29 14:23:53,196 - INFO - Train model... -2024-08-29 14:26:05,134 - INFO - Evaluation... -2024-08-29 14:26:10,402 - INFO - MCC_original: 0.8828220974500053 -2024-08-29 14:26:10,408 - INFO - MCC_calibrated: 0.8515557170080337 -2024-08-29 14:26:10,413 - INFO - MCC_certainty: 0.9622079366584919 -2024-08-29 14:26:10,603 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:26:10,606 - INFO - F1_calibrated_novelty: 0.69675 -2024-08-29 14:26:10,606 - INFO - Threshold: 0.1300285274422259 -2024-08-29 14:26:10,607 - INFO - Results for 0.1: {'Threshold': 0.1300285274422259, 'MCC_original': 0.8828220974500053, 'MCC_calibrated': 0.8515557170080337, 'MCC_certainty': 0.9622079366584919, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.69675} -2024-08-29 14:26:10,607 - INFO - DROP CLASS RATIO 0.15000000000000002 -2024-08-29 14:26:18,195 - INFO - Partition data using stratify reduction approach -2024-08-29 14:26:19,082 - INFO - Partition data using stratify approach -2024-08-29 14:26:19,378 - INFO - Train model... -2024-08-29 14:28:18,920 - INFO - Evaluation... -2024-08-29 14:28:25,970 - INFO - MCC_original: 0.8473644709810452 -2024-08-29 14:28:25,976 - INFO - MCC_calibrated: 0.8582929426954985 -2024-08-29 14:28:25,981 - INFO - MCC_certainty: 0.9635400589446806 -2024-08-29 14:28:26,340 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:28:26,343 - INFO - F1_calibrated_novelty: 0.7996428571428571 -2024-08-29 14:28:26,343 - INFO - Threshold: 0.12868256892922364 -2024-08-29 14:28:26,346 - INFO - Results for 0.15000000000000002: {'Threshold': 0.12868256892922364, 'MCC_original': 0.8473644709810452, 'MCC_calibrated': 0.8582929426954985, 'MCC_certainty': 0.9635400589446806, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.7996428571428571} -2024-08-29 14:28:26,347 - INFO - DROP CLASS RATIO 0.2 -2024-08-29 14:28:32,779 - INFO - Partition data using stratify reduction approach -2024-08-29 14:28:33,433 - INFO - Partition data using stratify approach -2024-08-29 14:28:33,715 - INFO - Train model... -2024-08-29 14:30:28,235 - INFO - Evaluation... -2024-08-29 14:30:34,865 - INFO - MCC_original: 0.7954491798475631 -2024-08-29 14:30:34,871 - INFO - MCC_calibrated: 0.8496564061731191 -2024-08-29 14:30:34,876 - INFO - MCC_certainty: 0.9483571408358538 -2024-08-29 14:30:35,242 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:30:35,248 - INFO - F1_calibrated_novelty: 0.81425 -2024-08-29 14:30:35,248 - INFO - Threshold: 0.12646082445699725 -2024-08-29 14:30:35,251 - INFO - Results for 0.2: {'Threshold': 0.12646082445699725, 'MCC_original': 0.7954491798475631, 'MCC_calibrated': 0.8496564061731191, 'MCC_certainty': 0.9483571408358538, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.81425} -2024-08-29 14:30:35,251 - INFO - DROP CLASS RATIO 0.25 -2024-08-29 14:30:42,466 - INFO - Partition data using stratify reduction approach -2024-08-29 14:30:43,145 - INFO - Partition data using stratify approach -2024-08-29 14:30:43,372 - INFO - Train model... -2024-08-29 14:32:18,090 - INFO - Evaluation... -2024-08-29 14:32:23,721 - INFO - MCC_original: 0.7631117147236012 -2024-08-29 14:32:23,726 - INFO - MCC_calibrated: 0.8549841454816617 -2024-08-29 14:32:23,730 - INFO - MCC_certainty: 0.9458955041727738 -2024-08-29 14:32:24,110 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:32:24,114 - INFO - F1_calibrated_novelty: 0.8526041666666667 -2024-08-29 14:32:24,114 - INFO - Threshold: 0.12958420736715168 -2024-08-29 14:32:24,116 - INFO - Results for 0.25: {'Threshold': 0.12958420736715168, 'MCC_original': 0.7631117147236012, 'MCC_calibrated': 0.8549841454816617, 'MCC_certainty': 0.9458955041727738, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.8526041666666667} -2024-08-29 14:32:24,116 - INFO - DROP CLASS RATIO 0.3 -2024-08-29 14:32:30,008 - INFO - Partition data using stratify reduction approach -2024-08-29 14:32:30,651 - INFO - Partition data using stratify approach -2024-08-29 14:32:30,855 - INFO - Train model... -2024-08-29 14:33:52,973 - INFO - Evaluation... -2024-08-29 14:33:58,527 - INFO - MCC_original: 0.7145502726638812 -2024-08-29 14:33:58,532 - INFO - MCC_calibrated: 0.853811014792632 -2024-08-29 14:33:58,536 - INFO - MCC_certainty: 0.9639911068509706 -2024-08-29 14:33:58,961 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:33:58,966 - INFO - F1_calibrated_novelty: 0.92825 -2024-08-29 14:33:58,966 - INFO - Threshold: 0.1635861683129534 -2024-08-29 14:33:58,968 - INFO - Results for 0.3: {'Threshold': 0.1635861683129534, 'MCC_original': 0.7145502726638812, 'MCC_calibrated': 0.853811014792632, 'MCC_certainty': 0.9639911068509706, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.92825} -2024-08-29 14:33:58,968 - INFO - DROP CLASS RATIO 0.35000000000000003 -2024-08-29 14:34:04,626 - INFO - Partition data using stratify reduction approach -2024-08-29 14:34:05,281 - INFO - Partition data using stratify approach -2024-08-29 14:34:05,460 - INFO - Train model... -2024-08-29 14:35:20,803 - INFO - Evaluation... -2024-08-29 14:35:28,096 - INFO - MCC_original: 0.6825887114096666 -2024-08-29 14:35:28,101 - INFO - MCC_calibrated: 0.8254274261317943 -2024-08-29 14:35:28,105 - INFO - MCC_certainty: 0.9287472119530007 -2024-08-29 14:35:28,578 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:35:28,583 - INFO - F1_calibrated_novelty: 0.8774264705882353 -2024-08-29 14:35:28,583 - INFO - Threshold: 0.16354070947928706 -2024-08-29 14:35:28,585 - INFO - Results for 0.35000000000000003: {'Threshold': 0.16354070947928706, 'MCC_original': 0.6825887114096666, 'MCC_calibrated': 0.8254274261317943, 'MCC_certainty': 0.9287472119530007, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.8774264705882353} -2024-08-29 14:35:28,585 - INFO - DROP CLASS RATIO 0.4 -2024-08-29 14:35:36,023 - INFO - Partition data using stratify reduction approach -2024-08-29 14:35:36,743 - INFO - Partition data using stratify approach -2024-08-29 14:35:36,913 - INFO - Train model... -2024-08-29 14:36:43,443 - INFO - Evaluation... -2024-08-29 14:36:50,298 - INFO - MCC_original: 0.6556399862040148 -2024-08-29 14:36:50,304 - INFO - MCC_calibrated: 0.844793173992272 -2024-08-29 14:36:50,308 - INFO - MCC_certainty: 0.8914202947522641 -2024-08-29 14:36:50,843 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:36:50,848 - INFO - F1_calibrated_novelty: 0.8165789473684211 -2024-08-29 14:36:50,848 - INFO - Threshold: 0.09880301611117476 -2024-08-29 14:36:50,850 - INFO - Results for 0.4: {'Threshold': 0.09880301611117476, 'MCC_original': 0.6556399862040148, 'MCC_calibrated': 0.844793173992272, 'MCC_certainty': 0.8914202947522641, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.8165789473684211} -2024-08-29 14:36:50,850 - INFO - DROP CLASS RATIO 0.45 -2024-08-29 14:36:56,747 - INFO - Partition data using stratify reduction approach -2024-08-29 14:36:57,385 - INFO - Partition data using stratify approach -2024-08-29 14:36:57,532 - INFO - Train model... -2024-08-29 14:37:50,059 - INFO - Evaluation... -2024-08-29 14:37:56,150 - INFO - MCC_original: 0.6090781124670661 -2024-08-29 14:37:56,154 - INFO - MCC_calibrated: 0.8397033735625997 -2024-08-29 14:37:56,158 - INFO - MCC_certainty: 0.884432887185387 -2024-08-29 14:37:56,713 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:37:56,721 - INFO - F1_calibrated_novelty: 0.8466477272727273 -2024-08-29 14:37:56,721 - INFO - Threshold: 0.105895357986557 -2024-08-29 14:37:56,724 - INFO - Results for 0.45: {'Threshold': 0.105895357986557, 'MCC_original': 0.6090781124670661, 'MCC_calibrated': 0.8397033735625997, 'MCC_certainty': 0.884432887185387, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.8466477272727273} -2024-08-29 14:37:56,724 - INFO - DROP CLASS RATIO 0.5 -2024-08-29 14:38:02,492 - INFO - Partition data using stratify reduction approach -2024-08-29 14:38:03,137 - INFO - Partition data using stratify approach -2024-08-29 14:38:03,292 - INFO - Train model... -2024-08-29 14:38:47,060 - INFO - Evaluation... -2024-08-29 14:38:52,538 - INFO - MCC_original: 0.5802905931631231 -2024-08-29 14:38:52,542 - INFO - MCC_calibrated: 0.8596381201450173 -2024-08-29 14:38:52,546 - INFO - MCC_certainty: 0.8998765927326442 -2024-08-29 14:38:53,094 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:38:53,103 - INFO - F1_calibrated_novelty: 0.8897916666666666 -2024-08-29 14:38:53,128 - INFO - Threshold: 0.10492468833445744 -2024-08-29 14:38:53,131 - INFO - Results for 0.5: {'Threshold': 0.10492468833445744, 'MCC_original': 0.5802905931631231, 'MCC_calibrated': 0.8596381201450173, 'MCC_certainty': 0.8998765927326442, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.8897916666666666} -2024-08-29 14:38:53,131 - INFO - DROP CLASS RATIO 0.55 -2024-08-29 14:38:59,024 - INFO - Partition data using stratify reduction approach -2024-08-29 14:38:59,706 - INFO - Partition data using stratify approach -2024-08-29 14:38:59,830 - INFO - Train model... -2024-08-29 14:39:34,999 - INFO - Evaluation... -2024-08-29 14:39:39,741 - INFO - MCC_original: 0.5390664612315749 -2024-08-29 14:39:39,746 - INFO - MCC_calibrated: 0.8538463534547782 -2024-08-29 14:39:39,749 - INFO - MCC_certainty: 0.8769578293603982 -2024-08-29 14:39:40,338 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:39:40,349 - INFO - F1_calibrated_novelty: 0.8911111111111111 -2024-08-29 14:39:40,349 - INFO - Threshold: 0.0929624384328507 -2024-08-29 14:39:40,352 - INFO - Results for 0.55: {'Threshold': 0.0929624384328507, 'MCC_original': 0.5390664612315749, 'MCC_calibrated': 0.8538463534547782, 'MCC_certainty': 0.8769578293603982, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.8911111111111111} -2024-08-29 14:39:40,352 - INFO - DROP CLASS RATIO 0.6000000000000001 -2024-08-29 14:39:46,064 - INFO - Partition data using stratify reduction approach -2024-08-29 14:39:46,699 - INFO - Partition data using stratify approach -2024-08-29 14:39:46,799 - INFO - Train model... -2024-08-29 14:40:15,861 - INFO - Evaluation... -2024-08-29 14:40:20,150 - INFO - MCC_original: 0.5013531468322641 -2024-08-29 14:40:20,155 - INFO - MCC_calibrated: 0.7583348126874758 -2024-08-29 14:40:20,159 - INFO - MCC_certainty: 0.7226312529723593 -2024-08-29 14:40:20,676 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:40:20,685 - INFO - F1_calibrated_novelty: 0.699396551724138 -2024-08-29 14:40:20,685 - INFO - Threshold: 0.03856744708582016 -2024-08-29 14:40:20,687 - INFO - Results for 0.6000000000000001: {'Threshold': 0.03856744708582016, 'MCC_original': 0.5013531468322641, 'MCC_calibrated': 0.7583348126874758, 'MCC_certainty': 0.7226312529723593, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.699396551724138} -2024-08-29 14:40:20,687 - INFO - DROP CLASS RATIO 0.6500000000000001 -2024-08-29 14:40:26,698 - INFO - Partition data using stratify reduction approach -2024-08-29 14:40:27,371 - INFO - Partition data using stratify approach -2024-08-29 14:40:27,457 - INFO - Train model... -2024-08-29 14:40:50,980 - INFO - Evaluation... -2024-08-29 14:40:54,772 - INFO - MCC_original: 0.45540286866749574 -2024-08-29 14:40:54,776 - INFO - MCC_calibrated: 0.7346419874807618 -2024-08-29 14:40:54,780 - INFO - MCC_certainty: 0.6814274520916938 -2024-08-29 14:40:55,487 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:40:55,498 - INFO - F1_calibrated_novelty: 0.7076953125 -2024-08-29 14:40:55,498 - INFO - Threshold: 0.029971820071201465 -2024-08-29 14:40:55,501 - INFO - Results for 0.6500000000000001: {'Threshold': 0.029971820071201465, 'MCC_original': 0.45540286866749574, 'MCC_calibrated': 0.7346419874807618, 'MCC_certainty': 0.6814274520916938, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.7076953125} -2024-08-29 14:40:55,501 - INFO - DROP CLASS RATIO 0.7000000000000001 -2024-08-29 14:41:02,732 - INFO - Partition data using stratify reduction approach -2024-08-29 14:41:03,380 - INFO - Partition data using stratify approach -2024-08-29 14:41:03,453 - INFO - Train model... -2024-08-29 14:41:22,832 - INFO - Evaluation... -2024-08-29 14:41:26,295 - INFO - MCC_original: 0.4226766551661818 -2024-08-29 14:41:26,299 - INFO - MCC_calibrated: 0.6609208822294271 -2024-08-29 14:41:26,303 - INFO - MCC_certainty: 0.5984655539211732 -2024-08-29 14:41:26,797 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:41:26,806 - INFO - F1_calibrated_novelty: 0.6210661764705883 -2024-08-29 14:41:26,806 - INFO - Threshold: 0.029825218447173618 -2024-08-29 14:41:26,809 - INFO - Results for 0.7000000000000001: {'Threshold': 0.029825218447173618, 'MCC_original': 0.4226766551661818, 'MCC_calibrated': 0.6609208822294271, 'MCC_certainty': 0.5984655539211732, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.6210661764705883} -2024-08-29 14:41:26,809 - INFO - DROP CLASS RATIO 0.7500000000000001 -2024-08-29 14:41:32,542 - INFO - Partition data using stratify reduction approach -2024-08-29 14:41:33,316 - INFO - Partition data using stratify approach -2024-08-29 14:41:33,376 - INFO - Train model... -2024-08-29 14:41:46,703 - INFO - Evaluation... -2024-08-29 14:41:49,209 - INFO - MCC_original: 0.37398694225532275 -2024-08-29 14:41:49,213 - INFO - MCC_calibrated: 0.6422304092338768 -2024-08-29 14:41:49,216 - INFO - MCC_certainty: 0.5665146792125259 -2024-08-29 14:41:49,660 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:41:49,673 - INFO - F1_calibrated_novelty: 0.6697297297297298 -2024-08-29 14:41:49,673 - INFO - Threshold: 0.030187330709244866 -2024-08-29 14:41:49,676 - INFO - Results for 0.7500000000000001: {'Threshold': 0.030187330709244866, 'MCC_original': 0.37398694225532275, 'MCC_calibrated': 0.6422304092338768, 'MCC_certainty': 0.5665146792125259, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.6697297297297298} -2024-08-29 14:41:49,676 - INFO - DROP CLASS RATIO 0.8 -2024-08-29 14:41:55,587 - INFO - Partition data using stratify reduction approach -2024-08-29 14:41:56,308 - INFO - Partition data using stratify approach -2024-08-29 14:41:56,353 - INFO - Train model... -2024-08-29 14:42:06,499 - INFO - Evaluation... -2024-08-29 14:42:08,347 - INFO - MCC_original: 0.34134030614635164 -2024-08-29 14:42:08,352 - INFO - MCC_calibrated: 0.6335710178374708 -2024-08-29 14:42:08,354 - INFO - MCC_certainty: 0.5446817015713392 -2024-08-29 14:42:08,790 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:42:08,803 - INFO - F1_calibrated_novelty: 0.7000320512820513 -2024-08-29 14:42:08,803 - INFO - Threshold: 0.01570324414720139 -2024-08-29 14:42:08,806 - INFO - Results for 0.8: {'Threshold': 0.01570324414720139, 'MCC_original': 0.34134030614635164, 'MCC_calibrated': 0.6335710178374708, 'MCC_certainty': 0.5446817015713392, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.7000320512820513} -2024-08-29 14:42:08,807 - INFO - DROP CLASS RATIO 0.8500000000000001 -2024-08-29 14:42:14,999 - INFO - Partition data using stratify reduction approach -2024-08-29 14:42:15,934 - INFO - Partition data using stratify approach -2024-08-29 14:42:15,978 - INFO - Train model... -2024-08-29 14:42:21,353 - INFO - Evaluation... -2024-08-29 14:42:23,172 - INFO - MCC_original: 0.2851919858996399 -2024-08-29 14:42:23,177 - INFO - MCC_calibrated: 0.5456096886692423 -2024-08-29 14:42:23,180 - INFO - MCC_certainty: 0.45084627680370104 -2024-08-29 14:42:23,653 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:42:23,666 - INFO - F1_calibrated_novelty: 0.6676785714285715 -2024-08-29 14:42:23,667 - INFO - Threshold: 0.012415345505190353 -2024-08-29 14:42:23,672 - INFO - Results for 0.8500000000000001: {'Threshold': 0.012415345505190353, 'MCC_original': 0.2851919858996399, 'MCC_calibrated': 0.5456096886692423, 'MCC_certainty': 0.45084627680370104, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.6676785714285715} -2024-08-29 14:42:23,672 - INFO - DROP CLASS RATIO 0.9000000000000001 -2024-08-29 14:42:31,442 - INFO - Partition data using stratify reduction approach -2024-08-29 14:42:32,315 - INFO - Partition data using stratify approach -2024-08-29 14:42:32,342 - INFO - Train model... -2024-08-29 14:42:35,571 - INFO - Evaluation... -2024-08-29 14:42:36,804 - INFO - MCC_original: 0.24675303811642582 -2024-08-29 14:42:36,808 - INFO - MCC_calibrated: 0.3468445855034963 -2024-08-29 14:42:36,810 - INFO - MCC_certainty: 0.3009502290579004 -2024-08-29 14:42:37,146 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:42:37,163 - INFO - F1_calibrated_novelty: 0.37207386363636363 -2024-08-29 14:42:37,163 - INFO - Threshold: 0.00573431158852248 -2024-08-29 14:42:37,167 - INFO - Results for 0.9000000000000001: {'Threshold': 0.00573431158852248, 'MCC_original': 0.24675303811642582, 'MCC_calibrated': 0.3468445855034963, 'MCC_certainty': 0.3009502290579004, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.37207386363636363} -2024-08-29 14:42:37,167 - INFO - DROP CLASS RATIO 0.9500000000000001 -2024-08-29 14:42:44,245 - INFO - Partition data using stratify reduction approach -2024-08-29 14:42:44,915 - INFO - Partition data using stratify approach -2024-08-29 14:42:44,930 - INFO - Train model... -2024-08-29 14:42:46,016 - INFO - Evaluation... -2024-08-29 14:42:46,798 - INFO - MCC_original: 0.15622999394541084 -2024-08-29 14:42:46,803 - INFO - MCC_calibrated: 0.18264626962073724 -2024-08-29 14:42:46,806 - INFO - MCC_certainty: 0.16756922661407805 -2024-08-29 14:42:47,113 - INFO - F1_original_novelty: 0.0 -2024-08-29 14:42:47,132 - INFO - F1_calibrated_novelty: 0.1426595744680851 -2024-08-29 14:42:47,132 - INFO - Threshold: 0.006971721384260388 -2024-08-29 14:42:47,139 - INFO - Results for 0.9500000000000001: {'Threshold': 0.006971721384260388, 'MCC_original': 0.15622999394541084, 'MCC_calibrated': 0.18264626962073724, 'MCC_certainty': 0.16756922661407805, 'F1_original_novelty': 0.0, 'F1_calibrated_novelty': 0.1426595744680851} diff --git a/Data/CP/xgb.model b/Data/CP/xgb.model deleted file mode 100644 index 9a101c2..0000000 Binary files a/Data/CP/xgb.model and /dev/null differ diff --git a/Data/Split/split_benchmark.pdf b/Data/Split/split_benchmark.pdf deleted file mode 100644 index cbd2a87..0000000 Binary files a/Data/Split/split_benchmark.pdf and /dev/null differ diff --git a/Data/Split/split_benchmark.png b/Data/Split/split_benchmark.png deleted file mode 100644 index d5d31b1..0000000 Binary files a/Data/Split/split_benchmark.png and /dev/null differ diff --git a/Data/data_embedding.npz b/Data/data_embedding.npz new file mode 100644 index 0000000..5d3c5fe Binary files /dev/null and b/Data/data_embedding.npz differ diff --git a/Test/Uncertainty/__init__.py b/Test/Uncertainty/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/Test/Uncertainty/test_conformal_predictor.py b/Test/Uncertainty/test_conformal_predictor.py deleted file mode 100644 index afb7a5a..0000000 --- a/Test/Uncertainty/test_conformal_predictor.py +++ /dev/null @@ -1,78 +0,0 @@ -import unittest -from sklearn.datasets import make_classification -from sklearn.model_selection import train_test_split -from sklearn.ensemble import RandomForestClassifier -from synutils.Uncertainty.conformal_predictor import ConformalPredictor - - -class TestConformalPredictor(unittest.TestCase): - def setUp(self): - # Creating a dataset - X, y = make_classification( - n_samples=1000, - n_features=20, - n_informative=12, - n_redundant=4, - n_repeated=4, - n_classes=7, - n_clusters_per_class=2, - random_state=42, - ) - self.X_train, self.X_test, self.y_train, self.y_test = train_test_split( - X, y, test_size=0.2, random_state=42, stratify=y - ) - self.model = RandomForestClassifier(random_state=42) - self.model.fit(self.X_train, self.y_train) - self.predictor = ConformalPredictor(self.model, 0.05) - - def test_initialization(self): - self.assertIsInstance(self.predictor, ConformalPredictor) - self.assertEqual(self.predictor.alpha, 0.05) - - def test_probability_predictions(self): - probas = ConformalPredictor.get_proba(self.model, self.X_test) - self.assertEqual(probas.shape, (self.X_test.shape[0], 7)) # Assuming 7 classes - - def test_nonconformity_scores(self): - probas = ConformalPredictor.get_proba(self.model, self.X_train) - scores = ConformalPredictor.get_nonconformity_score(probas, self.y_train) - self.assertEqual(scores.shape, self.y_train.shape) - - def test_threshold_calculation(self): - probas = ConformalPredictor.get_proba(self.model, self.X_train) - scores = ConformalPredictor.get_nonconformity_score(probas, self.y_train) - threshold = ConformalPredictor.get_threshold(scores, self.predictor.alpha) - self.assertIsInstance(threshold, float) - - def test_p_value_calculation(self): - probas = ConformalPredictor.get_proba(self.model, self.X_test) - probas_calib = ConformalPredictor.get_proba(self.model, self.X_train) - scores = ConformalPredictor.get_nonconformity_score(probas_calib, self.y_train) - p_values = ConformalPredictor.calculate_p_values(probas, scores) - self.assertEqual( - p_values.shape, (self.X_test.shape[0], 7) - ) # Assuming 7 classes - - def test_prediction_labels(self): - class_labels = [str(i) for i in range(7)] - probas = ConformalPredictor.get_proba(self.model, self.X_test) - probas_calib = ConformalPredictor.get_proba(self.model, self.X_train) - scores = ConformalPredictor.get_nonconformity_score(probas_calib, self.y_train) - threshold = ConformalPredictor.get_threshold(scores, self.predictor.alpha) - p_values = ConformalPredictor.calculate_p_values(probas, scores) - prediction_sets = ConformalPredictor.get_prediction_labels( - p_values, threshold, class_labels - ) - self.assertEqual(len(prediction_sets), self.X_test.shape[0]) - - def test_calibrated_prediction(self): - class_labels = list(range(7)) - prediction_sets = [set([0, 1]), set([1]), set([2, 3])] - calibrated_labels = ConformalPredictor.get_calibrated_prediction( - prediction_sets, class_labels - ) - self.assertEqual(calibrated_labels.shape, (3,)) - - -if __name__ == "__main__": - unittest.main() diff --git a/synutils/Analysis/__init__.py b/synutils/Analysis/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/synutils/Analysis/metrics.py b/synutils/Analysis/metrics.py deleted file mode 100644 index 62c8a20..0000000 --- a/synutils/Analysis/metrics.py +++ /dev/null @@ -1,125 +0,0 @@ -from sklearn.metrics import ( - matthews_corrcoef, - average_precision_score, - roc_auc_score, - accuracy_score, - f1_score, - precision_score, - recall_score, -) -from tqdm import tqdm -import numpy as np - - -class Metrics: - def __init__(self, X_train, y_train, X_test, y_test, models, verbose=True): - """ - Initializes the Metrics class. - - Parameters: - - X_train (pd.DataFrame or np.array): Training data features. - - y_train (pd.Series or np.array): Training data labels. - - X_test (pd.DataFrame or np.array): Testing data features. - - y_test (pd.Series or np.array): Testing data labels. - - models (dict): Dictionary of models to evaluate. Keys are model names, - values are the model objects. - - verbose (bool): If True, shows progress bar using tqdm. - """ - self.X_train = X_train - self.y_train = y_train - self.X_test = X_test - self.y_test = y_test - self.models = models - self.verbose = verbose - self.metric_dict = {} - - def evaluate_model(self, model, name): - """ - Evaluates a single model and computes multiple metrics. - - Parameters: - - model: The model to evaluate. - - name: The name of the model. - - Returns: - - A dictionary with metrics for the model. - """ - try: - # Fit the model - model.fit(self.X_train, self.y_train) - - # Predictions and probability estimates - y_pred = model.predict(self.X_test) - mcc = matthews_corrcoef(self.y_test, y_pred) - accuracy = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average="weighted") - precision = precision_score(self.y_test, y_pred, average="weighted") - recall = recall_score(self.y_test, y_pred, average="weighted") - - # Default values for AP and AUC in case predict_proba is not available - ap, auc = np.nan, np.nan - - # If the model has predict_proba, compute AP and AUC - if hasattr(model, "predict_proba"): - y_proba = model.predict_proba(self.X_test) - num_classes = len(np.unique(self.y_test)) - pred_classes = y_proba.shape[1] - - if num_classes == pred_classes: - ap = average_precision_score( - self.y_test, y_proba, average="weighted" - ) - auc = roc_auc_score( - self.y_test, y_proba, multi_class="ovo", average="weighted" - ) - else: - # Handle case where predicted classes don't match true classes - y_proba_pseudo = np.concatenate( - [y_proba, np.zeros(y_proba.shape[0]).reshape(-1, 1)], axis=1 - ) - ap = average_precision_score( - self.y_test, y_proba_pseudo, average="weighted" - ) - auc = roc_auc_score( - self.y_test, - y_proba_pseudo, - multi_class="ovo", - average="weighted", - ) - - return { - "MCC": mcc, - "Accuracy": accuracy, - "F1-Score": f1, - "Precision": precision, - "Recall": recall, - "Average_Precision": ap, - "ROC-AUC": auc, - } - except Exception as e: - print(f"Error evaluating model {name}: {str(e)}") - return None - - def cal_metric_df(self): - """ - Calculate metrics for all models and return a DataFrame containing the results. - - Returns: - - A pandas DataFrame with models as rows and metrics as columns. - """ - for name, model in tqdm(self.models.items(), disable=not self.verbose): - metrics = self.evaluate_model(model, name) - if metrics: - self.metric_dict[name] = metrics - - return self.metric_dict - - def fit(self): - """ - Fit all models, compute metrics, and return the metrics DataFrame. - - Returns: - - A pandas DataFrame containing the metrics for each model. - """ - _metrics = self.cal_metric_df() - return _metrics diff --git a/synutils/Analysis/split_benchmark.py b/synutils/Analysis/split_benchmark.py deleted file mode 100644 index f1defbc..0000000 --- a/synutils/Analysis/split_benchmark.py +++ /dev/null @@ -1,171 +0,0 @@ -from typing import Any, Dict, List, Optional -from joblib import Parallel, delayed -from synutils.Partition.data_partition import DataPartition -from sklearn.neighbors import KNeighborsClassifier -from sklearn.neural_network import MLPClassifier -from sklearn.linear_model import LogisticRegression -from xgboost import XGBClassifier -from synutils.Analysis.metrics import Metrics -import numpy as np - - -class SplitBenchmark: - """Class to benchmark different data partitioning methods using various classifiers. - - Attributes: - - data (Any): The dataset to be partitioned and used for model training. - - test_size (float): The proportion of the dataset to include in the test split. - - class_column (str): The name of the column to be used as the target for models. - - random_state (int): Random state for reproducibility of the splits. - - drop_class_ratio (float): The ratio of the class to drop during partitioning. - """ - - def __init__( - self, - data: Any, - test_size: float, - class_columns: str, - random_state: int, - drop_class_ratio: float, - ) -> None: - self.data = data - self.test_size = test_size - self.class_column = class_columns - self.random_state = random_state - self.drop_class_ratio = drop_class_ratio - - @staticmethod - def seed_generator( - number_of_seed: int, random_state: Optional[int] = None - ) -> List[int]: - """Generates a list of random seeds using NumPy. - - Parameters: - - number_of_seed (int): The number of seeds to generate. - - random_state (Optional[int]): The random state to ensure reproducibility. - - Returns: - - List[int]: A list of generated seeds. - """ - rng = np.random.default_rng(seed=random_state) - seeds = rng.integers(low=0, high=2**32 - 1, size=number_of_seed).tolist() - return seeds - - @staticmethod - def setup_models(random_state: int) -> Dict[str, Any]: - """Sets up a dictionary of different ML models configured with a common random state. - - Parametrs: - - random_state (int): The random state for model reproducibility. - - Returns: - - Dict[str, Any]: A dictionary of initialized models. - """ - return { - "kNN": KNeighborsClassifier(n_jobs=1), - "MLP": MLPClassifier(random_state=random_state), - "XGB": XGBClassifier( - random_state=random_state, objective="multi:softmax", n_jobs=1 - ), - "Logistic_regression": LogisticRegression( - random_state=random_state, n_jobs=1 - ), - } - - @staticmethod - def process_method( - seed: int, - data: Any, - test_size: float, - class_column: str, - method: str, - random_state: int, - drop_class_ratio: float, - ) -> Dict[str, Any]: - """Processes the data partitioning and model evaluation for a given method and seed. - - Parameters: - - seed (int): Random seed for partitioning reproducibility. - - data (Any): The dataset to be used. - - test_size (float): Proportion of the dataset to include in the test split. - - class_column (str): Target column name. - - method (str): Partitioning method name. - - random_state (int): Random state for model reproducibility. - - drop_class_ratio (float): Ratio of the class to be dropped in partitioning. - - Returns: - Dict[str, Any]: A dictionary containing the performance metrics of each model. - """ - models = SplitBenchmark.setup_models(random_state) - partitioner = DataPartition( - data, - test_size, - class_column, - method, - seed, - drop_class_ratio, - keep_data=False, - ) - data_train, data_test = partitioner.fit() - if method == "stratified_class_reduction": - X_train, y_train = ( - data_train.drop([class_column, "class_mapping"], axis=1), - data_train[class_column], - ) - X_test, y_test = ( - data_test.drop([class_column, "class_mapping"], axis=1), - data_test[class_column], - ) - else: - X_train, y_train = ( - data_train.drop([class_column], axis=1), - data_train[class_column], - ) - X_test, y_test = ( - data_test.drop([class_column], axis=1), - data_test[class_column], - ) - - return Metrics(X_train, y_train, X_test, y_test, models).fit() - - def fit( - self, - number_sample: int = 5, - n_jobs: int = 3, - verbose: int = 0, - methods: List[str] = [ - "random", - "stratified_target", - "stratified_class_reduction", - ], - ) -> Dict[str, List[Dict[str, Any]]]: - """Executes the benchmarking across different partition methods. - - Parameters: - - number_sample (int): Number of samples to generate seeds for. - - n_jobs (int): Number of parallel jobs to run. - - verbose (int): Verbosity level. - - methods (List[str]): List of partition methods to evaluate. - - Returns: - Dict[str, List[Dict[str, Any]]]: A dictionary with method names as keys and lists of results as values. - """ - seed_list = self.seed_generator(number_sample, self.random_state) - results = {} - - for method in methods: - method_results = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(self.process_method)( - seed, - self.data, - self.test_size, - self.class_column, - method, - self.random_state, - self.drop_class_ratio, - ) - for seed in seed_list - ) - results[method] = method_results - - return results diff --git a/synutils/Uncertainty/__init_.py b/synutils/Uncertainty/__init_.py deleted file mode 100644 index e69de29..0000000 diff --git a/synutils/Uncertainty/conformal_predictor.py b/synutils/Uncertainty/conformal_predictor.py deleted file mode 100644 index 5f68cc3..0000000 --- a/synutils/Uncertainty/conformal_predictor.py +++ /dev/null @@ -1,237 +0,0 @@ -from typing import Any, List, Set, Tuple, Union -import numpy as np - - -class ConformalPredictor: - def __init__(self, model: Any, alpha: float): - """ - Initialize the ConformalPredictor with a prediction model and a - significance level. - - Parameters: - - model (Any): A machine learning model that supports probability prediction. - - alpha (float): The significance level used to compute the threshold - for p-values. - """ - self.model = model - self.alpha = alpha - - @staticmethod - def get_proba(model: Any, X: np.ndarray) -> np.ndarray: - """ - Predict probability estimates for each class for each sample in X. - - Parameters: - - model (Any): The prediction model. - - X (np.ndarray): The input features for which to predict probabilities. - - Returns: - - np.ndarray: The predicted probabilities. - """ - return model.predict_proba(X) - - @staticmethod - def get_nonconformity_score( - probas_calib: np.ndarray, y_cal: np.ndarray - ) -> np.ndarray: - """ - Calculate the nonconformity scores using the calibration data. - - Parameters: - - probas_calib (np.ndarray): Probabilities obtained from the calibration dataset. - - y_cal (np.ndarray): Actual labels of the calibration dataset. - - Returns: - - np.ndarray: Nonconformity scores for the calibration dataset. - """ - return 1 - probas_calib[np.arange(len(y_cal)), y_cal] - - @staticmethod - def get_threshold(nonconformity_scores: np.ndarray, alpha: float) -> float: - """ - Compute the threshold value for determining significant p-values based on - the alpha level, with an adjustment for the finite sample size. - - Parameters: - - nonconformity_scores (np.ndarray): Calculated nonconformity scores from the - calibration data. - - alpha (float): Significance level to determine the threshold. - - Returns: - - float: The calculated threshold with a sample size adjustment. - """ - number_of_samples = len(nonconformity_scores) - qlevel = (1 - alpha) * ((number_of_samples + 1) / number_of_samples) - return np.percentile(nonconformity_scores, qlevel * 100) - - @staticmethod - def calculate_p_values( - probs_test: np.ndarray, nonconformity_scores: np.ndarray - ) -> np.ndarray: - """ - Calculate the p-values for the test dataset based on the nonconformity scores. - - Parameters: - - probs_test (np.ndarray): Probability estimates from the test data. - - nonconformity_scores (np.ndarray): Nonconformity scores from the - calibration data. - - Returns: - - np.ndarray: The calculated p-values for each test instance. - """ - margins = 1 - probs_test - nonconformity_scores = nonconformity_scores.reshape(1, -1) - comparisons = margins[:, :, np.newaxis] <= nonconformity_scores - p_values = comparisons.sum(axis=2) + 1 - p_values = p_values.astype(float) # Ensure the array is of float type - p_values /= len(nonconformity_scores[0]) + 1 # Safe division - return p_values - - @staticmethod - def get_prediction_labels( - p_values: np.ndarray, threshold: float, class_labels: List[str] - ) -> List[Set[str]]: - """ - Generate prediction sets based on p-values and class labels, where predictions - meet or exceed the threshold. - - Parameters: - - p_values (np.ndarray): The p-values for each class of each test instance. - - threshold (float): The threshold value to determine significant predictions. - - class_labels (List[str]): The list of class labels. - - Returns: - - List[Set[str]]: Sets of predicted labels for each test instance. - """ - class_labels = np.array(class_labels) - prediction_sets = p_values >= threshold - prediction_set_labels = [ - set(class_labels[prediction_set]) for prediction_set in prediction_sets - ] - return prediction_set_labels - - @staticmethod - def get_calibrated_prediction( - prediction_set_labels: list[set], class_labels: list, novelty: bool = False - ) -> np.ndarray: - """ - Transform prediction sets into calibrated labels using numpy vectorization for efficiency. - - Parameters: - - prediction_set_labels (list[set]): A list of sets, each containing predicted class labels. - - class_labels (list): A list of all possible class labels. - - Returns: - - np.ndarray: An array of calibrated labels, with a specific label for uncertainty. - """ - # Determine the uncertainty class as one more than the maximum class label - if novelty: - uncertain_class = 1 - else: - uncertain_class = max(class_labels) + 1 - - # Initialize the output array with the uncertain class by default - y_calibrated = np.full(len(prediction_set_labels), uncertain_class, dtype=int) - - # Iterate over prediction sets and assign calibrated labels - for idx, values in enumerate(prediction_set_labels): - if len(values) == 1: - if novelty: - y_calibrated[idx] = 0 - else: - y_calibrated[idx] = next(iter(values)) - - return y_calibrated - - def cal_non_cp(self, X: np.ndarray, y: np.ndarray) -> np.ndarray: - """ - Calculate the nonconformity scores for given input features and labels. - - Parameters: - - X (np.ndarray): Input features. - - y (np.ndarray): Corresponding labels. - - Returns: - - np.ndarray: Nonconformity scores calculated using the model's probability - predictions. - """ - probas = self.get_proba(self.model, X) - return self.get_nonconformity_score(probas, y) - - def fit( - self, - X: np.ndarray, - class_labels: List[str], - nonconformity_scores: np.ndarray, - novelty: bool = False, - ) -> Tuple[List[Set[str]], np.ndarray]: - """ - Fit the conformal prediction model and generate prediction labels. - - Parameters: - - X (np.ndarray): Test dataset features. - - class_labels (List[str]): List of all class labels. - - nonconformity_scores (np.ndarray): Pre-computed nonconformity scores from the - calibration data. - - Returns: - - Tuple[List[Set[str]], np.ndarray]: A tuple containing sets of predicted labels - for each instance and the used nonconformity scores. - """ - probas = self.get_proba(self.model, X) - threshold = self.get_threshold(nonconformity_scores, self.alpha) - p_values = self.calculate_p_values(probas, nonconformity_scores) - prediction_set_labels = self.get_prediction_labels( - p_values, threshold, class_labels - ) - y_calibrated = self.get_calibrated_prediction( - prediction_set_labels, class_labels, novelty - ) - return prediction_set_labels, y_calibrated, threshold - - @staticmethod - def get_certain_results( - y: np.ndarray, y_calibrated: np.ndarray, uncertain_class: Union[int, float, str] - ) -> Tuple[np.ndarray, np.ndarray]: - """ - Extracts and returns results from the given arrays where the calibrated results - are certain. - - Parameters: - ----------- - - y (np.ndarray): An array of original labels or predictions. - - y_calibrated (np.ndarray): An array of calibrated labels or predictions - corresponding to `y`. - - uncertain_class (Union[int, float, str]): The value representing uncertainty - in the `y_calibrated` array. Any entry in `y_calibrated` that matches this value - will be excluded from the results. - - Returns: - -------- - Tuple[np.ndarray, np.ndarray] - A tuple containing: - - `certain_y`: A numpy array of original labels or predictions from `y` that are - certain. - - `certain_y_calibrated`: A numpy array of calibrated labels or predictions - from `y_calibrated` that are certain. - - Raises: - ------- - TypeError - If `y` or `y_calibrated` are not numpy arrays. - - ValueError - If `y` and `y_calibrated` do not have the same shape. - """ - - if not isinstance(y, np.ndarray) or not isinstance(y_calibrated, np.ndarray): - raise TypeError("Both `y` and `y_calibrated` must be numpy arrays.") - - if y.shape != y_calibrated.shape: - raise ValueError("`y` and `y_calibrated` must have the same shape.") - - # Boolean mask where y_calibrated does not equal the uncertain_class - certain_indices = y_calibrated != uncertain_class - - # Return the filtered results - return y[certain_indices], y_calibrated[certain_indices] diff --git a/synutils/Uncertainty/peformance.py b/synutils/Uncertainty/peformance.py deleted file mode 100644 index 6991178..0000000 --- a/synutils/Uncertainty/peformance.py +++ /dev/null @@ -1,106 +0,0 @@ -import numpy as np -from typing import Dict, Any -from sklearn.metrics import matthews_corrcoef, f1_score -from synutils.Uncertainty.conformal_predictor import ConformalPredictor -from synutils.utils import setup_logging - -logger = setup_logging() - - -def cp_evaluate( - model: Any, - data_test: Any, - data_cal: Any, - class_column: str, - class_mapping_column: str, - remove_train: Any, - alpha: float = 0.05, -) -> Dict[str, float]: - """ - Evaluate a conformal predictor model using training, test, calibration, and novelty datasets. - - Parameters: - ----------- - model : Any - The underlying predictive model used by the conformal predictor. - - data_test : Any - Test dataset including features and labels. - - data_cal : Any - Calibration dataset including features and labels. - - remove_train : Any - Dataset used for novelty detection, including features and labels. - - alpha : float, optional, default=0.05 - The significance level for the conformal predictor. - - Returns: - -------- - Dict[str, float] - A dictionary containing various evaluation metrics, including: - - 'MCC_original': Matthews correlation coefficient for original predictions. - - 'MCC_calibrated': Matthews correlation coefficient for calibrated predictions. - - 'MCC_certainty': Matthews correlation coefficient for predictions classified with certainty. - - 'F1_original_novelty': F1 score for novelty detection (original predictions). - - 'F1_calibrated_novelty': F1 score for novelty detection (calibrated predictions). - """ - - X_test = data_test.drop([class_column, class_mapping_column], axis=1).values - y_test = data_test[class_mapping_column].values - - X_cal = data_cal.drop([class_column, class_mapping_column], axis=1).values - y_cal = data_cal[class_mapping_column].values - - X_novelty = remove_train.drop([class_column, class_mapping_column], axis=1).values - y_novelty = remove_train[class_mapping_column].values - - class_labels = list(set(data_cal[class_mapping_column].unique())) - - # Initialize the conformal predictor - cp = ConformalPredictor(model, alpha) - - # Calculate nonconformity scores using the calibration dataset - nonconformity_scores = cp.cal_non_cp(X_cal, y_cal) - - # Fit the model to the test set and get calibrated predictions - _, y_calibrated_test, _ = cp.fit(X_test, class_labels, nonconformity_scores) - - # Extract certain predictions - y_certain, y_pre_certain = cp.get_certain_results( - y_test, y_calibrated_test, uncertain_class=np.max(y_calibrated_test) - ) - - # Fit the model to the novelty set and get calibrated predictions - _, y_calibrated_novelty, threshold = cp.fit( - X_novelty, class_labels, nonconformity_scores, novelty=True - ) - - # Calculate metrics - mcc_original = matthews_corrcoef(y_test, model.predict(X_test)) - logger.info(f"MCC_original: {mcc_original}") - - mcc_calibrated = matthews_corrcoef(y_test, y_calibrated_test) - logger.info(f"MCC_calibrated: {mcc_calibrated}") - - mcc_certainty = matthews_corrcoef(y_certain, y_pre_certain) - logger.info(f"MCC_certainty: {mcc_certainty}") - - f1_original_novelty = f1_score(y_novelty, model.predict(X_novelty), average="micro") - logger.info(f"F1_original_novelty: {f1_original_novelty}") - - f1_calibrated_novelty = f1_score( - np.ones(y_novelty.shape[0]), y_calibrated_novelty, average="micro" - ) - logger.info(f"F1_calibrated_novelty: {f1_calibrated_novelty}") - logger.info(f"Threshold: {threshold}") - - return { - "Threshold": threshold, - "MCC_original": mcc_original, - "MCC_calibrated": mcc_calibrated, - "MCC_certainty": mcc_certainty, - "F1_original_novelty": f1_original_novelty, - "F1_calibrated_novelty": f1_calibrated_novelty, - } diff --git a/synutils/Visualization/chemical_space.py b/synutils/Visualization/chemical_space.py new file mode 100644 index 0000000..14a0808 --- /dev/null +++ b/synutils/Visualization/chemical_space.py @@ -0,0 +1,83 @@ +import pandas as pd +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches + +plt.rc("text", usetex=True) # Enable LaTeX rendering +plt.rc("font", family="serif") # Optional: use serif font + + +def scatter_plot( + data_train, + data_test, + size_train=10, + size_test=10, + title=None, + ax=None, + xlabel="Coordinate 1", + ylabel="Coordinate 2", +): + # Check if data is empty + if data_train.empty or data_test.empty: + raise ValueError("Input data frames cannot be empty.") + + # Check for necessary columns + if data_train.columns.size < 3 or data_test.columns.size < 3: + raise ValueError("Data frames must have at least three columns.") + + # Adding 'Type' column to differentiate between train and test data + data_train["Type"] = "Train" + data_test["Type"] = "Test" + + # Combine the datasets + data_combined = pd.concat([data_train, data_test]) + + # If no axes object is passed, create one + if ax is None: + fig, ax = plt.subplots(figsize=(12, 8)) + + # Define a more distinct color palette + pastel_palette = { + "Train": "deepskyblue", + "Test": "magenta", + } # Using deepskyblue and magenta for better distinction + + # Create scatter plots with specified sizes + for dtype, color in pastel_palette.items(): + subset = data_combined[data_combined["Type"] == dtype] + ax.scatter( + subset[subset.columns[1]], + subset[subset.columns[2]], + color=color, + label=dtype, + s=size_train if dtype == "Train" else size_test, + alpha=0.1, + edgecolor="none", + ) + + # Set the title if provided + if title: + ax.set_title(rf"{title}", fontsize=24, fontweight="bold") + + # Set labels + ax.set_xlabel(xlabel, fontsize=18) + ax.set_ylabel(ylabel, fontsize=18) + + # Enhance grid and layout + ax.grid(True, which="both", linestyle="--", linewidth=0.5) + ax.set_axisbelow(True) + + # Get legend handles and labels for external usage + handles, labels = ax.get_legend_handles_labels() + + # Return the axes, handles, and labels for further customization outside the function + return ax, handles, labels + + +# Define a function that modifies the legend handles to full opacity for better visibility in the legend +def adjust_legend_handles(handles, colors): + new_handles = [] + for handle, color in zip(handles, colors): + # Create a new handle with the same properties but with full alpha for the legend + new_handle = mpatches.Patch(color=color, label=handle.get_label()) + new_handles.append(new_handle) + return new_handles