From e24c911b291f6fd84a5b3509789273fef197df2f Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Fri, 29 Nov 2024 11:49:00 +0100 Subject: [PATCH 01/25] First implementation of the token-safe retries --- .../WMCore/Storage/Backends/GFAL2Impl.py | 26 +++++++++-- src/python/WMCore/Storage/StageOutImpl.py | 45 +++++++++++++++---- 2 files changed, 60 insertions(+), 11 deletions(-) diff --git a/src/python/WMCore/Storage/Backends/GFAL2Impl.py b/src/python/WMCore/Storage/Backends/GFAL2Impl.py index 4b00d16591..aeb806faa2 100644 --- a/src/python/WMCore/Storage/Backends/GFAL2Impl.py +++ b/src/python/WMCore/Storage/Backends/GFAL2Impl.py @@ -24,7 +24,7 @@ def __init__(self, stagein=False): # Next commands after separation are executed without env -i and this leads us with # mixed environment with COMP and system python. # GFAL2 is not build under COMP environment and it had failures with mixed environment. - self.setups = "env -i X509_USER_PROXY=$X509_USER_PROXY JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" + self.setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" # Default initialization, it is tweaked in createStageOutCommand depending on the authentication method self.removeCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}') self.copyOpts = '-t 2400 -T 2400 -p -v --abort-on-failure {checksum} {options} {source} {destination}' self.copyCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-copy ' + self.copyOpts) @@ -113,7 +113,7 @@ def buildCopyCommandDict(self, sourcePFN, targetPFN, options=None, checksums=Non return copyCommandDict - def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=None): + def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=None, auth_method=None): """ Create gfal-cp command for stageOut @@ -121,8 +121,19 @@ def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=No :targetPFN: str, destination PFN :options: str, additional options for gfal-cp :checksums: dict, collect checksums according to the algorithms saved as keys + :auth_method: str, the authentication method to be used ("X509", "TOKEN", or None) """ + # Adjust self.setups based on the selected authentication method + if auth_method == "X509": + self.setups = "env -i X509_USER_PROXY=$X509_USER_PROXY JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" + elif auth_method == "TOKEN": + self.setups = "env -i BEARER_TOKEN=$(cat $BEARER_TOKEN_FILE) JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" + else: + logging.info("Warning! Running gfal without either a X509 certificate or a token!") + self.setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" + + # Construct the gfal-cp command copyCommandDict = self.buildCopyCommandDict(sourcePFN, targetPFN, options, checksums) copyCommand = self.copyCommand.format_map(copyCommandDict) result = "#!/bin/bash\n" + copyCommand @@ -141,7 +152,7 @@ def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=No return result - def createDebuggingCommand(self, sourcePFN, targetPFN, options=None, checksums=None): + def createDebuggingCommand(self, sourcePFN, targetPFN, options=None, checksums=None, auth_method=None): """ Debug a failed gfal-cp command for stageOut, without re-running it, providing information on the environment and the certifications @@ -150,8 +161,17 @@ def createDebuggingCommand(self, sourcePFN, targetPFN, options=None, checksums=N :targetPFN: str, destination PFN :options: str, additional options for gfal-cp :checksums: dict, collect checksums according to the algorithms saved as keys + :auth_method: str, the authentication method to be used ("X509", "TOKEN", or None) """ + # Adjust self.setups based on the selected authentication method + if auth_method == "X509": + self.setups = "env -i X509_USER_PROXY=$X509_USER_PROXY JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" + elif auth_method == "TOKEN": + self.setups = "env -i BEARER_TOKEN=$(cat $BEARER_TOKEN_FILE) JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" + else: + self.setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" + copyCommandDict = self.buildCopyCommandDict(sourcePFN, targetPFN, options, checksums) copyCommand = self.copyCommand.format_map(copyCommandDict) diff --git a/src/python/WMCore/Storage/StageOutImpl.py b/src/python/WMCore/Storage/StageOutImpl.py index 46573aec4b..22f837299e 100644 --- a/src/python/WMCore/Storage/StageOutImpl.py +++ b/src/python/WMCore/Storage/StageOutImpl.py @@ -178,10 +178,9 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): This operator does the actual stage out by invoking the overridden plugin methods of the derived object. - - """ - # // + + # // # // Generate the source PFN from the plain PFN if needed # // sourcePFN = self.createSourceName(protocol, inputPFN) @@ -189,7 +188,7 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): # destination may also need PFN changed # i.e. if we are staging in a file from an SE targetPFN = self.createTargetName(protocol, targetPFN) - # // + # // # // Create the output directory if implemented # // for retryCount in range(self.numRetries + 1): @@ -203,9 +202,10 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): msg += "Error details:\n{}\n".format(str(ex)) logging.error(msg) if retryCount == self.numRetries: - # // + # // # // last retry, propagate exception # // + logging.error("Maximum retries exhausted when trying to create the output directory") raise ex time.sleep(self.retryPause) @@ -213,20 +213,49 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): # // Create the command to be used. # // command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums) - # // + # // # // Run the command # // stageOutEx = None # variable to store the possible StageOutError for retryCount in range(self.numRetries + 1): try: - logging.info("Running the stage out...") + logging.info(f"Running the stage out (attempt {retryCount + 1})...") self.executeCommand(command) + logging.info("Stage-out succeeded with the current environment.") break + except StageOutError as ex: - msg = "Attempt {} to stage out failed.\n".format(retryCount) + msg = "Attempt {} to stage out failed with default setup.\n".format(retryCount) msg += "Error details:\n{}\n".format(str(ex)) logging.error(msg) + + logging.info("Retrying with authentication-safe logic...") + + # Authentication-safe fallback logic + if os.getenv("X509_USER_PROXY"): + logging.info("Retrying with X509_USER_PROXY after unsetting BEARER_TOKEN...") + os.system("unset BEARER_TOKEN; unset BEARER_TOKEN_FILE") + command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums, "X509") + try: + self.executeCommand(command) + logging.info("Stage-out succeeded with X509 after unsetting BEARER_TOKEN.") + return + except StageOutError as fallbackEx: + logging.warning(f"Fallback with X509_USER_PROXY failed: \n{fallbackEx}") + + if os.getenv("BEARER_TOKEN") or os.getenv("BEARER_TOKEN_FILE"): + logging.info("Retrying with BEARER_TOKEN after unsetting X509_USER_PROXY...") + os.system("unset X509_USER_PROXY") + command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums, "TOKEN") + try: + self.executeCommand(command) + logging.info("Stage-out succeeded with TOKEN after unsetting X509_USER_PROXY.") + return + except StageOutError as fallbackEx: + logging.warning(f"Fallback with BEARER_TOKEN failed: \n{fallbackEx}") + + if retryCount == self.numRetries: # Last retry, propagate the information outside of the for loop stageOutEx = ex From 1fbbe461e9334d28ddcb55ef6fb0954cadde4285 Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Fri, 29 Nov 2024 16:07:33 +0100 Subject: [PATCH 02/25] Small fixes --- src/python/WMCore/Storage/StageOutImpl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python/WMCore/Storage/StageOutImpl.py b/src/python/WMCore/Storage/StageOutImpl.py index 22f837299e..c3edd0da71 100644 --- a/src/python/WMCore/Storage/StageOutImpl.py +++ b/src/python/WMCore/Storage/StageOutImpl.py @@ -236,7 +236,7 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): if os.getenv("X509_USER_PROXY"): logging.info("Retrying with X509_USER_PROXY after unsetting BEARER_TOKEN...") os.system("unset BEARER_TOKEN; unset BEARER_TOKEN_FILE") - command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums, "X509") + command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums, auth_method="X509") try: self.executeCommand(command) logging.info("Stage-out succeeded with X509 after unsetting BEARER_TOKEN.") @@ -247,7 +247,7 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): if os.getenv("BEARER_TOKEN") or os.getenv("BEARER_TOKEN_FILE"): logging.info("Retrying with BEARER_TOKEN after unsetting X509_USER_PROXY...") os.system("unset X509_USER_PROXY") - command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums, "TOKEN") + command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums, auth_method="TOKEN") try: self.executeCommand(command) logging.info("Stage-out succeeded with TOKEN after unsetting X509_USER_PROXY.") @@ -265,6 +265,6 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): # This block will now always be executed after retries are exhausted if stageOutEx is not None: logging.error("Maximum number of retries exhausted. Further details on the failed command reported below.") - command = self.createDebuggingCommand(sourcePFN, targetPFN, options, checksums) + command = self.createDebuggingCommand(sourcePFN, targetPFN, options, checksums, auth_method="TOKEN") self.executeCommand(command) raise stageOutEx from None From 5b787318982a38967c7eda34d07e5470ed02a132 Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Mon, 2 Dec 2024 11:57:49 +0100 Subject: [PATCH 03/25] pylint fixes --- src/python/WMCore/Storage/StageOutImpl.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/python/WMCore/Storage/StageOutImpl.py b/src/python/WMCore/Storage/StageOutImpl.py index c3edd0da71..e720ebf915 100644 --- a/src/python/WMCore/Storage/StageOutImpl.py +++ b/src/python/WMCore/Storage/StageOutImpl.py @@ -132,7 +132,7 @@ def createOutputDirectory(self, targetPFN): If no directory is required, do not implement this method """ - def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=None): + def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=None, auth_method=None): """ _createStageOutCommand_ @@ -220,7 +220,7 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): stageOutEx = None # variable to store the possible StageOutError for retryCount in range(self.numRetries + 1): try: - logging.info(f"Running the stage out (attempt {retryCount + 1})...") + logging.info("Running the stage out (attempt %d)...", retryCount + 1) self.executeCommand(command) logging.info("Stage-out succeeded with the current environment.") break @@ -242,7 +242,7 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): logging.info("Stage-out succeeded with X509 after unsetting BEARER_TOKEN.") return except StageOutError as fallbackEx: - logging.warning(f"Fallback with X509_USER_PROXY failed: \n{fallbackEx}") + logging.warning("Fallback with X509_USER_PROXY failed:\n%s", str(fallbackEx)) if os.getenv("BEARER_TOKEN") or os.getenv("BEARER_TOKEN_FILE"): logging.info("Retrying with BEARER_TOKEN after unsetting X509_USER_PROXY...") @@ -253,9 +253,8 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): logging.info("Stage-out succeeded with TOKEN after unsetting X509_USER_PROXY.") return except StageOutError as fallbackEx: - logging.warning(f"Fallback with BEARER_TOKEN failed: \n{fallbackEx}") - - + logging.warning("Fallback with BEARER_TOKEN failed:\n%s", str(fallbackEx)) + if retryCount == self.numRetries: # Last retry, propagate the information outside of the for loop stageOutEx = ex From e752a34832f96224fb17f083f0d79424f61256bb Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Mon, 2 Dec 2024 12:26:50 +0100 Subject: [PATCH 04/25] pylint fixes --- src/python/WMCore/Storage/Backends/GFAL2Impl.py | 1 + src/python/WMCore/Storage/StageOutImpl.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/python/WMCore/Storage/Backends/GFAL2Impl.py b/src/python/WMCore/Storage/Backends/GFAL2Impl.py index aeb806faa2..d6b3ff4497 100644 --- a/src/python/WMCore/Storage/Backends/GFAL2Impl.py +++ b/src/python/WMCore/Storage/Backends/GFAL2Impl.py @@ -5,6 +5,7 @@ """ import argparse import os +import logging from WMCore.Storage.Registry import registerStageOutImpl from WMCore.Storage.StageOutImpl import StageOutImpl diff --git a/src/python/WMCore/Storage/StageOutImpl.py b/src/python/WMCore/Storage/StageOutImpl.py index e720ebf915..542af88bfe 100644 --- a/src/python/WMCore/Storage/StageOutImpl.py +++ b/src/python/WMCore/Storage/StageOutImpl.py @@ -142,7 +142,7 @@ def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=No """ raise NotImplementedError("StageOutImpl.createStageOutCommand") - def createDebuggingCommand(self, sourcePFN, targetPFN, options=None, checksums=None): + def createDebuggingCommand(self, sourcePFN, targetPFN, options=None, checksums=None, auth_method=None): """ Build a shell command that will report in the logs the details about failing stageOut commands @@ -254,7 +254,7 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): return except StageOutError as fallbackEx: logging.warning("Fallback with BEARER_TOKEN failed:\n%s", str(fallbackEx)) - + if retryCount == self.numRetries: # Last retry, propagate the information outside of the for loop stageOutEx = ex From 54856c92d47438fc554a20085e200c844a07a707 Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Mon, 2 Dec 2024 13:25:09 +0100 Subject: [PATCH 05/25] aligning unit tests --- test/python/WMCore_t/Storage_t/Backends_t/GFAL2Impl_t.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/python/WMCore_t/Storage_t/Backends_t/GFAL2Impl_t.py b/test/python/WMCore_t/Storage_t/Backends_t/GFAL2Impl_t.py index d82a504416..0ac59809e2 100644 --- a/test/python/WMCore_t/Storage_t/Backends_t/GFAL2Impl_t.py +++ b/test/python/WMCore_t/Storage_t/Backends_t/GFAL2Impl_t.py @@ -14,9 +14,9 @@ def setUp(self): def testInit(self): testGFAL2Impl = GFAL2Impl() - removeCommand = "env -i X509_USER_PROXY=$X509_USER_PROXY JOBSTARTDIR=$JOBSTARTDIR bash -c " \ + removeCommand = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c " \ "'. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}'" - copyCommand = "env -i X509_USER_PROXY=$X509_USER_PROXY JOBSTARTDIR=$JOBSTARTDIR bash -c '" \ + copyCommand = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '" \ ". $JOBSTARTDIR/startup_environment.sh; date; gfal-copy -t 2400 -T 2400 -p " \ "-v --abort-on-failure {checksum} {options} {source} {destination}'" self.assertEqual(removeCommand, testGFAL2Impl.removeCommand) From 37574dff71d5e9f5b84ed0534afe6c7a1398e79c Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Thu, 12 Dec 2024 23:53:52 +0100 Subject: [PATCH 06/25] forcing token auth --- src/python/WMCore/Storage/StageOutImpl.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/python/WMCore/Storage/StageOutImpl.py b/src/python/WMCore/Storage/StageOutImpl.py index 542af88bfe..03dfdd0723 100644 --- a/src/python/WMCore/Storage/StageOutImpl.py +++ b/src/python/WMCore/Storage/StageOutImpl.py @@ -212,7 +212,7 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): # // # // Create the command to be used. # // - command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums) + command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums, auth_method="TOKEN") # // # // Run the command # // @@ -220,8 +220,9 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): stageOutEx = None # variable to store the possible StageOutError for retryCount in range(self.numRetries + 1): try: - logging.info("Running the stage out (attempt %d)...", retryCount + 1) + logging.info("Running the stage out with tokens (attempt %d)...", retryCount + 1) self.executeCommand(command) + logging.info(f"{command}") logging.info("Stage-out succeeded with the current environment.") break From 7aeb127971eea777562f19fa800852fbcc7c863b Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Fri, 13 Dec 2024 10:03:24 +0100 Subject: [PATCH 07/25] Addressing pylint --- src/python/WMCore/Storage/StageOutImpl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/WMCore/Storage/StageOutImpl.py b/src/python/WMCore/Storage/StageOutImpl.py index 03dfdd0723..c5d6f59c85 100644 --- a/src/python/WMCore/Storage/StageOutImpl.py +++ b/src/python/WMCore/Storage/StageOutImpl.py @@ -222,7 +222,7 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): try: logging.info("Running the stage out with tokens (attempt %d)...", retryCount + 1) self.executeCommand(command) - logging.info(f"{command}") + logging.info("Command to run: %s", command) logging.info("Stage-out succeeded with the current environment.") break From d77046fb1937914bf730c4cd79684bfcbca00067 Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Fri, 13 Dec 2024 10:04:01 +0100 Subject: [PATCH 08/25] Fixing unit tests --- test/python/WMCore_t/Storage_t/StageOutImpl_t.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/python/WMCore_t/Storage_t/StageOutImpl_t.py b/test/python/WMCore_t/Storage_t/StageOutImpl_t.py index 05a161544e..17a0fe3965 100644 --- a/test/python/WMCore_t/Storage_t/StageOutImpl_t.py +++ b/test/python/WMCore_t/Storage_t/StageOutImpl_t.py @@ -91,7 +91,7 @@ def testCallable(self, mock_executeCommand, mock_createStageOutCommand, mock_cre mock_createSourceName.assert_called_with("protocol", "inputPFN") mock_createTargetName.assert_called_with("protocol", "targetPFN") mock_createOutputDirectory.assert_called_with("targetPFN") - mock_createStageOutCommand.assert_called_with("sourcePFN", "targetPFN", None, None) + mock_createStageOutCommand.assert_called_with("sourcePFN", "targetPFN", None, None, auth_method='TOKEN') mock_executeCommand.assert_called_with("command") @mock.patch('WMCore.Storage.StageOutImpl.StageOutImpl.createSourceName') @@ -111,7 +111,7 @@ def testCallable_StageOutError(self, mock_time, mock_executeCommand, mock_create mock_createSourceName.assert_called_with("protocol", "inputPFN") mock_createTargetName.assert_called_with("protocol", "targetPFN") mock_createOutputDirectory.assert_called_with("targetPFN") - mock_createStageOutCommand.assert_called_with("sourcePFN", "targetPFN", None, None) + mock_createStageOutCommand.assert_called_with("sourcePFN", "targetPFN", None, None, auth_method='TOKEN') mock_executeCommand.assert_called_with("command") calls = [call(600), call(600), call(600), call(600)] mock_time.sleep.assert_has_calls(calls) From 0ddd5158842957b7483f9740663e565d01658653 Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Fri, 13 Dec 2024 14:48:02 +0100 Subject: [PATCH 09/25] Revisiting where self variables are defined --- src/python/WMCore/Storage/Backends/GFAL2Impl.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/python/WMCore/Storage/Backends/GFAL2Impl.py b/src/python/WMCore/Storage/Backends/GFAL2Impl.py index d6b3ff4497..b80b5d2048 100644 --- a/src/python/WMCore/Storage/Backends/GFAL2Impl.py +++ b/src/python/WMCore/Storage/Backends/GFAL2Impl.py @@ -25,10 +25,10 @@ def __init__(self, stagein=False): # Next commands after separation are executed without env -i and this leads us with # mixed environment with COMP and system python. # GFAL2 is not build under COMP environment and it had failures with mixed environment. - self.setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" # Default initialization, it is tweaked in createStageOutCommand depending on the authentication method - self.removeCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}') - self.copyOpts = '-t 2400 -T 2400 -p -v --abort-on-failure {checksum} {options} {source} {destination}' - self.copyCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-copy ' + self.copyOpts) + ##self.setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" # Default initialization, it is tweaked in createStageOutCommand depending on the authentication method + ##self.removeCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}') + ##self.copyOpts = '-t 2400 -T 2400 -p -v --abort-on-failure {checksum} {options} {source} {destination}' + ##self.copyCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-copy ' + self.copyOpts) def createFinalPFN(self, pfn): """ @@ -172,6 +172,10 @@ def createDebuggingCommand(self, sourcePFN, targetPFN, options=None, checksums=N self.setups = "env -i BEARER_TOKEN=$(cat $BEARER_TOKEN_FILE) JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" else: self.setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" + #Need to define this variables here, otherwise the self.setups update is not propagated + self.removeCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}') + self.copyOpts = '-t 2400 -T 2400 -p -v --abort-on-failure {checksum} {options} {source} {destination}' + self.copyCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-copy ' + self.copyOpts) copyCommandDict = self.buildCopyCommandDict(sourcePFN, targetPFN, options, checksums) copyCommand = self.copyCommand.format_map(copyCommandDict) From 15d6a42441c9d77299ff73f0bd309bf3c639dfda Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Fri, 13 Dec 2024 15:53:23 +0100 Subject: [PATCH 10/25] Revisiting where self variables are defined --- src/python/WMCore/Storage/Backends/GFAL2Impl.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/python/WMCore/Storage/Backends/GFAL2Impl.py b/src/python/WMCore/Storage/Backends/GFAL2Impl.py index b80b5d2048..8ab6c2a1ab 100644 --- a/src/python/WMCore/Storage/Backends/GFAL2Impl.py +++ b/src/python/WMCore/Storage/Backends/GFAL2Impl.py @@ -25,10 +25,10 @@ def __init__(self, stagein=False): # Next commands after separation are executed without env -i and this leads us with # mixed environment with COMP and system python. # GFAL2 is not build under COMP environment and it had failures with mixed environment. - ##self.setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" # Default initialization, it is tweaked in createStageOutCommand depending on the authentication method - ##self.removeCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}') - ##self.copyOpts = '-t 2400 -T 2400 -p -v --abort-on-failure {checksum} {options} {source} {destination}' - ##self.copyCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-copy ' + self.copyOpts) + self.setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" # Default initialization, it is tweaked in createStageOutCommand depending on the authentication method + self.removeCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}') + self.copyOpts = '-t 2400 -T 2400 -p -v --abort-on-failure {checksum} {options} {source} {destination}' + self.copyCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-copy ' + self.copyOpts) def createFinalPFN(self, pfn): """ @@ -134,6 +134,11 @@ def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=No logging.info("Warning! Running gfal without either a X509 certificate or a token!") self.setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" + #Need to define this variables here, otherwise the self.setups update is not propagated + self.removeCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}') + self.copyOpts = '-t 2400 -T 2400 -p -v --abort-on-failure {checksum} {options} {source} {destination}' + self.copyCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-copy ' + self.copyOpts) + # Construct the gfal-cp command copyCommandDict = self.buildCopyCommandDict(sourcePFN, targetPFN, options, checksums) copyCommand = self.copyCommand.format_map(copyCommandDict) From 20306b6bf7349d4f964822726efdf3aa45ec61f0 Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Fri, 13 Dec 2024 16:57:35 +0100 Subject: [PATCH 11/25] Fixing unit tests --- .../Storage_t/Backends_t/GFAL2Impl_t.py | 65 ++++++++++++++----- 1 file changed, 48 insertions(+), 17 deletions(-) diff --git a/test/python/WMCore_t/Storage_t/Backends_t/GFAL2Impl_t.py b/test/python/WMCore_t/Storage_t/Backends_t/GFAL2Impl_t.py index 0ac59809e2..58a5e9fa84 100644 --- a/test/python/WMCore_t/Storage_t/Backends_t/GFAL2Impl_t.py +++ b/test/python/WMCore_t/Storage_t/Backends_t/GFAL2Impl_t.py @@ -14,11 +14,12 @@ def setUp(self): def testInit(self): testGFAL2Impl = GFAL2Impl() + # The default setup without a token removeCommand = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c " \ - "'. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}'" + "'. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}'" copyCommand = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '" \ - ". $JOBSTARTDIR/startup_environment.sh; date; gfal-copy -t 2400 -T 2400 -p " \ - "-v --abort-on-failure {checksum} {options} {source} {destination}'" + ". $JOBSTARTDIR/startup_environment.sh; date; gfal-copy -t 2400 -T 2400 -p " \ + "-v --abort-on-failure {checksum} {options} {source} {destination}'" self.assertEqual(removeCommand, testGFAL2Impl.removeCommand) self.assertEqual(copyCommand, testGFAL2Impl.copyCommand) @@ -79,10 +80,23 @@ def testCreateRemoveFileCommand_removeCommand(self, mock_path): def testCreateStageOutCommand_stageIn(self, mock_createRemoveFileCommand): self.GFAL2Impl.stageIn = True mock_createRemoveFileCommand.return_value = "targetPFN2" - result = self.GFAL2Impl.createStageOutCommand("sourcePFN", "targetPFN") + + # Call createStageOutCommand with auth_method='TOKEN' + result = self.GFAL2Impl.createStageOutCommand( + "sourcePFN", "targetPFN", auth_method='TOKEN' + ) + + # Generate the expected result with auth_method='TOKEN' expectedResult = self.getStageOutCommandResult( - self.getCopyCommandDict("-K adler32", "", "sourcePFN", "targetPFN"), "targetPFN2") + self.getCopyCommandDict("-K adler32", "", "sourcePFN", "targetPFN"), + "targetPFN2", + auth_method="TOKEN" + ) + + # Assert that the removeFileCommand was called correctly mock_createRemoveFileCommand.assert_called_with("targetPFN") + + # Compare the expected and actual result self.assertEqual(expectedResult, result) @mock.patch('WMCore.Storage.Backends.GFAL2Impl.GFAL2Impl.createRemoveFileCommand') @@ -94,20 +108,37 @@ def testCreateStageOutCommand_options(self, mock_createRemoveFileCommand): mock_createRemoveFileCommand.assert_called_with("file:targetPFN") self.assertEqual(expectedResult, result) - def getCopyCommandDict(self, checksum, options, source, destination): - copyCommandDict = {'checksum': '', 'options': '', 'source': '', 'destination': ''} - copyCommandDict['checksum'] = checksum - copyCommandDict['options'] = options - copyCommandDict['source'] = source - copyCommandDict['destination'] = destination + def getCopyCommandDict(self, checksum, options, source, destination, auth_method=None): + """ + Generate a dictionary for the gfal-copy command, dynamically adjusting for auth_method. + """ + copyCommandDict = { + 'checksum': checksum, + 'options': options, + 'source': source, + 'destination': destination + } return copyCommandDict - def getStageOutCommandResult(self, copyCommandDict, createRemoveFileCommandResult): + def getStageOutCommandResult(self, copyCommandDict, createRemoveFileCommandResult, auth_method=None): + """ + Generate the expected result for the gfal-copy command, including dynamic adjustments for auth_method. + """ + # Adjust the setup based on auth_method + if auth_method == "X509": + setups = "env -i X509_USER_PROXY=$X509_USER_PROXY JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" + elif auth_method == "TOKEN": + setups = "env -i BEARER_TOKEN=$(cat $BEARER_TOKEN_FILE) JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" + else: + setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" + + # Build the copy command dynamically + copyOpts = '-t 2400 -T 2400 -p -v --abort-on-failure {checksum} {options} {source} {destination}' + copyCommand = setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-copy ' + copyOpts) + + # Construct the full result result = "#!/bin/bash\n" - - copyCommand = self.copyCommand.format_map(copyCommandDict) - result += copyCommand - + result += copyCommand.format_map(copyCommandDict) result += """ EXIT_STATUS=$? echo "gfal-copy exit status: $EXIT_STATUS" @@ -118,7 +149,7 @@ def getStageOutCommandResult(self, copyCommandDict, createRemoveFileCommandResul fi exit $EXIT_STATUS """.format(remove_command=createRemoveFileCommandResult) - + return result @mock.patch('WMCore.Storage.Backends.GFAL2Impl.os.path') From fd48ccc0c4c85ffd651c7d1bb724f53c8f3bcc9f Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Fri, 13 Dec 2024 16:57:49 +0100 Subject: [PATCH 12/25] Revisiting where self variables are defined --- .../WMCore/Storage/Backends/GFAL2Impl.py | 49 +++++++++---------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/src/python/WMCore/Storage/Backends/GFAL2Impl.py b/src/python/WMCore/Storage/Backends/GFAL2Impl.py index 8ab6c2a1ab..1835e49591 100644 --- a/src/python/WMCore/Storage/Backends/GFAL2Impl.py +++ b/src/python/WMCore/Storage/Backends/GFAL2Impl.py @@ -30,6 +30,23 @@ def __init__(self, stagein=False): self.copyOpts = '-t 2400 -T 2400 -p -v --abort-on-failure {checksum} {options} {source} {destination}' self.copyCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-copy ' + self.copyOpts) + def adjustSetup(self, auth_method=None): + """ + Adjust the `self.setups` based on the selected authentication method and regenerate commands. + """ + if auth_method == "X509": + self.setups = "env -i X509_USER_PROXY=$X509_USER_PROXY JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" + elif auth_method == "TOKEN": + self.setups = "env -i BEARER_TOKEN=$(cat $BEARER_TOKEN_FILE) JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" + else: + logging.info("Warning! Running gfal without either a X509 certificate or a token!") + self.setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" + + # Regenerate dependent commands + self.removeCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}') + self.copyOpts = '-t 2400 -T 2400 -p -v --abort-on-failure {checksum} {options} {source} {destination}' + self.copyCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-copy ' + self.copyOpts) + def createFinalPFN(self, pfn): """ _createFinalPFN_ @@ -124,20 +141,8 @@ def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=No :checksums: dict, collect checksums according to the algorithms saved as keys :auth_method: str, the authentication method to be used ("X509", "TOKEN", or None) """ - - # Adjust self.setups based on the selected authentication method - if auth_method == "X509": - self.setups = "env -i X509_USER_PROXY=$X509_USER_PROXY JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" - elif auth_method == "TOKEN": - self.setups = "env -i BEARER_TOKEN=$(cat $BEARER_TOKEN_FILE) JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" - else: - logging.info("Warning! Running gfal without either a X509 certificate or a token!") - self.setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" - - #Need to define this variables here, otherwise the self.setups update is not propagated - self.removeCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}') - self.copyOpts = '-t 2400 -T 2400 -p -v --abort-on-failure {checksum} {options} {source} {destination}' - self.copyCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-copy ' + self.copyOpts) + # Adjust the setup + self.adjustSetup(auth_method) # Construct the gfal-cp command copyCommandDict = self.buildCopyCommandDict(sourcePFN, targetPFN, options, checksums) @@ -154,7 +159,7 @@ def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=No {remove_command} fi exit $EXIT_STATUS - """.format(remove_command=self.createRemoveFileCommand(targetPFN)) + """.format(remove_command=self.createRemoveFileCommand(targetPFN)) return result @@ -170,18 +175,10 @@ def createDebuggingCommand(self, sourcePFN, targetPFN, options=None, checksums=N :auth_method: str, the authentication method to be used ("X509", "TOKEN", or None) """ - # Adjust self.setups based on the selected authentication method - if auth_method == "X509": - self.setups = "env -i X509_USER_PROXY=$X509_USER_PROXY JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" - elif auth_method == "TOKEN": - self.setups = "env -i BEARER_TOKEN=$(cat $BEARER_TOKEN_FILE) JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" - else: - self.setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" - #Need to define this variables here, otherwise the self.setups update is not propagated - self.removeCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}') - self.copyOpts = '-t 2400 -T 2400 -p -v --abort-on-failure {checksum} {options} {source} {destination}' - self.copyCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-copy ' + self.copyOpts) + # Adjust the setup + self.adjustSetup(auth_method) + # Build the gfal-cp command for debugging purposes copyCommandDict = self.buildCopyCommandDict(sourcePFN, targetPFN, options, checksums) copyCommand = self.copyCommand.format_map(copyCommandDict) From 4b5c6a69a3dfe049dbe2035f40eb0c9df645aed8 Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Sun, 15 Dec 2024 22:08:01 +0100 Subject: [PATCH 13/25] hadnling of the cases still not implementing custom auth method --- src/python/WMCore/Storage/StageOutImpl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/python/WMCore/Storage/StageOutImpl.py b/src/python/WMCore/Storage/StageOutImpl.py index c5d6f59c85..6b92d6b0b2 100644 --- a/src/python/WMCore/Storage/StageOutImpl.py +++ b/src/python/WMCore/Storage/StageOutImpl.py @@ -212,7 +212,10 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): # // # // Create the command to be used. # // - command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums, auth_method="TOKEN") + try: + command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums, auth_method="TOKEN") + except: + command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums) # // # // Run the command # // From 18a433455228fb70c8afb5ed6f8b66644d127e57 Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Sun, 15 Dec 2024 22:51:56 +0100 Subject: [PATCH 14/25] hadnling of the cases still not implementing custom auth method --- src/python/WMCore/Storage/StageOutImpl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/python/WMCore/Storage/StageOutImpl.py b/src/python/WMCore/Storage/StageOutImpl.py index 6b92d6b0b2..4289bfa2b1 100644 --- a/src/python/WMCore/Storage/StageOutImpl.py +++ b/src/python/WMCore/Storage/StageOutImpl.py @@ -214,7 +214,8 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None): # // try: command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums, auth_method="TOKEN") - except: + except TypeError as ex: + logging.warning("Falling back to default createStageOutCommand due to: %s", str(ex)) command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums) # // # // Run the command From a230f8d79c55fd8b91ba0fd3cb18472d36f68140 Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Mon, 23 Dec 2024 19:43:47 +0100 Subject: [PATCH 15/25] Disabling x509 -- TO REVERT --- src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py b/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py index 2605ba3fe5..6820776225 100644 --- a/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py +++ b/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py @@ -132,8 +132,8 @@ def __init__(self, config): self.reqStr = None # x509 proxy handling - proxy = Proxy({'logger': myThread.logger}) - self.x509userproxy = proxy.getProxyFilename() + ##proxy = Proxy({'logger': myThread.logger}) + ##self.x509userproxy = proxy.getProxyFilename() # These are added now by the condor client #self.x509userproxysubject = proxy.getSubject() @@ -519,7 +519,7 @@ def getJobParameters(self, jobList): if self.reqStr is not None: ad['Requirements'] = self.reqStr - ad['My.x509userproxy'] = classad.quote(self.x509userproxy) + #ad['My.x509userproxy'] = classad.quote(self.x509userproxy) sites = ','.join(sorted(job.get('possibleSites'))) ad['My.DESIRED_Sites'] = classad.quote(str(sites)) sites = ','.join(sorted(job.get('potentialSites'))) From 843f0e060df705df3299504b0742d2876e9b5abb Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Thu, 2 Jan 2025 12:38:41 +0100 Subject: [PATCH 16/25] Enabling token auth in SimpleCondorPlugin --- src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py b/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py index 6820776225..23521a1b54 100644 --- a/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py +++ b/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py @@ -520,6 +520,8 @@ def getJobParameters(self, jobList): ad['Requirements'] = self.reqStr #ad['My.x509userproxy'] = classad.quote(self.x509userproxy) + # Allow oauth based token authentication + ad['use_oauth_services'] = "cms" sites = ','.join(sorted(job.get('possibleSites'))) ad['My.DESIRED_Sites'] = classad.quote(str(sites)) sites = ','.join(sorted(job.get('potentialSites'))) From fab9d0356a7211052c3bdd03ec36b3b385ee1c3b Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Thu, 2 Jan 2025 14:30:48 +0100 Subject: [PATCH 17/25] Debugging token -- TO REMOVE --- src/python/WMCore/Storage/Backends/GFAL2Impl.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/python/WMCore/Storage/Backends/GFAL2Impl.py b/src/python/WMCore/Storage/Backends/GFAL2Impl.py index 1835e49591..bc58d4094a 100644 --- a/src/python/WMCore/Storage/Backends/GFAL2Impl.py +++ b/src/python/WMCore/Storage/Backends/GFAL2Impl.py @@ -153,6 +153,11 @@ def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=No result += """ EXIT_STATUS=$? echo "gfal-copy exit status: $EXIT_STATUS" + echo "BEARER_TOKEN" $BEARER_TOKEN + echo "BEARER_TOKEN_FILE" $BEARER_TOKEN_FILE + echo "X509_USER_PROXY" $X509_USER_PROXY + echo "_CONDOR_CREDS" ${_CONDOR_CREDS} + echo ${_CONDOR_CREDS}/cms.use if [[ $EXIT_STATUS != 0 ]]; then echo "ERROR: gfal-copy exited with $EXIT_STATUS" echo "Cleaning up failed file:" From 70b0f93c4237aac96e740366d0daafd12fa9cb82 Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Thu, 2 Jan 2025 14:31:49 +0100 Subject: [PATCH 18/25] Debugging token -- TO REMOVE --- src/python/WMCore/Storage/Backends/GFAL2Impl.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/python/WMCore/Storage/Backends/GFAL2Impl.py b/src/python/WMCore/Storage/Backends/GFAL2Impl.py index bc58d4094a..18584effbc 100644 --- a/src/python/WMCore/Storage/Backends/GFAL2Impl.py +++ b/src/python/WMCore/Storage/Backends/GFAL2Impl.py @@ -154,10 +154,13 @@ def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=No EXIT_STATUS=$? echo "gfal-copy exit status: $EXIT_STATUS" echo "BEARER_TOKEN" $BEARER_TOKEN + htdecodetoken -H $BEARER_TOKEN echo "BEARER_TOKEN_FILE" $BEARER_TOKEN_FILE + htdecodetoken -H $BEARER_TOKEN_FILE echo "X509_USER_PROXY" $X509_USER_PROXY echo "_CONDOR_CREDS" ${_CONDOR_CREDS} echo ${_CONDOR_CREDS}/cms.use + htdecodetoken -H if [[ $EXIT_STATUS != 0 ]]; then echo "ERROR: gfal-copy exited with $EXIT_STATUS" echo "Cleaning up failed file:" From 29af20158f6739a915d565f714010911b26586d0 Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Fri, 3 Jan 2025 15:32:44 +0100 Subject: [PATCH 19/25] Fixing logs -- TO REVERT --- .../WMCore/Storage/Backends/GFAL2Impl.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/python/WMCore/Storage/Backends/GFAL2Impl.py b/src/python/WMCore/Storage/Backends/GFAL2Impl.py index 18584effbc..62e06fe810 100644 --- a/src/python/WMCore/Storage/Backends/GFAL2Impl.py +++ b/src/python/WMCore/Storage/Backends/GFAL2Impl.py @@ -153,21 +153,21 @@ def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=No result += """ EXIT_STATUS=$? echo "gfal-copy exit status: $EXIT_STATUS" - echo "BEARER_TOKEN" $BEARER_TOKEN - htdecodetoken -H $BEARER_TOKEN - echo "BEARER_TOKEN_FILE" $BEARER_TOKEN_FILE - htdecodetoken -H $BEARER_TOKEN_FILE - echo "X509_USER_PROXY" $X509_USER_PROXY - echo "_CONDOR_CREDS" ${_CONDOR_CREDS} - echo ${_CONDOR_CREDS}/cms.use - htdecodetoken -H + echo "BEARER_TOKEN: $BEARER_TOKEN" + htdecodetoken -H "$BEARER_TOKEN" + echo "BEARER_TOKEN_FILE: $BEARER_TOKEN_FILE" + htdecodetoken -H "$BEARER_TOKEN_FILE" + echo "X509_USER_PROXY: $X509_USER_PROXY" + echo "_CONDOR_CREDS: $_CONDOR_CREDS" + echo "${_CONDOR_CREDS}/cms.use" + htdecodetoken -H "$_CONDOR_CREDS/cms.use" if [[ $EXIT_STATUS != 0 ]]; then echo "ERROR: gfal-copy exited with $EXIT_STATUS" echo "Cleaning up failed file:" {remove_command} fi exit $EXIT_STATUS - """.format(remove_command=self.createRemoveFileCommand(targetPFN)) + """.format(remove_command=self.createRemoveFileCommand(targetPFN)) return result From 9f5b4523386afa68f8bd4c7fbec3b9e2a4b9ee8f Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Fri, 3 Jan 2025 17:44:50 +0100 Subject: [PATCH 20/25] Improving debugging -- TO REVERT --- .../WMCore/Storage/Backends/GFAL2Impl.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/python/WMCore/Storage/Backends/GFAL2Impl.py b/src/python/WMCore/Storage/Backends/GFAL2Impl.py index 62e06fe810..f92dd89b5d 100644 --- a/src/python/WMCore/Storage/Backends/GFAL2Impl.py +++ b/src/python/WMCore/Storage/Backends/GFAL2Impl.py @@ -149,6 +149,24 @@ def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=No copyCommand = self.copyCommand.format_map(copyCommandDict) result = "#!/bin/bash\n" + copyCommand + # List of environment variables to check + env_vars = ["BEARER_TOKEN", "BEARER_TOKEN_FILE", "X509_USER_PROXY", "_CONDOR_CREDS"] + + for var in env_vars: + value = os.environ.get(var, "Not defined") + logging.info(f"{var}: {value}") + + # Special case: for _CONDOR_CREDS, log its subpath if defined + if var == "_CONDOR_CREDS" and value != "Not defined": + subpath = os.path.join(value, "cms.use") + logging.info(f"{var}/cms.use: {subpath}") + # Assuming htdecodetoken should be run if the variable is defined + try: + decoded_output = os.popen(f"htdecodetoken -H {subpath}").read() + logging.info(f"Decoded token for {var}/cms.use:\n{decoded_output}") + except Exception as e: + logging.error(f"Failed to decode token for {var}/cms.use: {e}") + if _CheckExitCodeOption: result += """ EXIT_STATUS=$? From f1f59aa756790a67142ea4e65076b18bca4f6e13 Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Fri, 3 Jan 2025 19:31:10 +0100 Subject: [PATCH 21/25] Improving debugging again -- TO REVERT --- .../WMCore/Storage/Backends/GFAL2Impl.py | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/python/WMCore/Storage/Backends/GFAL2Impl.py b/src/python/WMCore/Storage/Backends/GFAL2Impl.py index f92dd89b5d..ecd9f8fc88 100644 --- a/src/python/WMCore/Storage/Backends/GFAL2Impl.py +++ b/src/python/WMCore/Storage/Backends/GFAL2Impl.py @@ -6,6 +6,7 @@ import argparse import os import logging +import subprocess from WMCore.Storage.Registry import registerStageOutImpl from WMCore.Storage.StageOutImpl import StageOutImpl @@ -159,26 +160,29 @@ def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=No # Special case: for _CONDOR_CREDS, log its subpath if defined if var == "_CONDOR_CREDS" and value != "Not defined": subpath = os.path.join(value, "cms.use") - logging.info(f"{var}/cms.use: {subpath}") - # Assuming htdecodetoken should be run if the variable is defined - try: - decoded_output = os.popen(f"htdecodetoken -H {subpath}").read() - logging.info(f"Decoded token for {var}/cms.use:\n{decoded_output}") - except Exception as e: - logging.error(f"Failed to decode token for {var}/cms.use: {e}") + logger.info(f"{var}/cms.use: {subpath}") + + if os.path.exists(subpath): + try: + decoded_output = subprocess.check_output( + ["htdecodetoken", "-H", subpath], stderr=subprocess.STDOUT, text=True + ) + if decoded_output.strip(): + logger.info(f"Decoded token for {var}/cms.use:\n{decoded_output.strip()}") + else: + logger.warning(f"No output from htdecodetoken for {var}/cms.use.") + except subprocess.CalledProcessError as e: + logger.error(f"Error decoding token for {var}/cms.use: {e.output.strip()}") + except FileNotFoundError: + logger.error(f"htdecodetoken command not found. Ensure it is installed and in the PATH.") + else: + logger.warning(f"Subpath does not exist: {subpath}") + if _CheckExitCodeOption: result += """ EXIT_STATUS=$? echo "gfal-copy exit status: $EXIT_STATUS" - echo "BEARER_TOKEN: $BEARER_TOKEN" - htdecodetoken -H "$BEARER_TOKEN" - echo "BEARER_TOKEN_FILE: $BEARER_TOKEN_FILE" - htdecodetoken -H "$BEARER_TOKEN_FILE" - echo "X509_USER_PROXY: $X509_USER_PROXY" - echo "_CONDOR_CREDS: $_CONDOR_CREDS" - echo "${_CONDOR_CREDS}/cms.use" - htdecodetoken -H "$_CONDOR_CREDS/cms.use" if [[ $EXIT_STATUS != 0 ]]; then echo "ERROR: gfal-copy exited with $EXIT_STATUS" echo "Cleaning up failed file:" From 02df9d0e4d00e9dabb9154832a9125fb4d64a7ee Mon Sep 17 00:00:00 2001 From: Kenyi Hurtado Date: Thu, 5 Dec 2024 12:02:06 -0500 Subject: [PATCH 22/25] Add token authentication support --- etc/submit_py3.sh | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/etc/submit_py3.sh b/etc/submit_py3.sh index 6fcf59f692..f102fa3e0e 100644 --- a/etc/submit_py3.sh +++ b/etc/submit_py3.sh @@ -192,6 +192,31 @@ else fi echo -e "======== WMAgent Python bootstrap finished at $(TZ=GMT date) ========\n" +echo -e "======= WMAgent token verification at $(TZ=GMT date) ========\n" +echo "Content under _CONDOR_CREDS: ${_CONDOR_CREDS}" +ls -l ${_CONDOR_CREDS} + +if [ -f "${_CONDOR_CREDS}/cms.use" ] +then + echo "CMS token found, setting BEARER_TOKEN_FILE=${_CONDOR_CREDS}/cms.use" + export BEARER_TOKEN_FILE=${_CONDOR_CREDS}/cms.use + + # Show token information + # This tool requires htgettoken package in the cmssw runtime apptainer image + if command -v httokendecode ls 2>&1 > /dev/null + then + httokendecode -H ${BEARER_TOKEN_FILE} + else + echo "Warning: [WMAgent Token verification] httokendecode tool could not be found." + echo "Warning: Token exists and can be used, but details will not be displayed." + fi +else + echo "[WMAgent token verification]: The bearer token file could not be found." + # Do not fail, we still support x509 proxies + # if we fail here in the future, we need to define an exit code number + # exit 1106 +fi + echo "======== WMAgent Unpack the job starting at $(TZ=GMT date) ========" # Should be ready to unpack and run this From 8dc967ec1c1793c50b516736a1207acbbf8dc02b Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Mon, 23 Dec 2024 19:43:47 +0100 Subject: [PATCH 23/25] Disabling x509 -- TO REVERT --- src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py b/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py index 23521a1b54..e322256158 100644 --- a/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py +++ b/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py @@ -520,8 +520,11 @@ def getJobParameters(self, jobList): ad['Requirements'] = self.reqStr #ad['My.x509userproxy'] = classad.quote(self.x509userproxy) +<<<<<<< HEAD # Allow oauth based token authentication ad['use_oauth_services'] = "cms" +======= +>>>>>>> 5f13933e5 (Disabling x509 -- TO REVERT) sites = ','.join(sorted(job.get('possibleSites'))) ad['My.DESIRED_Sites'] = classad.quote(str(sites)) sites = ','.join(sorted(job.get('potentialSites'))) From 6b355cfc03ca3a5d63037e95f896c3bec63d1364 Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Thu, 2 Jan 2025 12:38:41 +0100 Subject: [PATCH 24/25] Enabling token auth in SimpleCondorPlugin --- src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py b/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py index e322256158..23521a1b54 100644 --- a/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py +++ b/src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py @@ -520,11 +520,8 @@ def getJobParameters(self, jobList): ad['Requirements'] = self.reqStr #ad['My.x509userproxy'] = classad.quote(self.x509userproxy) -<<<<<<< HEAD # Allow oauth based token authentication ad['use_oauth_services'] = "cms" -======= ->>>>>>> 5f13933e5 (Disabling x509 -- TO REVERT) sites = ','.join(sorted(job.get('possibleSites'))) ad['My.DESIRED_Sites'] = classad.quote(str(sites)) sites = ','.join(sorted(job.get('potentialSites'))) From db23225db5a6e09cd5a90d4004d506ce05858029 Mon Sep 17 00:00:00 2001 From: Andrea Piccinelli Date: Sun, 5 Jan 2025 22:07:17 +0100 Subject: [PATCH 25/25] Fixing error and committing changes to submit_py3 --- src/python/WMCore/Storage/Backends/GFAL2Impl.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/python/WMCore/Storage/Backends/GFAL2Impl.py b/src/python/WMCore/Storage/Backends/GFAL2Impl.py index ecd9f8fc88..ad78f8a6d6 100644 --- a/src/python/WMCore/Storage/Backends/GFAL2Impl.py +++ b/src/python/WMCore/Storage/Backends/GFAL2Impl.py @@ -160,7 +160,7 @@ def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=No # Special case: for _CONDOR_CREDS, log its subpath if defined if var == "_CONDOR_CREDS" and value != "Not defined": subpath = os.path.join(value, "cms.use") - logger.info(f"{var}/cms.use: {subpath}") + logging.info(f"{var}/cms.use: {subpath}") if os.path.exists(subpath): try: @@ -168,15 +168,15 @@ def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=No ["htdecodetoken", "-H", subpath], stderr=subprocess.STDOUT, text=True ) if decoded_output.strip(): - logger.info(f"Decoded token for {var}/cms.use:\n{decoded_output.strip()}") + logging.info(f"Decoded token for {var}/cms.use:\n{decoded_output.strip()}") else: - logger.warning(f"No output from htdecodetoken for {var}/cms.use.") + logging.warning(f"No output from htdecodetoken for {var}/cms.use.") except subprocess.CalledProcessError as e: - logger.error(f"Error decoding token for {var}/cms.use: {e.output.strip()}") + logging.error(f"Error decoding token for {var}/cms.use: {e.output.strip()}") except FileNotFoundError: - logger.error(f"htdecodetoken command not found. Ensure it is installed and in the PATH.") + logging.error(f"htdecodetoken command not found. Ensure it is installed and in the PATH.") else: - logger.warning(f"Subpath does not exist: {subpath}") + logging.warning(f"Subpath does not exist: {subpath}") if _CheckExitCodeOption: