Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BROKEN - Implementation of the token-safe retry logic for gfal #12191

Closed
wants to merge 25 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
e24c911
First implementation of the token-safe retries
anpicci Nov 29, 2024
1fbbe46
Small fixes
anpicci Nov 29, 2024
5b78731
pylint fixes
anpicci Dec 2, 2024
e752a34
pylint fixes
anpicci Dec 2, 2024
54856c9
aligning unit tests
anpicci Dec 2, 2024
37574df
forcing token auth
anpicci Dec 12, 2024
7aeb127
Addressing pylint
anpicci Dec 13, 2024
d77046f
Fixing unit tests
anpicci Dec 13, 2024
0ddd515
Revisiting where self variables are defined
anpicci Dec 13, 2024
15d6a42
Revisiting where self variables are defined
anpicci Dec 13, 2024
20306b6
Fixing unit tests
anpicci Dec 13, 2024
fd48ccc
Revisiting where self variables are defined
anpicci Dec 13, 2024
4b5c6a6
handling of the cases still not implementing custom auth method
anpicci Dec 15, 2024
18a4334
handling of the cases still not implementing custom auth method
anpicci Dec 15, 2024
a230f8d
Disabling x509 -- TO REVERT
anpicci Dec 23, 2024
843f0e0
Enabling token auth in SimpleCondorPlugin
anpicci Jan 2, 2025
fab9d03
Debugging token -- TO REMOVE
anpicci Jan 2, 2025
70b0f93
Debugging token -- TO REMOVE
anpicci Jan 2, 2025
29af201
Fixing logs -- TO REVERT
anpicci Jan 3, 2025
9f5b452
Improving debugging -- TO REVERT
anpicci Jan 3, 2025
f1f59aa
Improving debugging again -- TO REVERT
anpicci Jan 3, 2025
02df9d0
Add token authentication support
khurtado Dec 5, 2024
8dc967e
Disabling x509 -- TO REVERT
anpicci Dec 23, 2024
6b355cf
Enabling token auth in SimpleCondorPlugin
anpicci Jan 2, 2025
db23225
Fixing error and committing changes to submit_py3
anpicci Jan 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions etc/submit_py3.sh
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,31 @@ else
fi
echo -e "======== WMAgent Python bootstrap finished at $(TZ=GMT date) ========\n"

echo -e "======= WMAgent token verification at $(TZ=GMT date) ========\n"
echo "Content under _CONDOR_CREDS: ${_CONDOR_CREDS}"
# Only list the credentials directory when the variable is set: an unquoted, empty
# expansion would turn this into a plain `ls -l` of the current working directory.
if [ -n "${_CONDOR_CREDS}" ]
then
    ls -l "${_CONDOR_CREDS}"
fi

if [ -f "${_CONDOR_CREDS}/cms.use" ]
then
    echo "CMS token found, setting BEARER_TOKEN_FILE=${_CONDOR_CREDS}/cms.use"
    export BEARER_TOKEN_FILE="${_CONDOR_CREDS}/cms.use"

    # Show token information
    # This tool requires htgettoken package in the cmssw runtime apptainer image
    # NOTE: check httokendecode alone (an extra operand such as `ls` makes `command -v`
    # succeed whenever any operand is found) and redirect stdout before duplicating
    # stderr onto it, so that both streams are really discarded.
    if command -v httokendecode > /dev/null 2>&1
    then
        httokendecode -H "${BEARER_TOKEN_FILE}"
    else
        echo "Warning: [WMAgent Token verification] httokendecode tool could not be found."
        echo "Warning: Token exists and can be used, but details will not be displayed."
    fi
else
    echo "[WMAgent token verification]: The bearer token file could not be found."
    # Do not fail, we still support x509 proxies
    # if we fail here in the future, we need to define an exit code number
    # exit 1106
fi


echo "======== WMAgent Unpack the job starting at $(TZ=GMT date) ========"
# Should be ready to unpack and run this
Expand Down
8 changes: 5 additions & 3 deletions src/python/WMCore/BossAir/Plugins/SimpleCondorPlugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,8 @@ def __init__(self, config):
self.reqStr = None

# x509 proxy handling
proxy = Proxy({'logger': myThread.logger})
self.x509userproxy = proxy.getProxyFilename()
##proxy = Proxy({'logger': myThread.logger})
##self.x509userproxy = proxy.getProxyFilename()

# These are added now by the condor client
#self.x509userproxysubject = proxy.getSubject()
Expand Down Expand Up @@ -519,7 +519,9 @@ def getJobParameters(self, jobList):
if self.reqStr is not None:
ad['Requirements'] = self.reqStr

ad['My.x509userproxy'] = classad.quote(self.x509userproxy)
#ad['My.x509userproxy'] = classad.quote(self.x509userproxy)
# Allow oauth based token authentication
ad['use_oauth_services'] = "cms"
sites = ','.join(sorted(job.get('possibleSites')))
ad['My.DESIRED_Sites'] = classad.quote(str(sites))
sites = ','.join(sorted(job.get('potentialSites')))
Expand Down
63 changes: 60 additions & 3 deletions src/python/WMCore/Storage/Backends/GFAL2Impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
"""
import argparse
import os
import logging
import subprocess

from WMCore.Storage.Registry import registerStageOutImpl
from WMCore.Storage.StageOutImpl import StageOutImpl
Expand All @@ -24,7 +26,24 @@ def __init__(self, stagein=False):
# Next commands after separation are executed without env -i and this leads us with
# mixed environment with COMP and system python.
# GFAL2 is not build under COMP environment and it had failures with mixed environment.
self.setups = "env -i X509_USER_PROXY=$X509_USER_PROXY JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'"
self.setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'" # Default initialization, it is tweaked in createStageOutCommand depending on the authentication method
self.removeCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}')
self.copyOpts = '-t 2400 -T 2400 -p -v --abort-on-failure {checksum} {options} {source} {destination}'
self.copyCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-copy ' + self.copyOpts)

def adjustSetup(self, auth_method=None):
    """
    Set `self.setups` according to the selected authentication method and
    regenerate the commands that are built from it.

    :auth_method: str, "X509" forwards X509_USER_PROXY into the cleaned (`env -i`)
        environment, "TOKEN" sets BEARER_TOKEN from the content of $BEARER_TOKEN_FILE;
        any other value (including None) forwards no credential and logs a warning

    Updates self.setups, self.removeCommand, self.copyOpts and self.copyCommand.
    """
    if auth_method == "X509":
        self.setups = "env -i X509_USER_PROXY=$X509_USER_PROXY JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'"
    elif auth_method == "TOKEN":
        self.setups = "env -i BEARER_TOKEN=$(cat $BEARER_TOKEN_FILE) JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'"
    else:
        # This is a warning condition: log it at WARNING level so it is not
        # filtered out together with the regular informational messages
        logging.warning("Warning! Running gfal without either a X509 certificate or a token!")
        self.setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'"

    # Regenerate dependent commands, they embed the setup string chosen above
    self.removeCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}')
    self.copyOpts = '-t 2400 -T 2400 -p -v --abort-on-failure {checksum} {options} {source} {destination}'
    self.copyCommand = self.setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-copy ' + self.copyOpts)
Expand Down Expand Up @@ -113,20 +132,53 @@ def buildCopyCommandDict(self, sourcePFN, targetPFN, options=None, checksums=Non

return copyCommandDict

def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=None):
def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=None, auth_method=None):
"""
Create gfal-cp command for stageOut

:sourcePFN: str, PFN of the source file
:targetPFN: str, destination PFN
:options: str, additional options for gfal-cp
:checksums: dict, collect checksums according to the algorithms saved as keys
:auth_method: str, the authentication method to be used ("X509", "TOKEN", or None)
"""
# Adjust the setup
self.adjustSetup(auth_method)

# Construct the gfal-cp command
copyCommandDict = self.buildCopyCommandDict(sourcePFN, targetPFN, options, checksums)
copyCommand = self.copyCommand.format_map(copyCommandDict)
result = "#!/bin/bash\n" + copyCommand

# List of environment variables to check
env_vars = ["BEARER_TOKEN", "BEARER_TOKEN_FILE", "X509_USER_PROXY", "_CONDOR_CREDS"]

for var in env_vars:
value = os.environ.get(var, "Not defined")
logging.info(f"{var}: {value}")

# Special case: for _CONDOR_CREDS, log its subpath if defined
if var == "_CONDOR_CREDS" and value != "Not defined":
subpath = os.path.join(value, "cms.use")
logging.info(f"{var}/cms.use: {subpath}")

if os.path.exists(subpath):
try:
decoded_output = subprocess.check_output(
["htdecodetoken", "-H", subpath], stderr=subprocess.STDOUT, text=True
)
if decoded_output.strip():
logging.info(f"Decoded token for {var}/cms.use:\n{decoded_output.strip()}")
else:
logging.warning(f"No output from htdecodetoken for {var}/cms.use.")
except subprocess.CalledProcessError as e:
logging.error(f"Error decoding token for {var}/cms.use: {e.output.strip()}")
except FileNotFoundError:
logging.error(f"htdecodetoken command not found. Ensure it is installed and in the PATH.")
else:
logging.warning(f"Subpath does not exist: {subpath}")


if _CheckExitCodeOption:
result += """
EXIT_STATUS=$?
Expand All @@ -141,7 +193,7 @@ def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=No

return result

def createDebuggingCommand(self, sourcePFN, targetPFN, options=None, checksums=None):
def createDebuggingCommand(self, sourcePFN, targetPFN, options=None, checksums=None, auth_method=None):
"""
Debug a failed gfal-cp command for stageOut, without re-running it,
providing information on the environment and the certifications
Expand All @@ -150,8 +202,13 @@ def createDebuggingCommand(self, sourcePFN, targetPFN, options=None, checksums=N
:targetPFN: str, destination PFN
:options: str, additional options for gfal-cp
:checksums: dict, collect checksums according to the algorithms saved as keys
:auth_method: str, the authentication method to be used ("X509", "TOKEN", or None)
"""

# Adjust the setup
self.adjustSetup(auth_method)

# Build the gfal-cp command for debugging purposes
copyCommandDict = self.buildCopyCommandDict(sourcePFN, targetPFN, options, checksums)
copyCommand = self.copyCommand.format_map(copyCommandDict)

Expand Down
57 changes: 45 additions & 12 deletions src/python/WMCore/Storage/StageOutImpl.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def createOutputDirectory(self, targetPFN):
If no directory is required, do not implement this method
"""

def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=None):
def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=None, auth_method=None):
"""
_createStageOutCommand_

Expand All @@ -142,7 +142,7 @@ def createStageOutCommand(self, sourcePFN, targetPFN, options=None, checksums=No
"""
raise NotImplementedError("StageOutImpl.createStageOutCommand")

def createDebuggingCommand(self, sourcePFN, targetPFN, options=None, checksums=None):
def createDebuggingCommand(self, sourcePFN, targetPFN, options=None, checksums=None, auth_method=None):
"""
Build a shell command that will report in the logs the details about
failing stageOut commands
Expand Down Expand Up @@ -178,18 +178,17 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None):

This operator does the actual stage out by invoking the overridden
plugin methods of the derived object.


"""
# //

# //
# // Generate the source PFN from the plain PFN if needed
# //
sourcePFN = self.createSourceName(protocol, inputPFN)

# destination may also need PFN changed
# i.e. if we are staging in a file from an SE
targetPFN = self.createTargetName(protocol, targetPFN)
# //
# //
# // Create the output directory if implemented
# //
for retryCount in range(self.numRetries + 1):
Expand All @@ -203,30 +202,64 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None):
msg += "Error details:\n{}\n".format(str(ex))
logging.error(msg)
if retryCount == self.numRetries:
# //
# //
# // last retry, propagate exception
# //
logging.error("Maximum retries exhausted when trying to create the output directory")
raise ex
time.sleep(self.retryPause)

# //
# // Create the command to be used.
# //
command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums)
# //
try:
command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums, auth_method="TOKEN")
except TypeError as ex:
logging.warning("Falling back to default createStageOutCommand due to: %s", str(ex))
command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums)
# //
# // Run the command
# //

stageOutEx = None # variable to store the possible StageOutError
for retryCount in range(self.numRetries + 1):
try:
logging.info("Running the stage out...")
logging.info("Running the stage out with tokens (attempt %d)...", retryCount + 1)
self.executeCommand(command)
logging.info("Command to run: %s", command)
logging.info("Stage-out succeeded with the current environment.")
break

except StageOutError as ex:
msg = "Attempt {} to stage out failed.\n".format(retryCount)
msg = "Attempt {} to stage out failed with default setup.\n".format(retryCount)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@amaltaro @stlammel by default, do we want to set the BEARER_TOKEN env var to force an attempt with token authentication?

msg += "Error details:\n{}\n".format(str(ex))
logging.error(msg)

logging.info("Retrying with authentication-safe logic...")

# Authentication-safe fallback logic
if os.getenv("X509_USER_PROXY"):
logging.info("Retrying with X509_USER_PROXY after unsetting BEARER_TOKEN...")
os.system("unset BEARER_TOKEN; unset BEARER_TOKEN_FILE")
command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums, auth_method="X509")
try:
self.executeCommand(command)
logging.info("Stage-out succeeded with X509 after unsetting BEARER_TOKEN.")
return
except StageOutError as fallbackEx:
logging.warning("Fallback with X509_USER_PROXY failed:\n%s", str(fallbackEx))

if os.getenv("BEARER_TOKEN") or os.getenv("BEARER_TOKEN_FILE"):
logging.info("Retrying with BEARER_TOKEN after unsetting X509_USER_PROXY...")
os.system("unset X509_USER_PROXY")
command = self.createStageOutCommand(sourcePFN, targetPFN, options, checksums, auth_method="TOKEN")
try:
self.executeCommand(command)
logging.info("Stage-out succeeded with TOKEN after unsetting X509_USER_PROXY.")
return
except StageOutError as fallbackEx:
logging.warning("Fallback with BEARER_TOKEN failed:\n%s", str(fallbackEx))

if retryCount == self.numRetries:
# Last retry, propagate the information outside of the for loop
stageOutEx = ex
Expand All @@ -236,6 +269,6 @@ def __call__(self, protocol, inputPFN, targetPFN, options=None, checksums=None):
# This block will now always be executed after retries are exhausted
if stageOutEx is not None:
logging.error("Maximum number of retries exhausted. Further details on the failed command reported below.")
command = self.createDebuggingCommand(sourcePFN, targetPFN, options, checksums)
command = self.createDebuggingCommand(sourcePFN, targetPFN, options, checksums, auth_method="TOKEN")
self.executeCommand(command)
raise stageOutEx from None
69 changes: 50 additions & 19 deletions test/python/WMCore_t/Storage_t/Backends_t/GFAL2Impl_t.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@ def setUp(self):

def testInit(self):
testGFAL2Impl = GFAL2Impl()
removeCommand = "env -i X509_USER_PROXY=$X509_USER_PROXY JOBSTARTDIR=$JOBSTARTDIR bash -c " \
"'. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}'"
copyCommand = "env -i X509_USER_PROXY=$X509_USER_PROXY JOBSTARTDIR=$JOBSTARTDIR bash -c '" \
". $JOBSTARTDIR/startup_environment.sh; date; gfal-copy -t 2400 -T 2400 -p " \
"-v --abort-on-failure {checksum} {options} {source} {destination}'"
# The default setup without a token
removeCommand = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c " \
"'. $JOBSTARTDIR/startup_environment.sh; date; gfal-rm -t 600 {}'"
copyCommand = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '" \
". $JOBSTARTDIR/startup_environment.sh; date; gfal-copy -t 2400 -T 2400 -p " \
"-v --abort-on-failure {checksum} {options} {source} {destination}'"
self.assertEqual(removeCommand, testGFAL2Impl.removeCommand)
self.assertEqual(copyCommand, testGFAL2Impl.copyCommand)

Expand Down Expand Up @@ -79,10 +80,23 @@ def testCreateRemoveFileCommand_removeCommand(self, mock_path):
def testCreateStageOutCommand_stageIn(self, mock_createRemoveFileCommand):
self.GFAL2Impl.stageIn = True
mock_createRemoveFileCommand.return_value = "targetPFN2"
result = self.GFAL2Impl.createStageOutCommand("sourcePFN", "targetPFN")

# Call createStageOutCommand with auth_method='TOKEN'
result = self.GFAL2Impl.createStageOutCommand(
"sourcePFN", "targetPFN", auth_method='TOKEN'
)

# Generate the expected result with auth_method='TOKEN'
expectedResult = self.getStageOutCommandResult(
self.getCopyCommandDict("-K adler32", "", "sourcePFN", "targetPFN"), "targetPFN2")
self.getCopyCommandDict("-K adler32", "", "sourcePFN", "targetPFN"),
"targetPFN2",
auth_method="TOKEN"
)

# Assert that the removeFileCommand was called correctly
mock_createRemoveFileCommand.assert_called_with("targetPFN")

# Compare the expected and actual result
self.assertEqual(expectedResult, result)

@mock.patch('WMCore.Storage.Backends.GFAL2Impl.GFAL2Impl.createRemoveFileCommand')
Expand All @@ -94,20 +108,37 @@ def testCreateStageOutCommand_options(self, mock_createRemoveFileCommand):
mock_createRemoveFileCommand.assert_called_with("file:targetPFN")
self.assertEqual(expectedResult, result)

def getCopyCommandDict(self, checksum, options, source, destination):
copyCommandDict = {'checksum': '', 'options': '', 'source': '', 'destination': ''}
copyCommandDict['checksum'] = checksum
copyCommandDict['options'] = options
copyCommandDict['source'] = source
copyCommandDict['destination'] = destination
def getCopyCommandDict(self, checksum, options, source, destination, auth_method=None):
    """
    Build the substitution dictionary used to fill the gfal-copy command template.

    The returned keys are 'checksum', 'options', 'source' and 'destination', in this order.
    auth_method is accepted but it is not used when building the dictionary.
    """
    fieldNames = ('checksum', 'options', 'source', 'destination')
    fieldValues = (checksum, options, source, destination)
    return dict(zip(fieldNames, fieldValues))

def getStageOutCommandResult(self, copyCommandDict, createRemoveFileCommandResult):
def getStageOutCommandResult(self, copyCommandDict, createRemoveFileCommandResult, auth_method=None):
"""
Generate the expected result for the gfal-copy command, including dynamic adjustments for auth_method.
"""
# Adjust the setup based on auth_method
if auth_method == "X509":
setups = "env -i X509_USER_PROXY=$X509_USER_PROXY JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'"
elif auth_method == "TOKEN":
setups = "env -i BEARER_TOKEN=$(cat $BEARER_TOKEN_FILE) JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'"
else:
setups = "env -i JOBSTARTDIR=$JOBSTARTDIR bash -c '{}'"

# Build the copy command dynamically
copyOpts = '-t 2400 -T 2400 -p -v --abort-on-failure {checksum} {options} {source} {destination}'
copyCommand = setups.format('. $JOBSTARTDIR/startup_environment.sh; date; gfal-copy ' + copyOpts)

# Construct the full result
result = "#!/bin/bash\n"

copyCommand = self.copyCommand.format_map(copyCommandDict)
result += copyCommand

result += copyCommand.format_map(copyCommandDict)
result += """
EXIT_STATUS=$?
echo "gfal-copy exit status: $EXIT_STATUS"
Expand All @@ -118,7 +149,7 @@ def getStageOutCommandResult(self, copyCommandDict, createRemoveFileCommandResul
fi
exit $EXIT_STATUS
""".format(remove_command=createRemoveFileCommandResult)

return result

@mock.patch('WMCore.Storage.Backends.GFAL2Impl.os.path')
Expand Down
Loading