diff --git a/cartography/intel/github/repos.py b/cartography/intel/github/repos.py index 1ea0c67f45..5bd12b91b6 100644 --- a/cartography/intel/github/repos.py +++ b/cartography/intel/github/repos.py @@ -14,6 +14,8 @@ from cartography.intel.github.util import fetch_all from cartography.intel.github.util import PaginatedGraphqlData +from cartography.util import backoff_handler +from cartography.util import retries_with_backoff from cartography.util import run_cleanup_job from cartography.util import timeit @@ -134,13 +136,35 @@ """ +def _get_repo_collaborators_inner_func( + org: str, + api_url: str, + token: str, + repo_name: str, + affiliation: str, + collab_users: list[dict[str, Any]], + collab_permission: list[str], +) -> None: + logger.info(f"Loading {affiliation} collaborators for repo {repo_name}.") + collaborators = _get_repo_collaborators(token, api_url, org, repo_name, affiliation) + + # nodes and edges are expected to always be present given that we only call for them if totalCount is > 0 + # however sometimes GitHub returns None, as in issue 1334 and 1404. + for collab in collaborators.nodes or []: + collab_users.append(collab) + + # The `or []` is because `.edges` can be None. + for perm in collaborators.edges or []: + collab_permission.append(perm['permission']) + + def _get_repo_collaborators_for_multiple_repos( repo_raw_data: list[dict[str, Any]], affiliation: str, org: str, api_url: str, token: str, -) -> dict[str, List[UserAffiliationAndRepoPermission]]: +) -> dict[str, list[UserAffiliationAndRepoPermission]]: """ For every repo in the given list, retrieve the collaborators. :param repo_raw_data: A list of dicts representing repos. See tests.data.github.repos.GET_REPOS for data shape. @@ -151,7 +175,7 @@ def _get_repo_collaborators_for_multiple_repos( :param token: The Github API token as string. :return: A dictionary of repo URL to list of UserAffiliationAndRepoPermission """ - result: dict[str, List[UserAffiliationAndRepoPermission]] = {} + result: dict[str, list[UserAffiliationAndRepoPermission]] = {} for repo in repo_raw_data: repo_name = repo['name'] repo_url = repo['url'] @@ -162,14 +186,23 @@ def _get_repo_collaborators_for_multiple_repos( result[repo_url] = [] continue - collab_users = [] - collab_permission = [] - collaborators = _get_repo_collaborators(token, api_url, org, repo_name, affiliation) - # nodes and edges are expected to always be present given that we only call for them if totalCount is > 0 - for collab in collaborators.nodes: - collab_users.append(collab) - for perm in collaborators.edges: - collab_permission.append(perm['permission']) + collab_users: List[dict[str, Any]] = [] + collab_permission: List[str] = [] + + retries_with_backoff( + _get_repo_collaborators_inner_func, + TypeError, + 5, + backoff_handler, + )( + org=org, + api_url=api_url, + token=token, + repo_name=repo_name, + affiliation=affiliation, + collab_users=collab_users, + collab_permission=collab_permission, + ) result[repo_url] = [ UserAffiliationAndRepoPermission(user, permission, affiliation) diff --git a/cartography/intel/github/teams.py b/cartography/intel/github/teams.py index b32ad3a804..4bd619f9d2 100644 --- a/cartography/intel/github/teams.py +++ b/cartography/intel/github/teams.py @@ -65,6 +65,26 @@ def get_teams(org: str, api_url: str, token: str) -> Tuple[PaginatedGraphqlData, return fetch_all(token, api_url, org, org_teams_gql, 'teams') +def _get_teams_repos_inner_func( + org: str, + api_url: str, + token: str, + team_name: str, + repo_urls: list[str], + repo_permissions: list[str], +) -> None: + logger.info(f"Loading team repos for {team_name}.") + team_repos = _get_team_repos(org, api_url, token, team_name) + + # The `or []` is because `.nodes` can be None. See: + # https://docs.github.com/en/graphql/reference/objects#teamrepositoryconnection + for repo in team_repos.nodes or []: + repo_urls.append(repo['url']) + # The `or []` is because `.edges` can be None. + for edge in team_repos.edges or []: + repo_permissions.append(edge['permission']) + + @timeit def _get_team_repos_for_multiple_teams( team_raw_data: list[dict[str, Any]], @@ -85,23 +105,18 @@ def _get_team_repos_for_multiple_teams( repo_urls: List[str] = [] repo_permissions: List[str] = [] - def get_teams_repos_inner_func( - org: str, api_url: str, token: str, team_name: str, - repo_urls: List[str], repo_permissions: List[str], - ) -> None: - logger.info(f"Loading team repos for {team_name}.") - team_repos = _get_team_repos(org, api_url, token, team_name) - # The `or []` is because `.nodes` can be None. See: - # https://docs.github.com/en/graphql/reference/objects#teamrepositoryconnection - for repo in team_repos.nodes or []: - repo_urls.append(repo['url']) - # The `or []` is because `.edges` can be None. - for edge in team_repos.edges or []: - repo_permissions.append(edge['permission']) - - retries_with_backoff(get_teams_repos_inner_func, TypeError, 5, backoff_handler)( - org=org, api_url=api_url, token=token, team_name=team_name, - repo_urls=repo_urls, repo_permissions=repo_permissions, + retries_with_backoff( + _get_teams_repos_inner_func, + TypeError, + 5, + backoff_handler, + )( + org=org, + api_url=api_url, + token=token, + team_name=team_name, + repo_urls=repo_urls, + repo_permissions=repo_permissions, ) # Shape = [(repo_url, 'WRITE'), ...]] result[team_name] = [RepoPermission(url, perm) for url, perm in zip(repo_urls, repo_permissions)] diff --git a/cartography/util.py b/cartography/util.py index fbb9358d02..9c50e14d9b 100644 --- a/cartography/util.py +++ b/cartography/util.py @@ -290,8 +290,10 @@ def inner_function(*args, **kwargs): # type: ignore def retries_with_backoff( - func: Callable, - exceptionType: Type[Exception], max_tries: int, on_backoff: Callable, + func: Callable, + exception_type: Type[Exception], + max_tries: int, + on_backoff: Callable, ) -> Callable: """ Adds retry with backoff to the given function. (Could expand the possible input parameters as needed.) @@ -299,7 +301,7 @@ def retries_with_backoff( @wraps(func) @backoff.on_exception( backoff.expo, - exceptionType, + exception_type, max_tries=max_tries, on_backoff=on_backoff, ) diff --git a/setup.py b/setup.py index f5c7da76a8..edd987bc50 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages from setuptools import setup -__version__ = '0.96.0' +__version__ = '0.96.1' setup( diff --git a/tests/unit/cartography/intel/github/test_repos.py b/tests/unit/cartography/intel/github/test_repos.py new file mode 100644 index 0000000000..3123dbbc68 --- /dev/null +++ b/tests/unit/cartography/intel/github/test_repos.py @@ -0,0 +1,37 @@ +from unittest.mock import MagicMock +from unittest.mock import patch + +import pytest + +from cartography.intel.github.repos import _get_repo_collaborators_for_multiple_repos + + +@patch('time.sleep', return_value=None) +@patch('cartography.intel.github.repos._get_repo_collaborators') +@patch('cartography.intel.github.repos.backoff_handler', spec=True) +def test_get_team_users_github_returns_none(mock_backoff_handler, mock_get_team_collaborators, mock_sleep): + """ + This test happens to use 'OUTSIDE' affiliation, but it's irrelevant for the test, it just needs either valid value. + """ + # Arrange + repo_data = [{'name': 'repo1', 'url': 'https://github.com/repo1', 'outsideCollaborators': {'totalCount': 1}}] + mock_repo_collabs = MagicMock() + # Set up the condition where GitHub returns a None url and None edge as in #1334. + mock_repo_collabs.nodes = [None] + mock_repo_collabs.edges = [None] + mock_get_team_collaborators.return_value = mock_repo_collabs + + # Assert we raise an exception + with pytest.raises(TypeError): + _get_repo_collaborators_for_multiple_repos( + repo_data, + 'OUTSIDE', + 'test-org', + 'https://api.github.com', + 'test-token', + ) + + # Assert that we retry and give up + assert mock_sleep.call_count == 4 + assert mock_get_team_collaborators.call_count == 5 + assert mock_backoff_handler.call_count == 4 diff --git a/tests/unit/cartography/intel/github/test_teams.py b/tests/unit/cartography/intel/github/test_teams.py index defeaea15b..7fd2c6c831 100644 --- a/tests/unit/cartography/intel/github/test_teams.py +++ b/tests/unit/cartography/intel/github/test_teams.py @@ -122,9 +122,10 @@ def test_get_team_users_happy_path(mock_get_team_users): mock_get_team_users.assert_called_once_with('test-org', 'https://api.github.com', 'test-token', 'team1') +@patch('time.sleep', return_value=None) @patch('cartography.intel.github.teams._get_team_repos') @patch('cartography.intel.github.teams.backoff_handler', spec=True) -def test_get_team_repos_github_returns_none(mock_backoff_handler, mock_get_team_repos): +def test_get_team_repos_github_returns_none(mock_backoff_handler, mock_get_team_repos, mock_sleep): # Arrange team_data = [{'slug': 'team1', 'repositories': {'totalCount': 1}}] mock_team_repos = MagicMock() @@ -143,13 +144,15 @@ def test_get_team_repos_github_returns_none(mock_backoff_handler, mock_get_team_ ) # Assert that we retry and give up + assert mock_sleep.call_count == 4 assert mock_get_team_repos.call_count == 5 assert mock_backoff_handler.call_count == 4 +@patch('time.sleep', return_value=None) @patch('cartography.intel.github.teams._get_team_users') @patch('cartography.intel.github.teams.backoff_handler', spec=True) -def test_get_team_users_github_returns_none(mock_backoff_handler, mock_get_team_users): +def test_get_team_users_github_returns_none(mock_backoff_handler, mock_get_team_users, mock_sleep): # Arrange team_data = [{'slug': 'team1', 'members': {'totalCount': 1}}] mock_team_users = MagicMock() @@ -168,6 +171,7 @@ def test_get_team_users_github_returns_none(mock_backoff_handler, mock_get_team_ ) # Assert that we retry and give up + assert mock_sleep.call_count == 4 assert mock_get_team_users.call_count == 5 assert mock_backoff_handler.call_count == 4