From 0fc98f4ac4fc36eda32d91f6cafee6f1e9b126a2 Mon Sep 17 00:00:00 2001 From: Uditi Mehta <57388785+uditijmehta@users.noreply.github.com> Date: Wed, 9 Oct 2024 13:54:38 -0400 Subject: [PATCH 01/35] [ENG-6195] Fix admin confirmation link generation and handling (#10734) * Fix admin confirmation link generation and handling * Fix test failures * Refactor confirmation link logic and add unit tests * cleanup unused code --------- Co-authored-by: Uditi Mehta Co-authored-by: Uditi Mehta --- admin/users/views.py | 16 ++++++++--- admin_tests/users/test_views.py | 48 +++++++++++++++++++++++++++++++-- osf/models/user.py | 17 ++++++++++++ 3 files changed, 76 insertions(+), 5 deletions(-) diff --git a/admin/users/views.py b/admin/users/views.py index 69bfa821c5c..1e6d6e3b09a 100644 --- a/admin/users/views.py +++ b/admin/users/views.py @@ -16,6 +16,7 @@ from django.core.mail import send_mail from django.shortcuts import redirect from django.core.paginator import Paginator +from django.core.exceptions import ValidationError from osf.exceptions import UserStateError from osf.models.base import Guid @@ -456,10 +457,19 @@ def get_context_data(self, **kwargs): class GetUserConfirmationLink(GetUserLink): def get_link(self, user): + if user.is_confirmed: + return f'User {user._id} is already confirmed' + + if user.deleted or user.is_merged: + return f'User {user._id} is deleted or merged' + try: - return user.get_confirmation_url(user.username, force=True) - except KeyError as e: - return str(e) + confirmation_link = user.get_or_create_confirmation_url(user.username, force=True, renew=True) + return confirmation_link + except ValidationError: + return f'Invalid email for user {user._id}' + except KeyError: + return 'Could not generate or refresh confirmation link' def get_link_type(self): return 'User Confirmation' diff --git a/admin_tests/users/test_views.py b/admin_tests/users/test_views.py index 80da9721651..cd51459e134 100644 --- a/admin_tests/users/test_views.py +++ b/admin_tests/users/test_views.py @@ -486,10 +486,15 @@ def test_get_user_confirmation_link(self): view = views.GetUserConfirmationLink() view = setup_view(view, request, guid=user._id) + link = view.get_link(user) + + user.refresh_from_db() + user_token = list(user.email_verifications.keys())[0] + ideal_link_path = f'/confirm/{user._id}/{user_token}/' - link = view.get_link(user) - link_path = str(furl(link).path) + + link_path = str(furl(link).path).rstrip('/') + '/' assert link_path == ideal_link_path @@ -511,6 +516,45 @@ def test_get_user_confirmation_link_with_expired_token(self): assert link_path == ideal_link_path + def test_get_user_confirmation_link_generates_new_token_if_expired(self): + user = UnconfirmedUserFactory() + request = RequestFactory().get('/fake_path') + view = views.GetUserConfirmationLink() + view = setup_view(view, request, guid=user._id) + + old_user_token = list(user.email_verifications.keys())[0] + user.email_verifications[old_user_token]['expiration'] = datetime.utcnow().replace(tzinfo=pytz.utc) - timedelta(hours=24) + user.save() + + link = view.get_link(user) + user.refresh_from_db() + + new_user_token = list(user.email_verifications.keys())[0] + + assert new_user_token != old_user_token + + link_path = str(furl(link).path) + ideal_link_path = f'/confirm/{user._id}/{new_user_token}/' + assert link_path == ideal_link_path + + def test_get_user_confirmation_link_does_not_change_unexpired_token(self): + user = UnconfirmedUserFactory() + request = RequestFactory().get('/fake_path') + view = 
views.GetUserConfirmationLink()
+        view = setup_view(view, request, guid=user._id)
+
+        user_token_before = list(user.email_verifications.keys())[0]
+
+        user.email_verifications[user_token_before]['expiration'] = datetime.utcnow().replace(tzinfo=pytz.utc) + timedelta(hours=24)
+        user.save()
+
+        with mock.patch('osf.models.user.OSFUser.get_or_create_confirmation_url') as mock_method:
+            mock_method.return_value = user.get_confirmation_url(user.username, force=False, renew=False)
+            # Exercise the view inside the patch context; without this call
+            # the assertion below passes vacuously.
+            view.get_link(user)
+
+        user_token_after = list(user.email_verifications.keys())[0]
+
+        assert user_token_before == user_token_after
+
     def test_get_password_reset_link(self):
         user = UnconfirmedUserFactory()
         request = RequestFactory().get('/fake_path')

diff --git a/osf/models/user.py b/osf/models/user.py
index 29e10efa991..d0783c208aa 100644
--- a/osf/models/user.py
+++ b/osf/models/user.py
@@ -1342,6 +1342,23 @@ def get_confirmation_url(self, email,
         destination = '?{}'.format(urlencode({'destination': destination})) if destination else ''
         return f'{base}confirm/{external}{self._primary_key}/{token}/{destination}'
 
+    def get_or_create_confirmation_url(self, email, force=False, renew=False):
+        """
+        Get or create a confirmation URL for the given email.
+
+        :param email: The email to generate a confirmation URL for.
+        :param force: Force generating a new confirmation link.
+        :param renew: Renew an expired token.
+        :raises ValidationError: If email is invalid or domain is banned.
+        :return: Confirmation URL for the email.
+        """
+        try:
+            self.get_confirmation_token(email, force=force, renew=renew)
+        except KeyError:
+            self.add_unconfirmed_email(email)
+            self.save()
+        return self.get_confirmation_url(email)
+
     def register(self, username, password=None, accepted_terms_of_service=None):
         """Registers the user.
""" From 5ae3ed2da3c0882734f4408fcfdae9d527bb0f4c Mon Sep 17 00:00:00 2001 From: Uditi Mehta Date: Fri, 20 Sep 2024 13:34:49 -0400 Subject: [PATCH 02/35] Resolve issue with updating preprint fields and validation errors --- api/preprints/serializers.py | 62 ++- .../preprints/views/test_preprint_detail.py | 362 ++++++++++++++++++ osf/models/preprint.py | 50 +-- 3 files changed, 449 insertions(+), 25 deletions(-) diff --git a/api/preprints/serializers.py b/api/preprints/serializers.py index 97cc3f3fb7c..7d3ebdfaa00 100644 --- a/api/preprints/serializers.py +++ b/api/preprints/serializers.py @@ -369,7 +369,67 @@ def update(self, preprint, validated_data): preprint.custom_publication_citation = validated_data['custom_publication_citation'] or None save_preprint = True - self.handle_author_assertions(preprint, validated_data, auth) + if 'has_coi' in validated_data: + try: + preprint.update_has_coi(auth, validated_data['has_coi']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'conflict_of_interest_statement' in validated_data: + try: + preprint.update_conflict_of_interest_statement(auth, validated_data['conflict_of_interest_statement']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'has_data_links' in validated_data: + try: + preprint.update_has_data_links(auth, validated_data['has_data_links']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'why_no_data' in validated_data: + try: + preprint.update_why_no_data(auth, validated_data['why_no_data']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'data_links' in validated_data: + try: + preprint.update_data_links(auth, validated_data['data_links']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'has_prereg_links' in validated_data: + try: + preprint.update_has_prereg_links(auth, validated_data['has_prereg_links']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'why_no_prereg' in validated_data: + try: + preprint.update_why_no_prereg(auth, validated_data['why_no_prereg']) + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'prereg_links' in validated_data: + try: + preprint.update_prereg_links(auth, validated_data['prereg_links']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'prereg_link_info' in validated_data: + try: + preprint.update_prereg_link_info(auth, validated_data['prereg_link_info']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) if published is not None: if not preprint.primary_file: diff --git a/api_tests/preprints/views/test_preprint_detail.py b/api_tests/preprints/views/test_preprint_detail.py index 7e3b279c406..df50db8166d 100644 --- a/api_tests/preprints/views/test_preprint_detail.py +++ b/api_tests/preprints/views/test_preprint_detail.py @@ -835,6 +835,368 @@ def test_update_preprint_task_called_on_api_update( assert mock_on_preprint_updated.called + def test_update_has_coi(self, app, user, preprint, url): + update_payload = build_preprint_update_payload( + preprint._id, + attributes={'has_coi': True} + ) + + contrib = AuthUserFactory() + preprint.add_contributor(contrib, 
READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' + + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_coi'] + + preprint.reload() + assert preprint.has_coi + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_HAS_COI + assert log.params == {'preprint': preprint._id, 'user': user._id, 'value': True} + + def test_update_conflict_of_interest_statement(self, app, user, preprint, url): + update_payload = build_preprint_update_payload( + preprint._id, + attributes={'conflict_of_interest_statement': 'Owns shares in Closed Science Corporation.'} + ) + contrib = AuthUserFactory() + preprint.add_contributor(contrib, READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' + + preprint.has_coi = False + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 200 + assert res.json['data']['attributes']['conflict_of_interest_statement'] ==\ + 'Owns shares in Closed Science Corporation.' + + preprint.has_coi = True + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['conflict_of_interest_statement'] ==\ + 'Owns shares in Closed Science Corporation.' + + preprint.reload() + assert preprint.conflict_of_interest_statement == 'Owns shares in Closed Science Corporation.' + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_COI_STATEMENT + assert log.params == { + 'preprint': preprint._id, + 'user': user._id, + 'value': 'Owns shares in Closed Science Corporation.' + } + + def test_update_has_data_links(self, app, user, preprint, url): + update_payload = build_preprint_update_payload(preprint._id, attributes={'has_data_links': 'available'}) + + contrib = AuthUserFactory() + preprint.add_contributor(contrib, READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' + + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_data_links'] == 'available' + + preprint.reload() + assert preprint.has_data_links + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_HAS_DATA_LINKS + assert log.params == {'value': 'available', 'user': user._id, 'preprint': preprint._id} + + def test_update_why_no_data(self, app, user, preprint, url): + update_payload = build_preprint_update_payload(preprint._id, attributes={'why_no_data': 'My dog ate it.'}) + + contrib = AuthUserFactory() + preprint.add_contributor(contrib, READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' + + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 200 + assert res.json['data']['attributes']['why_no_data'] == 'My dog ate it.' 
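+        # NOTE: at this point in the series the serializer does not yet
+        # cross-validate why_no_data against has_data_links, so the update
+        # above succeeds; patch 03 changes this same request to a 400.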
+ + preprint.has_data_links = 'no' + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['why_no_data'] == 'My dog ate it.' + + preprint.reload() + assert preprint.why_no_data + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_WHY_NO_DATA + assert log.params == {'user': user._id, 'preprint': preprint._id} + + def test_update_data_links(self, app, user, preprint, url): + data_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] + update_payload = build_preprint_update_payload(preprint._id, attributes={'data_links': data_links}) + + contrib = AuthUserFactory() + preprint.add_contributor(contrib, READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' + + preprint.has_data_links = 'no' + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 200 + assert res.json['data']['attributes']['data_links'] == data_links + + preprint.has_data_links = 'available' + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['data_links'] == data_links + + preprint.reload() + assert preprint.data_links == data_links + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_DATA_LINKS + assert log.params == {'user': user._id, 'preprint': preprint._id} + + update_payload = build_preprint_update_payload(preprint._id, attributes={'data_links': 'maformed payload'}) + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'Expected a list of items but got type "str".' + + def test_invalid_data_links(self, app, user, preprint, url): + preprint.has_data_links = 'available' + preprint.save() + + update_payload = build_preprint_update_payload(preprint._id, attributes={'data_links': ['thisaintright']}) + + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'Enter a valid URL.' + + def test_update_has_prereg_links(self, app, user, preprint, url): + update_payload = build_preprint_update_payload(preprint._id, attributes={'has_prereg_links': 'available'}) + + contrib = AuthUserFactory() + preprint.add_contributor(contrib, READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' 
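+        # The 403 above is the generic write-permission rejection for a
+        # read-only contributor; patch 06 later tightens the assertion
+        # fields further, to admins only.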
+ + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_prereg_links'] == 'available' + + preprint.reload() + assert preprint.has_prereg_links + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_HAS_PREREG_LINKS + assert log.params == {'value': 'available', 'user': user._id, 'preprint': preprint._id} + + def test_invalid_prereg_links(self, app, user, preprint, url): + preprint.has_prereg_links = 'available' + preprint.save() + + update_payload = build_preprint_update_payload(preprint._id, attributes={'prereg_links': ['thisaintright']}) + + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'Enter a valid URL.' + + def test_no_data_links_clears_links(self, app, user, preprint, url): + preprint.has_data_links = 'available' + preprint.data_links = ['http://www.apple.com'] + preprint.save() + + update_payload = build_preprint_update_payload(preprint._id, attributes={'has_data_links': 'no'}) + + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_data_links'] == 'no' + assert res.json['data']['attributes']['data_links'] == [] + + preprint.reload() + assert preprint.has_data_links == 'no' + assert preprint.data_links == [] + + def test_no_prereg_links_clears_links(self, app, user, preprint, url): + preprint.has_prereg_links = 'available' + preprint.prereg_links = ['http://example.com'] + preprint.prereg_link_info = 'prereg_analysis' + preprint.save() + + update_payload = build_preprint_update_payload(preprint._id, attributes={'has_prereg_links': 'no'}) + + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_prereg_links'] == 'no' + preprint.reload() + assert res.json['data']['attributes']['prereg_links'] == [] + assert not res.json['data']['attributes']['prereg_link_info'] + + assert preprint.prereg_links == [] + assert preprint.prereg_link_info is None + + def test_update_why_no_prereg(self, app, user, preprint, url): + update_payload = build_preprint_update_payload(preprint._id, attributes={'why_no_prereg': 'My dog ate it.'}) + + contrib = AuthUserFactory() + preprint.add_contributor(contrib, READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' + + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 200 + assert res.json['data']['attributes']['why_no_prereg'] == 'My dog ate it.' + + preprint.has_prereg_links = False + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['why_no_prereg'] == 'My dog ate it.' 
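+        # NOTE: patch 04 reworks this test: updating why_no_prereg alone is
+        # rejected with a 400, and it must instead be sent together with
+        # has_prereg_links='no' in a single payload.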
+ + preprint.reload() + assert preprint.why_no_prereg + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_WHY_NO_PREREG + assert log.params == {'user': user._id, 'preprint': preprint._id} + + def test_update_prereg_links(self, app, user, preprint, url): + + prereg_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] + update_payload = build_preprint_update_payload(preprint._id, attributes={'prereg_links': prereg_links}) + + contrib = AuthUserFactory() + preprint.add_contributor(contrib, READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' + + preprint.has_prereg_links = 'no' + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 200 + assert res.json['data']['attributes']['prereg_links'] == prereg_links + + preprint.has_prereg_links = 'available' + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['prereg_links'] == prereg_links + + preprint.reload() + assert preprint.prereg_links == prereg_links + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_PREREG_LINKS + assert log.params == {'user': user._id, 'preprint': preprint._id} + + update_payload = build_preprint_update_payload(preprint._id, attributes={'prereg_links': 'maformed payload'}) + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'Expected a list of items but got type "str".' + + def test_update_prereg_link_info(self, app, user, preprint, url): + update_payload = build_preprint_update_payload( + preprint._id, + attributes={'prereg_link_info': 'prereg_designs'} + ) + + preprint.has_prereg_links = 'no' + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 200 + assert res.json['data']['attributes']['prereg_link_info'] == 'prereg_designs' + + preprint.has_prereg_links = 'available' + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['prereg_link_info'] == 'prereg_designs' + + preprint.reload() + assert preprint.prereg_link_info == 'prereg_designs' + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_PREREG_LINKS_INFO + assert log.params == {'user': user._id, 'preprint': preprint._id} + + update_payload = build_preprint_update_payload( + preprint._id, + attributes={'prereg_link_info': 'maformed payload'} + ) + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == '"maformed payload" is not a valid choice.' + + def test_sloan_updates(self, app, user, preprint, url): + """ + - Tests to ensure updating a preprint with unchanged data does not create superfluous log statements. + - Tests to ensure various dependent fields can be updated in a single request. 
+ """ + preprint.has_prereg_links = 'available' + preprint.prereg_links = ['http://no-sf.io'] + preprint.prereg_link_info = 'prereg_designs' + preprint.save() + + update_payload = build_preprint_update_payload( + preprint._id, + attributes={ + 'has_prereg_links': 'available', + 'prereg_link_info': 'prereg_designs', + 'prereg_links': ['http://osf.io'], # changing here should be only non-factory created log. + } + ) + app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + # Any superfluous log statements? + logs = preprint.logs.all().values_list('action', 'params') + assert logs.count() == 3 # actions should be: 'subjects_updated', 'published', 'prereg_links_updated' + assert logs.latest() == ('prereg_links_updated', {'user': user._id, 'preprint': preprint._id}) + + # Can we set `has_prereg_links` to false and update `why_no_prereg` in a single request? + update_payload = build_preprint_update_payload( + preprint._id, + attributes={ + 'has_prereg_links': 'no', + 'why_no_prereg': 'My dog ate it.' + } + ) + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_prereg_links'] == 'no' + assert res.json['data']['attributes']['why_no_prereg'] == 'My dog ate it.' + + preprint.refresh_from_db() + assert preprint.has_prereg_links == 'no' + assert preprint.why_no_prereg == 'My dog ate it.' + @pytest.mark.django_db class TestPreprintUpdateSubjects(UpdateSubjectsMixin): diff --git a/osf/models/preprint.py b/osf/models/preprint.py index 4ce426ccaed..87c337532cc 100644 --- a/osf/models/preprint.py +++ b/osf/models/preprint.py @@ -996,6 +996,9 @@ def update_has_coi(self, auth: Auth, has_coi: bool, log: bool = True, save: bool This method brought to you via a grant from the Alfred P Sloan Foundation. 
""" + if has_coi is None: + has_coi = False + if self.has_coi == has_coi: return @@ -1028,17 +1031,14 @@ def update_conflict_of_interest_statement(self, auth: Auth, coi_statement: str, if self.conflict_of_interest_statement == coi_statement: return - if not self.has_coi: - raise PreprintStateError('You do not have the ability to edit a conflict of interest while the has_coi field is ' - 'set to false or unanswered') - - self.conflict_of_interest_statement = coi_statement + self.conflict_of_interest_statement = coi_statement or '' if log: self.add_log( action=PreprintLog.UPDATE_COI_STATEMENT, params={ 'user': auth.user._id, + 'value': self.conflict_of_interest_statement }, auth=auth, ) @@ -1061,6 +1061,9 @@ def update_has_data_links(self, auth: Auth, has_data_links: bool, log: bool = Tr if self.has_data_links == has_data_links: return + if has_data_links == 'no': + self.data_links = [] + self.has_data_links = has_data_links if log: @@ -1072,7 +1075,7 @@ def update_has_data_links(self, auth: Auth, has_data_links: bool, log: bool = Tr }, auth=auth ) - if has_data_links != 'available': + if not has_data_links: self.update_data_links(auth, data_links=[], log=False) if save: self.save() @@ -1093,9 +1096,8 @@ def update_data_links(self, auth: Auth, data_links: list, log: bool = True, save if self.data_links == data_links: return - if not self.has_data_links == 'available' and data_links: - raise PreprintStateError('You cannot edit this statement while your data links availability is set to false' - ' or is unanswered.') + if not self.has_data_links and data_links: + self.data_links = [] self.data_links = data_links @@ -1126,11 +1128,10 @@ def update_why_no_data(self, auth: Auth, why_no_data: str, log: bool = True, sav if self.why_no_data == why_no_data: return - if not self.has_data_links == 'no': - raise PreprintStateError('You cannot edit this statement while your data links availability is set to true or' - ' is unanswered.') - else: - self.why_no_data = why_no_data + if self.has_data_links: + self.why_no_data = '' + + self.why_no_data = why_no_data if log: self.add_log( @@ -1159,6 +1160,10 @@ def update_has_prereg_links(self, auth: Auth, has_prereg_links: bool, log: bool if has_prereg_links == self.has_prereg_links: return + if has_prereg_links == 'no': + self.prereg_links = [] + self.prereg_link_info = None + self.has_prereg_links = has_prereg_links if log: @@ -1170,7 +1175,7 @@ def update_has_prereg_links(self, auth: Auth, has_prereg_links: bool, log: bool }, auth=auth ) - if has_prereg_links != 'available': + if not has_prereg_links: self.update_prereg_links(auth, prereg_links=[], log=False) self.update_prereg_link_info(auth, prereg_link_info=None, log=False) if save: @@ -1192,9 +1197,8 @@ def update_why_no_prereg(self, auth: Auth, why_no_prereg: str, log: bool = True, if why_no_prereg == self.why_no_prereg: return - if self.has_prereg_links == 'available' or self.has_prereg_links is None: - raise PreprintStateError('You cannot edit this statement while your prereg links ' - 'availability is set to true or is unanswered.') + if self.has_prereg_links or self.has_prereg_links is None: + self.why_no_prereg = '' self.why_no_prereg = why_no_prereg @@ -1225,9 +1229,8 @@ def update_prereg_links(self, auth: Auth, prereg_links: list, log: bool = True, if prereg_links == self.prereg_links: return - if not self.has_prereg_links == 'available' and prereg_links: - raise PreprintStateError('You cannot edit this field while your prereg links' - ' availability is set to false or is unanswered.') + if not 
self.has_prereg_links and prereg_links: + self.prereg_links = [] self.prereg_links = prereg_links @@ -1259,9 +1262,8 @@ def update_prereg_link_info(self, auth: Auth, prereg_link_info: str, log: bool = if self.prereg_link_info == prereg_link_info: return - if not self.has_prereg_links == 'available' and prereg_link_info: - raise PreprintStateError('You cannot edit this field while your prereg links' - ' availability is set to false or is unanswered.') + if not self.has_prereg_links and prereg_link_info: + self.prereg_link_info = None self.prereg_link_info = prereg_link_info From 008488affbfcf7ed84ff4e18f085ac2afd43884b Mon Sep 17 00:00:00 2001 From: Uditi Mehta Date: Wed, 2 Oct 2024 15:53:06 -0400 Subject: [PATCH 03/35] Fix PreprintSerializer validation to handle has_coi and has_data_links updates --- api/preprints/serializers.py | 40 +++++++++++++++++++ .../preprints/views/test_preprint_detail.py | 13 +++--- 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/api/preprints/serializers.py b/api/preprints/serializers.py index 7d3ebdfaa00..4562bb901da 100644 --- a/api/preprints/serializers.py +++ b/api/preprints/serializers.py @@ -289,6 +289,46 @@ def update(self, preprint, validated_data): if not preprint.has_permission(auth.user, osf_permissions.WRITE): raise exceptions.PermissionDenied(detail='User must have admin or write permissions to update a preprint.') + save_preprint = False + recently_published = False + + if 'has_coi' in validated_data: + try: + preprint.update_has_coi(auth, validated_data['has_coi']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + conflict_statement = validated_data.get('conflict_of_interest_statement', None) + has_coi = preprint.has_coi + if has_coi is False and conflict_statement: + raise exceptions.ValidationError( + detail='Cannot provide conflict of interest statement when has_coi is set to False.', + ) + + if 'has_data_links' in validated_data: + try: + preprint.update_has_data_links(auth, validated_data['has_data_links']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + why_no_data = validated_data.get('why_no_data', None) + has_data_links = preprint.has_data_links + if has_data_links != 'no' and why_no_data: + raise exceptions.ValidationError( + detail='You cannot edit this statement while your data links availability is set to true or is unanswered.', + ) + + if has_data_links == 'no': + if 'data_links' in validated_data and validated_data['data_links']: + raise exceptions.ValidationError( + detail='Cannot provide data links when has_data_links is set to "no".', + ) + if preprint.data_links: + preprint.update_data_links(auth, []) + save_preprint = True + published = validated_data.pop('is_published', None) if published and preprint.provider.is_reviewed: url = absolute_reverse( diff --git a/api_tests/preprints/views/test_preprint_detail.py b/api_tests/preprints/views/test_preprint_detail.py index df50db8166d..b283345fefd 100644 --- a/api_tests/preprints/views/test_preprint_detail.py +++ b/api_tests/preprints/views/test_preprint_detail.py @@ -873,9 +873,8 @@ def test_update_conflict_of_interest_statement(self, app, user, preprint, url): preprint.save() res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) - assert res.status_code == 200 - assert res.json['data']['attributes']['conflict_of_interest_statement'] ==\ - 'Owns shares in Closed Science Corporation.' 
+ assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'Cannot provide conflict of interest statement when has_coi is set to False.' preprint.has_coi = True preprint.save() @@ -926,8 +925,8 @@ def test_update_why_no_data(self, app, user, preprint, url): res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) - assert res.status_code == 200 - assert res.json['data']['attributes']['why_no_data'] == 'My dog ate it.' + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'You cannot edit this statement while your data links availability is set to true or is unanswered.' preprint.has_data_links = 'no' preprint.save() @@ -956,8 +955,8 @@ def test_update_data_links(self, app, user, preprint, url): preprint.save() res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) - assert res.status_code == 200 - assert res.json['data']['attributes']['data_links'] == data_links + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'Cannot provide data links when has_data_links is set to "no".' preprint.has_data_links = 'available' preprint.save() From 5688949ac7589b3e315c33976db132a82d20e620 Mon Sep 17 00:00:00 2001 From: Uditi Mehta Date: Thu, 3 Oct 2024 13:44:38 -0400 Subject: [PATCH 04/35] Update tests to reflect the corrected validations --- api/preprints/serializers.py | 104 +++++++++++++++--- .../preprints/views/test_preprint_detail.py | 70 ++++++++++-- 2 files changed, 145 insertions(+), 29 deletions(-) diff --git a/api/preprints/serializers.py b/api/preprints/serializers.py index 4562bb901da..cd4680e69fb 100644 --- a/api/preprints/serializers.py +++ b/api/preprints/serializers.py @@ -292,6 +292,51 @@ def update(self, preprint, validated_data): save_preprint = False recently_published = False + for field in ['conflict_of_interest_statement', 'why_no_data', 'why_no_prereg']: + if field in validated_data: + value = validated_data[field] + if isinstance(value, str) and not value.strip(): + validated_data[field] = None + + updated_has_coi = validated_data.get('has_coi', preprint.has_coi) + updated_conflict_statement = validated_data.get('conflict_of_interest_statement', preprint.conflict_of_interest_statement) + + updated_has_data_links = validated_data.get('has_data_links', preprint.has_data_links) + updated_why_no_data = validated_data.get('why_no_data', preprint.why_no_data) + + updated_has_prereg_links = validated_data.get('has_prereg_links', preprint.has_prereg_links) + updated_why_no_prereg = validated_data.get('why_no_prereg', preprint.why_no_prereg) + + if updated_has_coi is False and updated_conflict_statement: + raise exceptions.ValidationError( + detail='Cannot provide conflict of interest statement when has_coi is set to False.', + ) + + if updated_has_data_links != 'no' and updated_why_no_data: + raise exceptions.ValidationError( + detail='You cannot edit this statement while your data links availability is set to true or is unanswered.', + ) + + if updated_has_data_links == 'no' and 'data_links' in validated_data and validated_data['data_links']: + raise exceptions.ValidationError( + detail='Cannot provide data links when has_data_links is set to "no".', + ) + + if updated_has_prereg_links != 'no' and updated_why_no_prereg: + raise exceptions.ValidationError( + detail='You cannot edit this statement while your prereg links availability is set to true or is unanswered.', + ) + + if updated_has_prereg_links != 'available': + if 'prereg_links' in validated_data and 
validated_data['prereg_links']: + raise exceptions.ValidationError( + detail='You cannot edit this field while your prereg links availability is set to false or is unanswered.', + ) + if 'prereg_link_info' in validated_data and validated_data['prereg_link_info']: + raise exceptions.ValidationError( + detail='You cannot edit this field while your prereg links availability is set to false or is unanswered.', + ) + if 'has_coi' in validated_data: try: preprint.update_has_coi(auth, validated_data['has_coi']) @@ -299,12 +344,12 @@ def update(self, preprint, validated_data): except PreprintStateError as e: raise exceptions.ValidationError(detail=str(e)) - conflict_statement = validated_data.get('conflict_of_interest_statement', None) - has_coi = preprint.has_coi - if has_coi is False and conflict_statement: - raise exceptions.ValidationError( - detail='Cannot provide conflict of interest statement when has_coi is set to False.', - ) + if 'conflict_of_interest_statement' in validated_data: + try: + preprint.update_conflict_of_interest_statement(auth, validated_data['conflict_of_interest_statement']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) if 'has_data_links' in validated_data: try: @@ -313,22 +358,45 @@ def update(self, preprint, validated_data): except PreprintStateError as e: raise exceptions.ValidationError(detail=str(e)) - why_no_data = validated_data.get('why_no_data', None) - has_data_links = preprint.has_data_links - if has_data_links != 'no' and why_no_data: - raise exceptions.ValidationError( - detail='You cannot edit this statement while your data links availability is set to true or is unanswered.', - ) + if 'why_no_data' in validated_data: + try: + preprint.update_why_no_data(auth, validated_data['why_no_data']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) - if has_data_links == 'no': - if 'data_links' in validated_data and validated_data['data_links']: - raise exceptions.ValidationError( - detail='Cannot provide data links when has_data_links is set to "no".', - ) - if preprint.data_links: + if 'data_links' in validated_data: + try: + preprint.update_data_links(auth, validated_data['data_links']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + else: + if updated_has_data_links == 'no' and preprint.data_links: preprint.update_data_links(auth, []) save_preprint = True + if 'why_no_prereg' in validated_data: + try: + preprint.update_why_no_prereg(auth, validated_data['why_no_prereg']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'prereg_links' in validated_data: + try: + preprint.update_prereg_links(auth, validated_data['prereg_links']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'prereg_link_info' in validated_data: + try: + preprint.update_prereg_link_info(auth, validated_data['prereg_link_info']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + published = validated_data.pop('is_published', None) if published and preprint.provider.is_reviewed: url = absolute_reverse( diff --git a/api_tests/preprints/views/test_preprint_detail.py b/api_tests/preprints/views/test_preprint_detail.py index b283345fefd..5426e4023eb 100644 --- a/api_tests/preprints/views/test_preprint_detail.py +++ 
b/api_tests/preprints/views/test_preprint_detail.py @@ -1063,21 +1063,26 @@ def test_update_why_no_prereg(self, app, user, preprint, url): assert res.status_code == 403 assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' + preprint.has_prereg_links = 'available' + preprint.save() res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'You cannot edit this statement while your prereg links availability is set to true or is unanswered.' - assert res.status_code == 200 - assert res.json['data']['attributes']['why_no_prereg'] == 'My dog ate it.' - - preprint.has_prereg_links = False - preprint.save() + update_payload = build_preprint_update_payload(preprint._id, attributes={ + 'why_no_prereg': 'My dog ate it.', + 'has_prereg_links': 'no' + }) res = app.patch_json_api(url, update_payload, auth=user.auth) assert res.status_code == 200 assert res.json['data']['attributes']['why_no_prereg'] == 'My dog ate it.' preprint.reload() - assert preprint.why_no_prereg - log = preprint.logs.first() + assert preprint.why_no_prereg == 'My dog ate it.' + + log = preprint.logs.filter(action=PreprintLog.UPDATE_WHY_NO_PREREG).first() + assert log is not None, 'Expected log entry for why_no_prereg_updated not found.' assert log.action == PreprintLog.UPDATE_WHY_NO_PREREG assert log.params == {'user': user._id, 'preprint': preprint._id} @@ -1096,8 +1101,8 @@ def test_update_prereg_links(self, app, user, preprint, url): preprint.save() res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) - assert res.status_code == 200 - assert res.json['data']['attributes']['prereg_links'] == prereg_links + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'You cannot edit this field while your prereg links availability is set to false or is unanswered.' preprint.has_prereg_links = 'available' preprint.save() @@ -1128,8 +1133,8 @@ def test_update_prereg_link_info(self, app, user, preprint, url): preprint.save() res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) - assert res.status_code == 200 - assert res.json['data']['attributes']['prereg_link_info'] == 'prereg_designs' + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'You cannot edit this field while your prereg links availability is set to false or is unanswered.' preprint.has_prereg_links = 'available' preprint.save() @@ -1153,6 +1158,49 @@ def test_update_prereg_link_info(self, app, user, preprint, url): assert res.status_code == 400 assert res.json['errors'][0]['detail'] == '"maformed payload" is not a valid choice.' 
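+
+    # The two tests below cover the cross-field validation added in this
+    # patch: has_coi paired with conflict_of_interest_statement, and
+    # has_data_links paired with data_links, sent in a single payload.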
+ def test_update_has_coi_false_with_null_conflict_statement(self, app, user, preprint, url): + update_payload = build_preprint_update_payload( + preprint._id, + attributes={ + 'has_coi': False, + 'conflict_of_interest_statement': None + } + ) + + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_coi'] is False + assert res.json['data']['attributes']['conflict_of_interest_statement'] is None + + preprint.reload() + assert preprint.has_coi is False + assert preprint.conflict_of_interest_statement is None + + def test_update_has_data_links_no_with_data_links_provided(self, app, user, preprint, url): + update_payload = build_preprint_update_payload( + preprint._id, + attributes={ + 'has_data_links': 'no', + 'data_links': ['http://example.com/data'] + } + ) + + initial_has_data_links = preprint.has_data_links + initial_data_links = preprint.data_links + + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'Cannot provide data links when has_data_links is set to "no".' + + preprint.reload() + + assert preprint.has_data_links == initial_has_data_links + assert preprint.data_links == initial_data_links + + assert preprint.has_data_links != 'no' + def test_sloan_updates(self, app, user, preprint, url): """ - Tests to ensure updating a preprint with unchanged data does not create superfluous log statements. From 5c02749a97605e7afd727f10f528630727f507c4 Mon Sep 17 00:00:00 2001 From: Uditi Mehta Date: Thu, 3 Oct 2024 16:38:06 -0400 Subject: [PATCH 05/35] Add tests for data links and preregistration validation --- .../preprints/views/test_preprint_detail.py | 42 +++ .../test_preprint_detail_author_assertions.py | 300 ------------------ 2 files changed, 42 insertions(+), 300 deletions(-) delete mode 100644 api_tests/preprints/views/test_preprint_detail_author_assertions.py diff --git a/api_tests/preprints/views/test_preprint_detail.py b/api_tests/preprints/views/test_preprint_detail.py index 5426e4023eb..3106835a940 100644 --- a/api_tests/preprints/views/test_preprint_detail.py +++ b/api_tests/preprints/views/test_preprint_detail.py @@ -18,6 +18,7 @@ from osf.models import ( NodeLicense, PreprintContributor, + PreprintLog ) from osf.utils.permissions import WRITE from osf.utils.workflows import DefaultStates @@ -1201,6 +1202,47 @@ def test_update_has_data_links_no_with_data_links_provided(self, app, user, prep assert preprint.has_data_links != 'no' + def test_update_has_data_links_no_with_empty_data_links(self, app, user, preprint, url): + update_payload = build_preprint_update_payload( + preprint._id, + attributes={ + 'has_data_links': 'no', + 'data_links': [] + } + ) + + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_data_links'] == 'no' + assert res.json['data']['attributes']['data_links'] == [] + + preprint.reload() + assert preprint.has_data_links == 'no' + assert preprint.data_links == [] + + def test_update_has_prereg_links_no_with_empty_prereg_links(self, app, user, preprint, url): + update_payload = build_preprint_update_payload( + preprint._id, + attributes={ + 'has_prereg_links': 'no', + 'prereg_links': [], + 'prereg_link_info': '' + } + ) + + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_prereg_links'] == 
'no' + assert res.json['data']['attributes']['prereg_links'] == [] + assert res.json['data']['attributes']['prereg_link_info'] == '' + + preprint.reload() + assert preprint.has_prereg_links == 'no' + assert preprint.prereg_links == [] + assert preprint.prereg_link_info == '' + def test_sloan_updates(self, app, user, preprint, url): """ - Tests to ensure updating a preprint with unchanged data does not create superfluous log statements. diff --git a/api_tests/preprints/views/test_preprint_detail_author_assertions.py b/api_tests/preprints/views/test_preprint_detail_author_assertions.py deleted file mode 100644 index 63dc8696d41..00000000000 --- a/api_tests/preprints/views/test_preprint_detail_author_assertions.py +++ /dev/null @@ -1,300 +0,0 @@ -import pytest - -from osf.utils.permissions import READ, WRITE, ADMIN -from api.base.settings.defaults import API_BASE -from osf.models import PreprintLog -from osf_tests.factories import PreprintFactory, AuthUserFactory - - -def build_preprint_update_payload( - node_id, attributes=None, relationships=None, - jsonapi_type='preprints'): - payload = { - 'data': { - 'id': node_id, - 'type': jsonapi_type, - 'attributes': attributes, - 'relationships': relationships - } - } - return payload - - -@pytest.mark.django_db -@pytest.mark.enable_enqueue_task -class TestPreprintUpdateWithAuthorAssertion: - - @pytest.fixture() - def user(self): - return AuthUserFactory() - - @pytest.fixture() - def preprint(self, user): - """ - Creator is not admin permission - """ - preprint = PreprintFactory(creator=user) - admin = AuthUserFactory() - preprint.add_contributor(admin, ADMIN) - preprint.add_contributor(user, READ) - return preprint - - @pytest.fixture() - def url(self, preprint): - return f'/{API_BASE}preprints/{preprint._id}/' - - @pytest.fixture() - def read_contrib(self, preprint): - contrib = AuthUserFactory() - preprint.add_contributor(contrib, READ) - return contrib - - @pytest.fixture() - def write_contrib(self, preprint): - contrib = AuthUserFactory() - preprint.add_contributor(contrib, WRITE) - return contrib - - @pytest.fixture() - def admin_contrib(self, preprint): - contrib = AuthUserFactory() - preprint.add_contributor(contrib, ADMIN) - return contrib - - def assert_permission(self, app, url, contrib, attributes, expected_status): - update_payload = build_preprint_update_payload(node_id=contrib._id, attributes=attributes) - res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) - assert res.status_code == expected_status - - # Testing permissions for updating has_coi - def test_update_has_coi_permission_denied(self, app, read_contrib, url): - self.assert_permission(app, url, read_contrib, {'has_coi': True}, 403) - - def test_update_has_coi_permission_granted_write(self, app, write_contrib, url): - self.assert_permission(app, url, write_contrib, {'has_coi': True}, 403) - - def test_update_has_coi_permission_granted_admin(self, app, admin_contrib, url): - self.assert_permission(app, url, admin_contrib, {'has_coi': True}, 200) - - def test_update_has_coi_permission_granted_creator(self, app, user, url): - self.assert_permission(app, url, user, {'has_coi': True}, 403) - - # Testing permissions for updating conflict_of_interest_statement - def test_update_conflict_of_interest_statement_permission_denied(self, app, read_contrib, url): - self.assert_permission(app, url, read_contrib, {'conflict_of_interest_statement': 'Test'}, 403) - - def test_update_conflict_of_interest_statement_permission_granted_write(self, app, 
write_contrib, preprint, url): - preprint.has_coi = True - preprint.save() - self.assert_permission(app, url, write_contrib, {'conflict_of_interest_statement': 'Test'}, 403) - - def test_update_conflict_of_interest_statement_permission_granted_admin(self, app, admin_contrib, preprint, url): - preprint.has_coi = True - preprint.save() - self.assert_permission(app, url, admin_contrib, {'conflict_of_interest_statement': 'Test'}, 200) - - def test_update_conflict_of_interest_statement_permission_granted_creator(self, app, user, preprint, url): - preprint.has_coi = True - preprint.save() - self.assert_permission(app, url, user, {'conflict_of_interest_statement': 'Test'}, 403) - - # Testing permissions for updating has_data_links - def test_update_has_data_links_permission_denied(self, app, read_contrib, url): - self.assert_permission(app, url, read_contrib, {'has_data_links': 'available'}, 403) - - def test_update_has_data_links_permission_granted_write(self, app, write_contrib, url): - self.assert_permission(app, url, write_contrib, {'has_data_links': 'available'}, 403) - - def test_update_has_data_links_permission_granted_admin(self, app, admin_contrib, url): - self.assert_permission(app, url, admin_contrib, {'has_data_links': 'available'}, 200) - - def test_update_has_data_links_permission_granted_creator(self, app, user, url): - self.assert_permission(app, url, user, {'has_data_links': 'available'}, 403) - - # Testing permissions for updating why_no_data - def test_update_why_no_data_permission_denied(self, app, read_contrib, url): - self.assert_permission(app, url, read_contrib, {'why_no_data': 'My dog ate it.'}, 403) - - def test_update_why_no_data_permission_granted_write(self, app, write_contrib, preprint, url): - preprint.has_data_links = 'no' - preprint.save() - self.assert_permission(app, url, write_contrib, {'why_no_data': 'My dog ate it.'}, 403) - - def test_update_why_no_data_permission_granted_admin(self, app, admin_contrib, preprint, url): - preprint.has_data_links = 'no' - preprint.save() - self.assert_permission(app, url, admin_contrib, {'why_no_data': 'My dog ate it.'}, 200) - - def test_update_why_no_data_permission_granted_creator(self, app, user, preprint, url): - preprint.has_data_links = 'no' - preprint.save() - self.assert_permission(app, url, user, {'why_no_data': 'My dog ate it.'}, 403) - - # Testing permissions for updating data_links - def test_update_data_links_permission_denied(self, app, read_contrib, url): - data_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - self.assert_permission(app, url, read_contrib, {'data_links': data_links}, 403) - - def test_update_data_links_permission_granted_write(self, app, write_contrib, preprint, url): - data_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - preprint.has_data_links = 'available' - preprint.save() - self.assert_permission(app, url, write_contrib, {'data_links': data_links}, 403) - - def test_update_data_links_permission_granted_admin(self, app, admin_contrib, preprint, url): - data_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - preprint.has_data_links = 'available' - preprint.save() - self.assert_permission(app, url, admin_contrib, {'data_links': data_links}, 200) - - def test_update_data_links_permission_granted_creator(self, app, user, preprint, url): - data_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - preprint.has_data_links = 'available' - preprint.save() - self.assert_permission(app, url, user, 
{'data_links': data_links}, 403) - - def test_update_data_links_invalid_payload(self, app, admin_contrib, url): - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'data_links': 'maformed payload'}) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == 'Expected a list of items but got type "str".' - - def test_update_data_links_invalid_url(self, app, admin_contrib, preprint, url): - preprint.has_data_links = 'available' - preprint.save() - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'data_links': ['thisaintright']}) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == 'Enter a valid URL.' - - # Testing permissions for updating has_prereg_links - def test_update_has_prereg_links_permission_denied(self, app, read_contrib, url): - self.assert_permission(app, url, read_contrib, {'has_prereg_links': 'available'}, 403) - - def test_update_has_prereg_links_permission_granted_write(self, app, write_contrib, url): - self.assert_permission(app, url, write_contrib, {'has_prereg_links': 'available'}, 403) - - def test_update_has_prereg_links_permission_granted_admin(self, app, admin_contrib, url): - self.assert_permission(app, url, admin_contrib, {'has_prereg_links': 'available'}, 200) - - def test_update_has_prereg_links_permission_granted_creator(self, app, user, url): - self.assert_permission(app, url, user, {'has_prereg_links': 'available'}, 403) - - # Testing permissions for updating prereg_links - def test_update_prereg_links_permission_denied(self, app, read_contrib, url): - prereg_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - self.assert_permission(app, url, read_contrib, {'prereg_links': prereg_links}, 403) - - def test_update_prereg_links_permission_granted_write(self, app, write_contrib, preprint, url): - prereg_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - preprint.has_prereg_links = 'available' - preprint.save() - self.assert_permission(app, url, write_contrib, {'prereg_links': prereg_links}, 403) - - def test_update_prereg_links_permission_granted_admin(self, app, admin_contrib, preprint, url): - prereg_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - preprint.has_prereg_links = 'available' - preprint.save() - self.assert_permission(app, url, admin_contrib, {'prereg_links': prereg_links}, 200) - - def test_update_prereg_links_permission_granted_creator(self, app, user, preprint, url): - prereg_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - preprint.has_prereg_links = 'available' - preprint.save() - self.assert_permission(app, url, user, {'prereg_links': prereg_links}, 403) - - def test_update_prereg_links_invalid_payload(self, app, admin_contrib, url): - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'prereg_links': 'maformed payload'}) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == 'Expected a list of items but got type "str".' 
- - def test_update_prereg_links_invalid_url(self, app, admin_contrib, preprint, url): - preprint.has_prereg_links = 'available' - preprint.save() - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'prereg_links': ['thisaintright']}) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == 'Enter a valid URL.' - - def test_update_prereg_link_info_fail_prereg_links(self, app, admin_contrib, preprint, url): - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'prereg_link_info': 'prereg_designs'}) - preprint.has_prereg_links = 'no' - preprint.save() - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == 'You cannot edit this field while your prereg links availability is set to false or is unanswered.' - - def test_update_prereg_link_info_success(self, app, admin_contrib, preprint, url): - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'prereg_link_info': 'prereg_designs'}) - preprint.has_prereg_links = 'available' - preprint.save() - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth) - assert res.status_code == 200 - assert res.json['data']['attributes']['prereg_link_info'] == 'prereg_designs' - preprint.reload() - assert preprint.prereg_link_info == 'prereg_designs' - log = preprint.logs.first() - assert log.action == PreprintLog.UPDATE_PREREG_LINKS_INFO - assert log.params == {'user': admin_contrib._id, 'preprint': preprint._id} - - def test_update_prereg_link_info_invalid_payload(self, app, admin_contrib, url): - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'prereg_link_info': 'maformed payload'}) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == '"maformed payload" is not a valid choice.' 
- - def test_no_prereg_links_clears_links(self, app, admin_contrib, preprint, url): - preprint.has_prereg_links = 'available' - preprint.prereg_links = ['http://example.com'] - preprint.prereg_link_info = 'prereg_analysis' - preprint.save() - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'has_prereg_links': 'no'}) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth) - assert res.status_code == 200 - assert res.json['data']['attributes']['has_prereg_links'] == 'no' - assert res.json['data']['attributes']['prereg_links'] == [] - assert not res.json['data']['attributes']['prereg_link_info'] - - def test_no_data_links_clears_links(self, app, admin_contrib, preprint, url): - preprint.has_data_links = 'available' - preprint.data_links = ['http://www.apple.com'] - preprint.save() - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'has_data_links': 'no'}) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth) - assert res.status_code == 200 - assert res.json['data']['attributes']['has_data_links'] == 'no' - assert res.json['data']['attributes']['data_links'] == [] - - def test_sloan_updates(self, app, admin_contrib, preprint, url): - preprint.has_prereg_links = 'available' - preprint.prereg_links = ['http://no-sf.io'] - preprint.prereg_link_info = 'prereg_designs' - preprint.save() - update_payload = build_preprint_update_payload( - node_id=preprint._id, - attributes={ - 'has_prereg_links': 'available', - 'prereg_link_info': 'prereg_designs', - 'prereg_links': ['http://osf.io'], - } - ) - app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - logs = preprint.logs.all().values_list('action', 'params') - assert logs.count() == 5 - assert logs.latest() == ('prereg_links_updated', {'user': admin_contrib._id, 'preprint': preprint._id}) - - update_payload = build_preprint_update_payload( - node_id=preprint._id, - attributes={ - 'has_prereg_links': 'no', - 'why_no_prereg': 'My dog ate it.' - } - ) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - assert res.status_code == 200 - assert res.json['data']['attributes']['has_prereg_links'] == 'no' - assert res.json['data']['attributes']['why_no_prereg'] == 'My dog ate it.' - preprint.refresh_from_db() - assert preprint.has_prereg_links == 'no' - assert preprint.why_no_prereg == 'My dog ate it.' 
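
Patches 06 and 07 then gate every author-assertion field behind admin
permission and remove the duplicated update blocks. The shape of that check,
as a minimal standalone sketch — assuming a plain dict of validated data and
a boolean admin flag; ASSERTION_FIELDS and check_assertion_permissions are
illustrative names only, and the real serializer raises DRF's
exceptions.PermissionDenied from a require_admin_permission() helper:

ASSERTION_FIELDS = {
    'has_coi', 'conflict_of_interest_statement',
    'has_data_links', 'why_no_data', 'data_links',
    'has_prereg_links', 'why_no_prereg', 'prereg_links', 'prereg_link_info',
}

def check_assertion_permissions(validated_data, is_admin):
    # Reject the whole update if a non-admin touches any assertion field.
    touched = ASSERTION_FIELDS & validated_data.keys()
    if touched and not is_admin:
        raise PermissionError(
            'Must have admin permissions to update author assertion fields.'
        )

For example, check_assertion_permissions({'has_coi': True}, is_admin=False)
raises, while payloads that touch no assertion field pass through untouched.
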
From 5ffde1ba32541a0fe50dc55209de94c8ce25c625 Mon Sep 17 00:00:00 2001 From: Uditi Mehta Date: Wed, 9 Oct 2024 10:40:50 -0400 Subject: [PATCH 06/35] Add admin permission checks --- api/preprints/serializers.py | 11 +++++++++++ .../preprints/views/test_preprint_detail.py | 17 +++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/api/preprints/serializers.py b/api/preprints/serializers.py index cd4680e69fb..28ca617ee70 100644 --- a/api/preprints/serializers.py +++ b/api/preprints/serializers.py @@ -337,7 +337,12 @@ def update(self, preprint, validated_data): detail='You cannot edit this field while your prereg links availability is set to false or is unanswered.', ) + def require_admin_permission(): + if not preprint.has_permission(auth.user, osf_permissions.ADMIN): + raise exceptions.PermissionDenied(detail='Must have admin permissions to update author assertion fields.') + if 'has_coi' in validated_data: + require_admin_permission() try: preprint.update_has_coi(auth, validated_data['has_coi']) save_preprint = True @@ -345,6 +350,7 @@ def update(self, preprint, validated_data): raise exceptions.ValidationError(detail=str(e)) if 'conflict_of_interest_statement' in validated_data: + require_admin_permission() try: preprint.update_conflict_of_interest_statement(auth, validated_data['conflict_of_interest_statement']) save_preprint = True @@ -352,6 +358,7 @@ def update(self, preprint, validated_data): raise exceptions.ValidationError(detail=str(e)) if 'has_data_links' in validated_data: + require_admin_permission() try: preprint.update_has_data_links(auth, validated_data['has_data_links']) save_preprint = True @@ -366,6 +373,7 @@ def update(self, preprint, validated_data): raise exceptions.ValidationError(detail=str(e)) if 'data_links' in validated_data: + require_admin_permission() try: preprint.update_data_links(auth, validated_data['data_links']) save_preprint = True @@ -377,6 +385,7 @@ def update(self, preprint, validated_data): save_preprint = True if 'why_no_prereg' in validated_data: + require_admin_permission() try: preprint.update_why_no_prereg(auth, validated_data['why_no_prereg']) save_preprint = True @@ -384,6 +393,7 @@ def update(self, preprint, validated_data): raise exceptions.ValidationError(detail=str(e)) if 'prereg_links' in validated_data: + require_admin_permission() try: preprint.update_prereg_links(auth, validated_data['prereg_links']) save_preprint = True @@ -391,6 +401,7 @@ def update(self, preprint, validated_data): raise exceptions.ValidationError(detail=str(e)) if 'prereg_link_info' in validated_data: + require_admin_permission() try: preprint.update_prereg_link_info(auth, validated_data['prereg_link_info']) save_preprint = True diff --git a/api_tests/preprints/views/test_preprint_detail.py b/api_tests/preprints/views/test_preprint_detail.py index 3106835a940..ffec9722514 100644 --- a/api_tests/preprints/views/test_preprint_detail.py +++ b/api_tests/preprints/views/test_preprint_detail.py @@ -20,6 +20,7 @@ PreprintContributor, PreprintLog ) +from osf.utils import permissions as osf_permissions from osf.utils.permissions import WRITE from osf.utils.workflows import DefaultStates from osf_tests.factories import ( @@ -1243,6 +1244,22 @@ def test_update_has_prereg_links_no_with_empty_prereg_links(self, app, user, pre assert preprint.prereg_links == [] assert preprint.prereg_link_info == '' + def test_non_admin_cannot_update_has_coi(self, app, user, preprint, url): + write_contrib = AuthUserFactory() + preprint.add_contributor(write_contrib, 
permissions=osf_permissions.WRITE, auth=Auth(user), save=True)
+
+        update_payload = build_preprint_update_payload(
+            preprint._id,
+            attributes={'has_coi': True}
+        )
+
+        res = app.patch_json_api(url, update_payload, auth=write_contrib.auth, expect_errors=True)
+        assert res.status_code == 403
+        assert res.json['errors'][0]['detail'] == 'Must have admin permissions to update author assertion fields.'
+
+        preprint.reload()
+        assert preprint.has_coi is None
+
     def test_sloan_updates(self, app, user, preprint, url):
         """
         - Tests to ensure updating a preprint with unchanged data does not create superfluous log statements.

From 59fab7d0f19f15784f50dac9c4f1c4932fae8018 Mon Sep 17 00:00:00 2001
From: Uditi Mehta
Date: Thu, 10 Oct 2024 10:12:32 -0400
Subject: [PATCH 07/35] remove redundant and duplicate code

---
 api/preprints/serializers.py | 142 +++--------------------------------
 1 file changed, 10 insertions(+), 132 deletions(-)

diff --git a/api/preprints/serializers.py b/api/preprints/serializers.py
index 28ca617ee70..b8ad259aa3e 100644
--- a/api/preprints/serializers.py
+++ b/api/preprints/serializers.py
@@ -366,6 +366,7 @@ def require_admin_permission():
                 raise exceptions.ValidationError(detail=str(e))

         if 'why_no_data' in validated_data:
+            require_admin_permission()
             try:
                 preprint.update_why_no_data(auth, validated_data['why_no_data'])
                 save_preprint = True
@@ -384,6 +385,15 @@ def require_admin_permission():
             preprint.update_data_links(auth, [])
             save_preprint = True

+        if 'has_prereg_links' in validated_data:
+            require_admin_permission()
+
+            try:
+                preprint.update_has_prereg_links(auth, validated_data['has_prereg_links'])
+                save_preprint = True
+            except PreprintStateError as e:
+                raise exceptions.ValidationError(detail=str(e))
+
         if 'why_no_prereg' in validated_data:
             require_admin_permission()
             try:
@@ -488,68 +498,6 @@ def require_admin_permission():
             preprint.custom_publication_citation = validated_data['custom_publication_citation'] or None
             save_preprint = True

-        if 'has_coi' in validated_data:
-            try:
-                preprint.update_has_coi(auth, validated_data['has_coi'])
-                save_preprint = True
-            except PreprintStateError as e:
-                raise exceptions.ValidationError(detail=str(e))
-
-        if 'conflict_of_interest_statement' in validated_data:
-            try:
-                preprint.update_conflict_of_interest_statement(auth, validated_data['conflict_of_interest_statement'])
-                save_preprint = True
-            except PreprintStateError as e:
-                raise exceptions.ValidationError(detail=str(e))
-
-        if 'has_data_links' in validated_data:
-            try:
-                preprint.update_has_data_links(auth, validated_data['has_data_links'])
-                save_preprint = True
-            except PreprintStateError as e:
-                raise exceptions.ValidationError(detail=str(e))
-
-        if 'why_no_data' in validated_data:
-            try:
-                preprint.update_why_no_data(auth, validated_data['why_no_data'])
-                save_preprint = True
-            except PreprintStateError as e:
-                raise exceptions.ValidationError(detail=str(e))
-
-        if 'data_links' in validated_data:
-            try:
-                preprint.update_data_links(auth, validated_data['data_links'])
-                save_preprint = True
-            except PreprintStateError as e:
-                raise exceptions.ValidationError(detail=str(e))
-
-        if 'has_prereg_links' in validated_data:
-            try:
-                preprint.update_has_prereg_links(auth, validated_data['has_prereg_links'])
-                save_preprint = True
-            except PreprintStateError as e:
-                raise exceptions.ValidationError(detail=str(e))
-
-        if 'why_no_prereg' in validated_data:
-            try:
-                preprint.update_why_no_prereg(auth, validated_data['why_no_prereg'])
-            except PreprintStateError as e:
-                raise
exceptions.ValidationError(detail=str(e)) - - if 'prereg_links' in validated_data: - try: - preprint.update_prereg_links(auth, validated_data['prereg_links']) - save_preprint = True - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'prereg_link_info' in validated_data: - try: - preprint.update_prereg_link_info(auth, validated_data['prereg_link_info']) - save_preprint = True - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - if published is not None: if not preprint.primary_file: raise exceptions.ValidationError( @@ -575,76 +523,6 @@ def require_admin_permission(): return preprint - def handle_author_assertions(self, preprint, validated_data, auth): - author_assertions = { - 'has_coi', - 'conflict_of_interest_statement', - 'has_data_links', - 'why_no_data', - 'data_links', - 'why_no_prereg', - 'prereg_links', - 'has_prereg_links', - 'prereg_link_info', - } - if author_assertions & validated_data.keys(): - if not preprint.is_admin_contributor(auth.user): - raise exceptions.PermissionDenied('User must be admin to add author assertions') - - if 'has_coi' in validated_data: - try: - preprint.update_has_coi(auth, validated_data['has_coi']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'conflict_of_interest_statement' in validated_data: - try: - preprint.update_conflict_of_interest_statement(auth, validated_data['conflict_of_interest_statement']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'has_data_links' in validated_data: - try: - preprint.update_has_data_links(auth, validated_data['has_data_links']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'why_no_data' in validated_data: - try: - preprint.update_why_no_data(auth, validated_data['why_no_data']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'data_links' in validated_data: - try: - preprint.update_data_links(auth, validated_data['data_links']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'has_prereg_links' in validated_data: - try: - preprint.update_has_prereg_links(auth, validated_data['has_prereg_links']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'why_no_prereg' in validated_data: - try: - preprint.update_why_no_prereg(auth, validated_data['why_no_prereg']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'prereg_links' in validated_data: - try: - preprint.update_prereg_links(auth, validated_data['prereg_links']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'prereg_link_info' in validated_data: - try: - preprint.update_prereg_link_info(auth, validated_data['prereg_link_info']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - def set_field(self, func, val, auth, save=False): try: func(val, auth) From 53206aadfbfb64cd6bb13d5bc54abc74289a27af Mon Sep 17 00:00:00 2001 From: Longze Chen Date: Wed, 30 Oct 2024 12:44:08 -0400 Subject: [PATCH 08/35] Update changelog and bump version --- CHANGELOG | 5 +++++ package.json | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 205d47de13c..94705b38a79 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,11 @@ We follow the CalVer (https://calver.org/) versioning scheme: YY.MINOR.MICRO. 
+24.08.0 (2024-10-30) +==================== + +- Fix admin confirmation link generation and handling + 24.07.0 (2024-09-19) ==================== diff --git a/package.json b/package.json index be5c3b44a30..8b0edd12961 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "OSF", - "version": "24.07.0", + "version": "24.08.0", "description": "Facilitating Open Science", "repository": "https://github.com/CenterForOpenScience/osf.io", "author": "Center for Open Science", From 78bc9e822a2741d4a7b2eb7fd2fb0980a4d05ca4 Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Mon, 4 Nov 2024 09:40:04 -0500 Subject: [PATCH 09/35] Handle edge case in confirmation link generation --- osf/models/user.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/osf/models/user.py b/osf/models/user.py index d0783c208aa..22bbfc5baf9 100644 --- a/osf/models/user.py +++ b/osf/models/user.py @@ -1227,12 +1227,12 @@ def update_guessed_names(self): self.family_name = parsed['family'] self.suffix = parsed['suffix'] - def add_unconfirmed_email(self, email, expiration=None, external_identity=None): + def add_unconfirmed_email(self, email, expiration=None, external_identity=None, force=False): """ Add an email verification token for a given email. :param email: the email to confirm - :param email: overwrite default expiration time + :param expiration: overwrite default expiration time :param external_identity: the user's external identity :return: a token :raises: ValueError if email already confirmed, except for login through external idp. @@ -1249,7 +1249,8 @@ def add_unconfirmed_email(self, email, expiration=None, external_identity=None): validate_email(email) if not external_identity and self.emails.filter(address=email).exists(): - raise ValueError('Email already confirmed to this user.') + if not force or self.is_confirmed: + raise ValueError('Email already confirmed to this user.') # If the unconfirmed email is already present, refresh the token if email in self.unconfirmed_emails: @@ -1304,14 +1305,14 @@ def get_confirmation_token(self, email, force=False, renew=False): # assume the token is expired expiration = info.get('expiration') if renew: - new_token = self.add_unconfirmed_email(email) + new_token = self.add_unconfirmed_email(email, force=force) self.save() return new_token if not expiration or (expiration and expiration < timezone.now()): if not force: raise ExpiredTokenError(f'Token for email "{email}" is expired') else: - new_token = self.add_unconfirmed_email(email) + new_token = self.add_unconfirmed_email(email, force=force) self.save() return new_token return token @@ -1355,7 +1356,7 @@ def get_or_create_confirmation_url(self, email, force=False, renew=False): try: self.get_confirmation_token(email, force=force, renew=renew) except KeyError: - self.add_unconfirmed_email(email) + self.add_unconfirmed_email(email, force=force) self.save() return self.get_confirmation_url(email) From b49605351e1e3a35941bfb831548f8ee460bf6e0 Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Mon, 14 Oct 2024 15:40:59 -0400 Subject: [PATCH 10/35] Add change_node_region script [ENG-5242] --- osf/management/commands/change_node_region.py | 160 ++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 osf/management/commands/change_node_region.py diff --git a/osf/management/commands/change_node_region.py b/osf/management/commands/change_node_region.py new file mode 100644 index 00000000000..abce28672bf --- /dev/null +++ b/osf/management/commands/change_node_region.py @@ -0,0 
+1,160 @@ +import logging +import json + +from django.core.management.base import BaseCommand +from django.db import transaction +from google.cloud.storage.client import Client +from google.oauth2.service_account import Credentials + +from osf.models import AbstractNode +from osf.utils.migrations import disable_auto_now_fields +from addons.osfstorage.models import Region + +logger = logging.getLogger(__name__) + +def _get_file_block_map(node): + file_block_map = {} + file_input_qids = node.registration_schema.schema_blocks.filter( + block_type='file-input' + ).values_list('registration_response_key', flat=True) + for schema_response in node.schema_responses.all(): + for block in schema_response.response_blocks.filter(schema_key__in=file_input_qids): + for file_response in block.response: + if file_block_map.get(file_response['file_id'], False): + file_block_map[file_response['file_id']].append(block) + else: + file_block_map[file_response['file_id']] = [block] + return file_block_map + +def _update_blocks(file_block_map, original_id, cloned_id): + for block in file_block_map[original_id]: + logger.info(f'Updating block {block._id} file info') + response = [] + for file_response in block.response: + if original_id == file_response['file_id']: + for key in file_response['file_urls'].keys(): + file_response['file_urls'][key] = file_response['file_urls'][key].replace(original_id, cloned_id) + response.append(file_response) + block.response = response + block.save() + +def _update_schema_meta(node): + logger.info('Updating legacy schema information...') + node.registration_responses = node.schema_responses.latest('-created').all_responses + node.registered_meta[node.registration_schema._id] = node.expand_registration_responses() + node.save() + logger.info('Updated legacy schema information.') + +def _copy_and_clone_versions(original_file, cloned_file, src_bucket, dest_bucket, dest_bucket_name, dest_region): + for v in original_file.versions.order_by('identifier').all(): + blob_hash = v.location['object'] + logger.info(f'Preparing to move version {blob_hash}') + # Copy each version to dest_bucket + src_blob = src_bucket.get_blob(blob_hash) + src_bucket.copy_blob(src_blob, dest_bucket) + logger.info(f'Blob {blob_hash} copied to destination, cloning version object.') + # Clone each version, update location + cloned_v = v.clone() + cloned_v.location['bucket'] = dest_bucket_name + # Set FKs + cloned_v.creator = v.creator + cloned_v.region = dest_region + # Save before M2M's can be set + cloned_v.save() + cloned_file.add_version(cloned_v) + # Retain original timestamps + cloned_v.created = v.created + cloned_v.modified = v.modified + cloned_v.save() + logger.info(f'Version {blob_hash} cloned.') + +def _clone_file(file_obj): + # Clone each file, so that the originals will be purged from src_region + cloned_f = file_obj.clone() + # Set (G)FKs + cloned_f.target = file_obj.target + cloned_f.parent = file_obj.parent + cloned_f.checkout = file_obj.checkout + cloned_f.copied_from = file_obj.copied_from + # Save before M2M's can be set, assigning both id and _id + cloned_f.save() + # Repoint Guids + assert cloned_f.id, f'Cloned file ID not assigned for {file_obj._id}' + file_obj.guids.update(object_id=cloned_f.id) + # Retain original timestamps + cloned_f.created = file_obj.created + cloned_f.modified = file_obj.modified + cloned_f.save() + return cloned_f + +def change_node_region(node, dest_region, gcs_creds): + creds = Credentials.from_service_account_info(gcs_creds) + client = Client(credentials=creds) 
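+    # High-level flow of the migration below: for each osfstorage file on the
+    # node, clone the file record, copy every version blob from the source GCS
+    # bucket to the destination bucket, clone the version rows to reference the
+    # new bucket and region, then trash the original file. For registrations,
+    # file-input schema responses are repointed at the cloned file ids.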
+    osfstorage_addon = node.get_addon('osfstorage')
+    src_region = osfstorage_addon.region
+    if src_region.id == dest_region.id:
+        logger.warning(f'Source and destination regions match: {src_region._id}. Exiting.')
+        return
+    src_bucket_name = src_region.waterbutler_settings['storage']['bucket']
+    dest_bucket_name = dest_region.waterbutler_settings['storage']['bucket']
+    src_bucket = client.get_bucket(src_bucket_name)
+    dest_bucket = client.get_bucket(dest_bucket_name)
+    response_blocks_by_file_id = {}
+    with transaction.atomic():
+        with disable_auto_now_fields():
+            if node.type == 'osf.registration':
+                response_blocks_by_file_id = _get_file_block_map(node)
+            for f in node.files.all():
+                logger.info(f'Preparing to move file {f._id}')
+                cloned_f = _clone_file(f)
+                if f._id in response_blocks_by_file_id:
+                    logger.info(f'Preparing to update ResponseBlocks for file {f._id}')
+                    _update_blocks(response_blocks_by_file_id, f._id, cloned_f._id)
+                logger.info(f'File {f._id} cloned, copying versions...')
+                _copy_and_clone_versions(f, cloned_f, src_bucket, dest_bucket, dest_bucket_name, dest_region)
+                # Trash original file
+                f.delete()
+            logger.info('All files complete.')
+            if response_blocks_by_file_id:
+                _update_schema_meta(node)
+            osfstorage_addon.region = dest_region
+            osfstorage_addon.save()
+            logger.info('Region updated. Exiting.')
+
+class Command(BaseCommand):
+
+    def add_arguments(self, parser):
+        super().add_arguments(parser)
+        parser.add_argument(
+            '-n',
+            '--node',
+            type=str,
+            action='store',
+            dest='node',
+            help='Node._id to migrate.',
+        )
+        parser.add_argument(
+            '-r',
+            '--region',
+            type=str,
+            action='store',
+            dest='region',
+            help='Region._id to migrate files to.',
+        )
+        parser.add_argument(
+            '-c',
+            '--credentials',
+            type=str,
+            action='store',
+            dest='gcs_creds',
+            help='GCS Credentials to use.
JSON string.', + ) + + def handle(self, *args, **options): + node = AbstractNode.load(options.get('node', None)) + region = Region.load(options.get('region', None)) + gcs_creds = json.loads(options.get('gcs_creds', '{}')) + assert node, 'Node not found' + assert region, 'Region not found' + assert gcs_creds, 'Credentials required' + change_node_region(node, region, gcs_creds) From be98bc0f70647048320b7d44e1cc5f46eb5decc5 Mon Sep 17 00:00:00 2001 From: mfraezz Date: Thu, 14 Nov 2024 13:49:54 -0500 Subject: [PATCH 11/35] [Feature Release][ENG-5024] Institutional Dashboard Improvements (#10797) Add support for new Institutional Dashboard gated by a waffle flag --------- Co-authored-by: abram axel booth Co-authored-by: John Tordoff Co-authored-by: John Tordoff <> --- admin/management/views.py | 7 +- api/base/elasticsearch_dsl_views.py | 172 +++++++ api/base/pagination.py | 9 +- api/base/serializers.py | 28 ++ api/base/settings/defaults.py | 3 + api/base/utils.py | 19 + api/caching/tasks.py | 64 ++- api/institutions/serializers.py | 124 ++++- api/institutions/urls.py | 4 +- api/institutions/views.py | 146 +++++- api/metrics/renderers.py | 27 +- api/share/utils.py | 124 +++-- api_tests/base/test_views.py | 4 +- .../views/test_institution_department_list.py | 2 +- .../views/test_institution_detail.py | 90 ++-- .../views/test_institution_summary_metrics.py | 256 ++++++++++- .../test_institution_user_metric_list.py | 425 ++++++++++++++++-- api_tests/metrics/test_composite_query.py | 2 +- api_tests/metrics/test_preprint_metrics.py | 2 +- api_tests/metrics/test_raw_metrics.py | 10 +- .../test_registries_moderation_metrics.py | 4 +- api_tests/share/_utils.py | 53 ++- api_tests/share/test_share_preprint.py | 2 +- conftest.py | 49 +- osf/admin.py | 18 + osf/features.yaml | 4 + .../make_dummy_pageviews_for_metrics.py | 6 +- .../commands/monthly_reporters_go.py | 13 +- osf/metadata/gather/basket.py | 19 +- osf/metadata/gather/gatherer.py | 9 +- osf/metadata/osf_gathering.py | 196 +++++++- osf/metadata/rdfutils.py | 2 + osf/metadata/serializers/turtle.py | 8 +- osf/metrics/counted_usage.py | 12 +- osf/metrics/reporters/__init__.py | 6 + osf/metrics/reporters/_base.py | 19 +- .../reporters/institution_summary_monthly.py | 105 +++++ osf/metrics/reporters/institutional_users.py | 161 +++++++ osf/metrics/reporters/public_item_usage.py | 286 ++++++++++++ osf/metrics/reporters/spam_count.py | 7 +- osf/metrics/reports.py | 157 ++++++- osf/metrics/utils.py | 41 +- ...tution_link_to_external_reports_archive.py | 18 + osf/models/institution.py | 6 + osf/models/node.py | 3 +- osf/models/user.py | 2 +- osf_tests/factories.py | 2 +- osf_tests/metadata/_utils.py | 17 +- .../expected_metadata_files/file_basic.turtle | 6 +- .../expected_metadata_files/file_full.turtle | 6 +- .../file_monthly_supplement.turtle | 13 + .../file_supplement.turtle | 1 + .../preprint_basic.turtle | 5 +- .../preprint_full.turtle | 5 +- .../preprint_monthly_supplement.turtle | 13 + .../preprint_supplement.turtle | 7 + .../project_basic.turtle | 19 +- .../project_full.turtle | 19 +- .../project_monthly_supplement.turtle | 13 + .../project_supplement.turtle | 13 + .../registration_basic.turtle | 28 +- .../registration_full.turtle | 28 +- .../registration_monthly_supplement.turtle | 13 + .../registration_supplement.turtle | 7 + .../user_monthly_supplement.turtle | 1 + .../user_supplement.turtle | 1 + osf_tests/metadata/test_basket.py | 5 +- osf_tests/metadata/test_gatherer_registry.py | 4 + osf_tests/metadata/test_osf_gathering.py | 142 +++++- 
.../metadata/test_serialized_metadata.py | 184 +++++--- .../test_institutional_summary_reporter.py | 286 ++++++++++++ .../test_institutional_users_reporter.py | 262 +++++++++++ .../test_public_item_usage_reporter.py | 238 ++++++++++ osf_tests/metrics/test_daily_report.py | 19 +- osf_tests/metrics/test_monthly_report.py | 146 ++++++ osf_tests/metrics/test_yearmonth.txt | 48 ++ osf_tests/test_management_commands.py | 2 +- 77 files changed, 3914 insertions(+), 363 deletions(-) create mode 100644 api/base/elasticsearch_dsl_views.py create mode 100644 osf/metrics/reporters/institution_summary_monthly.py create mode 100644 osf/metrics/reporters/institutional_users.py create mode 100644 osf/metrics/reporters/public_item_usage.py create mode 100644 osf/migrations/0024_institution_link_to_external_reports_archive.py create mode 100644 osf_tests/metadata/expected_metadata_files/file_monthly_supplement.turtle create mode 100644 osf_tests/metadata/expected_metadata_files/file_supplement.turtle create mode 100644 osf_tests/metadata/expected_metadata_files/preprint_monthly_supplement.turtle create mode 100644 osf_tests/metadata/expected_metadata_files/preprint_supplement.turtle create mode 100644 osf_tests/metadata/expected_metadata_files/project_monthly_supplement.turtle create mode 100644 osf_tests/metadata/expected_metadata_files/project_supplement.turtle create mode 100644 osf_tests/metadata/expected_metadata_files/registration_monthly_supplement.turtle create mode 100644 osf_tests/metadata/expected_metadata_files/registration_supplement.turtle create mode 100644 osf_tests/metadata/expected_metadata_files/user_monthly_supplement.turtle create mode 100644 osf_tests/metadata/expected_metadata_files/user_supplement.turtle create mode 100644 osf_tests/metrics/reporters/test_institutional_summary_reporter.py create mode 100644 osf_tests/metrics/reporters/test_institutional_users_reporter.py create mode 100644 osf_tests/metrics/reporters/test_public_item_usage_reporter.py create mode 100644 osf_tests/metrics/test_monthly_report.py create mode 100644 osf_tests/metrics/test_yearmonth.txt diff --git a/admin/management/views.py b/admin/management/views.py index 3bd675790dd..88548a518d1 100644 --- a/admin/management/views.py +++ b/admin/management/views.py @@ -1,4 +1,3 @@ -import datetime from dateutil.parser import isoparse from django.views.generic import TemplateView, View from django.contrib import messages @@ -120,11 +119,11 @@ def post(self, request, *args, **kwargs): if monthly_report_date: report_date = isoparse(monthly_report_date).date() else: - report_date = datetime.datetime.now().date() + report_date = None errors = monthly_reporters_go( - report_month=report_date.month, - report_year=report_date.year + report_month=getattr(report_date, 'month', None), + report_year=getattr(report_date, 'year', None) ) if errors: diff --git a/api/base/elasticsearch_dsl_views.py b/api/base/elasticsearch_dsl_views.py new file mode 100644 index 00000000000..6199fd82d0e --- /dev/null +++ b/api/base/elasticsearch_dsl_views.py @@ -0,0 +1,172 @@ +from __future__ import annotations +import abc +import datetime +import typing + +import elasticsearch_dsl as edsl +from rest_framework import generics, exceptions as drf_exceptions +from rest_framework.settings import api_settings as drf_settings +from api.base.settings.defaults import REPORT_FILENAME_FORMAT + +if typing.TYPE_CHECKING: + from rest_framework import serializers + +from api.base.filters import FilterMixin +from api.base.views import JSONAPIBaseView +from 
api.metrics.renderers import (
+    MetricsReportsCsvRenderer,
+    MetricsReportsTsvRenderer,
+    MetricsReportsJsonRenderer,
+)
+from api.base.pagination import ElasticsearchQuerySizeMaximumPagination, JSONAPIPagination
+from api.base.renderers import JSONAPIRenderer
+
+
+class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView, abc.ABC):
+    '''abstract view class using `elasticsearch_dsl.Search` as a queryset-analogue
+
+    builds a `Search` based on `self.get_default_search()` and the request's
+    query parameters for filtering, sorting, and pagination -- fetches only
+    the data required for the response, just like with a queryset!
+    '''
+    serializer_class: type[serializers.BaseSerializer]  # required on subclasses
+
+    default_ordering: str | None = None  # name of a serializer field, prepended with "-" for descending sort
+    ordering_fields: frozenset[str] = frozenset()  # serializer field names
+
+    @abc.abstractmethod
+    def get_default_search(self) -> edsl.Search | None:
+        '''the base `elasticsearch_dsl.Search` for this list, based on url path
+
+        (common jsonapi query parameters will be considered automatically)
+        '''
+        ...
+
+    FILE_RENDERER_CLASSES = {
+        MetricsReportsCsvRenderer,
+        MetricsReportsTsvRenderer,
+        MetricsReportsJsonRenderer,
+    }
+
+    def set_content_disposition(self, response, renderer):
+        """Set the Content-Disposition header to prompt a file download with the appropriate filename.
+
+        Args:
+            response: The HTTP response object to modify.
+            renderer: The renderer instance used for the response, which determines the file extension.
+        """
+        current_date = datetime.datetime.now().strftime('%Y-%m')
+
+        if isinstance(renderer, JSONAPIRenderer):
+            extension = 'json'
+        else:
+            extension = getattr(renderer, 'extension', renderer.format)
+
+        filename = REPORT_FILENAME_FORMAT.format(
+            view_name=self.view_name,
+            date_created=current_date,
+            extension=extension,
+        )
+
+        response['Content-Disposition'] = f'attachment; filename="{filename}"'
+
+    def finalize_response(self, request, response, *args, **kwargs):
+        # Call the parent method to finalize the response first
+        response = super().finalize_response(request, response, *args, **kwargs)
+        # If the accepted renderer is one of the file renderer classes, set the Content-Disposition
+        # header so the response is downloaded as a file attachment with an appropriate filename
+        if isinstance(request.accepted_renderer, tuple(self.FILE_RENDERER_CLASSES)):
+            self.set_content_disposition(response, request.accepted_renderer)
+
+        return response
+
+    ###
+    # beware!
inheritance shenanigans below + + # override FilterMixin to disable all operators besides 'eq' and 'ne' + MATCHABLE_FIELDS = () + COMPARABLE_FIELDS = () + DEFAULT_OPERATOR_OVERRIDES = {} + # (if you want to add fulltext-search or range-filter support, remove the override + # and update `__add_search_filter` to handle those operators -- tho note that the + # underlying elasticsearch field mapping will need to be compatible with the query) + + # override DEFAULT_FILTER_BACKENDS rest_framework setting + # (filtering handled in-view to reuse logic from FilterMixin) + filter_backends = () + + # note: because elasticsearch_dsl.Search supports slicing and gives results when iterated on, + # it works fine with default pagination + + # override rest_framework.generics.GenericAPIView + @property + def pagination_class(self): + """ + When downloading a file assume no pagination is necessary unless the user specifies + """ + is_file_download = any( + self.request.accepted_renderer.format == renderer.format + for renderer in self.FILE_RENDERER_CLASSES + ) + # if it's a file download of the JSON respect default page size + if is_file_download: + return ElasticsearchQuerySizeMaximumPagination + return JSONAPIPagination + + def get_queryset(self): + _search = self.get_default_search() + if _search is None: + return [] + # using parsing logic from FilterMixin (oddly nested dict and all) + for _parsed_param in self.parse_query_params(self.request.query_params).values(): + for _parsed_filter in _parsed_param.values(): + _search = self.__add_search_filter( + _search, + elastic_field_name=_parsed_filter['source_field_name'], + operator=_parsed_filter['op'], + value=_parsed_filter['value'], + ) + return self.__add_sort(_search) + + ### + # private methods + + def __add_sort(self, search: edsl.Search) -> edsl.Search: + _elastic_sort = self.__get_elastic_sort() + return (search if _elastic_sort is None else search.sort(_elastic_sort)) + + def __get_elastic_sort(self) -> str | None: + _sort_param = self.request.query_params.get(drf_settings.ORDERING_PARAM, self.default_ordering) + if not _sort_param: + return None + _sort_field, _ascending = ( + (_sort_param[1:], False) + if _sort_param.startswith('-') + else (_sort_param, True) + ) + if _sort_field not in self.ordering_fields: + raise drf_exceptions.ValidationError( + f'invalid value for {drf_settings.ORDERING_PARAM} query param (valid values: {", ".join(self.ordering_fields)})', + ) + _serializer_field = self.get_serializer().fields[_sort_field] + _elastic_sort_field = _serializer_field.source + return (_elastic_sort_field if _ascending else f'-{_elastic_sort_field}') + + def __add_search_filter( + self, + search: edsl.Search, + elastic_field_name: str, + operator: str, + value: str, + ) -> edsl.Search: + match operator: # operators from FilterMixin + case 'eq': + if value == '': + return search.exclude('exists', field=elastic_field_name) + return search.filter('term', **{elastic_field_name: value}) + case 'ne': + if value == '': + return search.filter('exists', field=elastic_field_name) + return search.exclude('term', **{elastic_field_name: value}) + case _: + raise NotImplementedError(f'unsupported filter operator "{operator}"') diff --git a/api/base/pagination.py b/api/base/pagination.py index 7ed3db5f6e3..676f0baa8fb 100644 --- a/api/base/pagination.py +++ b/api/base/pagination.py @@ -10,7 +10,7 @@ replace_query_param, remove_query_param, ) from api.base.serializers import is_anonymized -from api.base.settings import MAX_PAGE_SIZE +from api.base.settings 
import MAX_PAGE_SIZE, MAX_SIZE_OF_ES_QUERY from api.base.utils import absolute_reverse from osf.models import AbstractNode, Comment, Preprint, Guid, DraftRegistration @@ -172,6 +172,13 @@ class MaxSizePagination(JSONAPIPagination): max_page_size = None page_size_query_param = None + +class ElasticsearchQuerySizeMaximumPagination(JSONAPIPagination): + page_size = MAX_SIZE_OF_ES_QUERY + max_page_size = MAX_SIZE_OF_ES_QUERY + page_size_query_param = None + + class NoMaxPageSizePagination(JSONAPIPagination): max_page_size = None diff --git a/api/base/serializers.py b/api/base/serializers.py index ac28139da97..3c8c518ea16 100644 --- a/api/base/serializers.py +++ b/api/base/serializers.py @@ -17,6 +17,7 @@ from api.base import utils from api.base.exceptions import EnumFieldMemberError +from osf.metrics.utils import YearMonth from osf.utils import permissions as osf_permissions from osf.utils import sanitize from osf.utils import functional @@ -171,6 +172,18 @@ def should_show(self, instance): return request and (request.user.is_anonymous or has_admin_scope) +class ShowIfObjectPermission(ConditionalField): + """Show the field only for users with a given object permission + """ + def __init__(self, field, *, permission: str, **kwargs): + super().__init__(field, **kwargs) + self._required_object_permission = permission + + def should_show(self, instance): + _request = self.context.get('request') + return _request.user.has_perm(self._required_object_permission, obj=instance) + + class HideIfRegistration(ConditionalField): """ If node is a registration, this field will return None. @@ -2012,3 +2025,18 @@ def to_internal_value(self, data): return self._enum_class[data.upper()].value except KeyError: raise EnumFieldMemberError(self._enum_class, data) + + +class YearmonthField(ser.Field): + def to_representation(self, value: YearMonth | None) -> str | None: + if value is None: + return None + return str(value) + + def to_internal_value(self, data: str | None) -> YearMonth | None: + if data is None: + return None + try: + return YearMonth.from_str(data) + except ValueError as e: + raise ser.ValidationError(str(e)) diff --git a/api/base/settings/defaults.py b/api/base/settings/defaults.py index 136f7f48b6b..367ca1b04f9 100644 --- a/api/base/settings/defaults.py +++ b/api/base/settings/defaults.py @@ -359,8 +359,11 @@ MAX_SIZE_OF_ES_QUERY = 10000 DEFAULT_ES_NULL_VALUE = 'N/A' +REPORT_FILENAME_FORMAT = '{view_name}_{date_created}.{extension}' CI_ENV = False CITATION_STYLES_REPO_URL = 'https://github.com/CenterForOpenScience/styles/archive/88e6ed31a91e9f5a480b486029cda97b535935d4.zip' DEFAULT_AUTO_FIELD = 'django.db.models.AutoField' + +WAFFLE_ENABLE_ADMIN_PAGES = False # instead, customized waffle admins in osf/admin.py diff --git a/api/base/utils.py b/api/base/utils.py index 9e0dcbc7e8c..1da52026d7e 100644 --- a/api/base/utils.py +++ b/api/base/utils.py @@ -2,6 +2,7 @@ from urllib.parse import urlunsplit, urlsplit, parse_qs, urlencode from packaging.version import Version from hashids import Hashids +import waffle from django.apps import apps from django.core.exceptions import ObjectDoesNotExist @@ -275,3 +276,21 @@ def __len__(self): def add_dict_as_item(self, dict): item = type('item', (object,), dict) self.append(item) + + +def toggle_view_by_flag(flag_name, old_view, new_view): + '''toggle between view implementations based on a feature flag + + returns a wrapper view function that: + - when the given flag is inactive, passes thru to `old_view` + - when the given flag is active, passes thru to 
`new_view` + ''' + def _view_by_flag(request, *args, **kwargs): + if waffle.flag_is_active(request, flag_name): + return new_view(request, *args, **kwargs) + return old_view(request, *args, **kwargs) + if hasattr(new_view, 'view_class'): + # set view_class to masquerade as a class-based view, for sake of assumptions + # in `api_tests.base.test_views` and `api.base.serializers.RelationshipField` + _view_by_flag.view_class = new_view.view_class # type: ignore[attr-defined] + return _view_by_flag diff --git a/api/caching/tasks.py b/api/caching/tasks.py index 0b7a4b6670f..b3afba02c2e 100644 --- a/api/caching/tasks.py +++ b/api/caching/tasks.py @@ -1,11 +1,11 @@ +import logging from urllib.parse import urlparse + +from django.apps import apps from django.db import connection from django.db.models import Sum - import requests -import logging -from django.apps import apps from api.caching.utils import storage_usage_cache from framework.postcommit_tasks.handlers import enqueue_postcommit_task @@ -16,6 +16,9 @@ logger = logging.getLogger(__name__) +_DEFAULT_FILEVERSION_PAGE_SIZE = 500000 + + def get_varnish_servers(): # TODO: this should get the varnish servers from HAProxy or a setting return settings.VARNISH_SERVERS @@ -111,35 +114,60 @@ def ban_url(instance): @app.task(max_retries=5, default_retry_delay=10) -def update_storage_usage_cache(target_id, target_guid, per_page=500000): +def update_storage_usage_cache(target_id, target_guid, per_page=_DEFAULT_FILEVERSION_PAGE_SIZE): if not settings.ENABLE_STORAGE_USAGE_CACHE: return + from osf.models import Guid + storage_usage_total = compute_storage_usage_total(Guid.load(target_guid).referent, per_page=per_page) + key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_guid) + storage_usage_cache.set(key, storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT) + + +def compute_storage_usage_total(target_obj, per_page=_DEFAULT_FILEVERSION_PAGE_SIZE): + from django.contrib.contenttypes.models import ContentType sql = """ SELECT count(size), sum(size) from (SELECT size FROM osf_basefileversionsthrough AS obfnv LEFT JOIN osf_basefilenode file ON obfnv.basefilenode_id = file.id LEFT JOIN osf_fileversion version ON obfnv.fileversion_id = version.id - LEFT JOIN django_content_type type on file.target_content_type_id = type.id WHERE file.provider = 'osfstorage' - AND type.model = 'abstractnode' AND file.deleted_on IS NULL - AND file.target_object_id=%s + AND file.target_object_id=%(target_pk)s + AND file.target_content_type_id=%(target_content_type_pk)s ORDER BY version.id - LIMIT %s OFFSET %s) file_page + LIMIT %(per_page)s OFFSET %(offset)s + ) file_page """ - count = per_page + last_count = 1 # initialize non-zero offset = 0 storage_usage_total = 0 + content_type_pk = ContentType.objects.get_for_model(target_obj).pk with connection.cursor() as cursor: - while count: - cursor.execute(sql, [target_id, per_page, offset]) - result = cursor.fetchall() - storage_usage_total += int(result[0][1]) if result[0][1] else 0 - count = int(result[0][0]) if result[0][0] else 0 - offset += count - - key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_guid) - storage_usage_cache.set(key, storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT) + while last_count: + cursor.execute( + sql, { + 'target_pk': target_obj.pk, + 'target_content_type_pk': content_type_pk, + 'per_page': per_page, + 'offset': offset, + }, + ) + this_count, size_sum = cursor.fetchall()[0] + storage_usage_total += int(size_sum or 0) + last_count = (this_count or 0) + offset += 
last_count + return storage_usage_total + + +def get_storage_usage_total(target_obj): + if not settings.ENABLE_STORAGE_USAGE_CACHE: + return compute_storage_usage_total(target_obj) + _cache_key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_obj._id) + _storage_usage_total = storage_usage_cache.get(_cache_key) + if _storage_usage_total is None: + _storage_usage_total = compute_storage_usage_total(target_obj) + storage_usage_cache.set(_cache_key, _storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT) + return _storage_usage_total def update_storage_usage(target): diff --git a/api/institutions/serializers.py b/api/institutions/serializers.py index f1124d896f8..e3679b2a9c5 100644 --- a/api/institutions/serializers.py +++ b/api/institutions/serializers.py @@ -12,8 +12,10 @@ BaseAPISerializer, ShowIfVersion, IDField, + ShowIfObjectPermission, ) +from api.base.serializers import YearmonthField from api.nodes.serializers import CompoundIDField from api.base.exceptions import RelationshipPostMakesNoChanges from api.base.utils import absolute_reverse @@ -35,6 +37,10 @@ class InstitutionSerializer(JSONAPISerializer): ror_iri = ser.CharField(read_only=True, source='ror_uri') iris = ser.SerializerMethodField(read_only=True) assets = ser.SerializerMethodField(read_only=True) + link_to_external_reports_archive = ShowIfObjectPermission( + ser.CharField(read_only=True), + permission='view_institutional_metrics', + ) links = LinksField({ 'self': 'get_api_url', 'html': 'get_absolute_html_url', @@ -55,19 +61,28 @@ class InstitutionSerializer(JSONAPISerializer): related_view_kwargs={'institution_id': '<_id>'}, ) - department_metrics = RelationshipField( - related_view='institutions:institution-department-metrics', - related_view_kwargs={'institution_id': '<_id>'}, + department_metrics = ShowIfObjectPermission( + RelationshipField( + related_view='institutions:institution-department-metrics', + related_view_kwargs={'institution_id': '<_id>'}, + ), + permission='view_institutional_metrics', ) - user_metrics = RelationshipField( - related_view='institutions:institution-user-metrics', - related_view_kwargs={'institution_id': '<_id>'}, + user_metrics = ShowIfObjectPermission( + RelationshipField( + related_view='institutions:institution-user-metrics', + related_view_kwargs={'institution_id': '<_id>'}, + ), + permission='view_institutional_metrics', ) - summary_metrics = RelationshipField( - related_view='institutions:institution-summary-metrics', - related_view_kwargs={'institution_id': '<_id>'}, + summary_metrics = ShowIfObjectPermission( + RelationshipField( + related_view='institutions:institution-summary-metrics', + related_view_kwargs={'institution_id': '<_id>'}, + ), + permission='view_institutional_metrics', ) def get_api_url(self, obj): @@ -256,7 +271,12 @@ def get_absolute_url(self, obj): ) -class InstitutionUserMetricsSerializer(JSONAPISerializer): +class OldInstitutionUserMetricsSerializer(JSONAPISerializer): + '''serializer for institution-users metrics + + used only when the INSTITUTIONAL_DASHBOARD_2024 feature flag is NOT active + (and should be removed when that flag is permanently active) + ''' class Meta: type_ = 'institution-users' @@ -294,6 +314,90 @@ def get_absolute_url(self, obj): ) +class NewInstitutionUserMetricsSerializer(JSONAPISerializer): + '''serializer for institution-users metrics + + used only when the INSTITUTIONAL_DASHBOARD_2024 feature flag is active + (and should be renamed without "New" when that flag is permanently active) + ''' + + class Meta: + type_ = 
'institution-users'
+
+    filterable_fields = frozenset({
+        'department',
+        'orcid_id',
+    })
+
+    id = IDField(source='meta.id', read_only=True)
+    user_name = ser.CharField(read_only=True)
+    department = ser.CharField(read_only=True, source='department_name')
+    orcid_id = ser.CharField(read_only=True)
+    month_last_login = YearmonthField(read_only=True)
+    month_last_active = YearmonthField(read_only=True)
+    account_creation_date = YearmonthField(read_only=True)
+
+    public_projects = ser.IntegerField(read_only=True, source='public_project_count')
+    private_projects = ser.IntegerField(read_only=True, source='private_project_count')
+    public_registration_count = ser.IntegerField(read_only=True)
+    embargoed_registration_count = ser.IntegerField(read_only=True)
+    published_preprint_count = ser.IntegerField(read_only=True)
+    public_file_count = ser.IntegerField(read_only=True)
+    storage_byte_count = ser.IntegerField(read_only=True)
+
+    user = RelationshipField(
+        related_view='users:user-detail',
+        related_view_kwargs={'user_id': '<user_id>'},
+    )
+    institution = RelationshipField(
+        related_view='institutions:institution-detail',
+        related_view_kwargs={'institution_id': '<institution_id>'},
+    )
+
+    links = LinksField({})
+
+    def get_absolute_url(self):
+        return None  # there is no detail view for institution-users
+
+
+class NewInstitutionSummaryMetricsSerializer(JSONAPISerializer):
+    '''serializer for institution-summary metrics
+
+    used only when the INSTITUTIONAL_DASHBOARD_2024 feature flag is active
+    (and should be renamed without "New" when that flag is permanently active)
+    '''
+
+    class Meta:
+        type_ = 'institution-summary-metrics'
+
+    id = IDField(read_only=True)
+
+    user_count = ser.IntegerField(read_only=True)
+    public_project_count = ser.IntegerField(read_only=True)
+    private_project_count = ser.IntegerField(read_only=True)
+    public_registration_count = ser.IntegerField(read_only=True)
+    embargoed_registration_count = ser.IntegerField(read_only=True)
+    published_preprint_count = ser.IntegerField(read_only=True)
+    public_file_count = ser.IntegerField(read_only=True)
+    storage_byte_count = ser.IntegerField(read_only=True)
+    monthly_logged_in_user_count = ser.IntegerField(read_only=True)
+    monthly_active_user_count = ser.IntegerField(read_only=True)
+
+    user = RelationshipField(
+        related_view='users:user-detail',
+        related_view_kwargs={'user_id': '<user_id>'},
+    )
+    institution = RelationshipField(
+        related_view='institutions:institution-detail',
+        related_view_kwargs={'institution_id': '<institution_id>'},
+    )
+
+    links = LinksField({})
+
+    def get_absolute_url(self):
+        return None  # there is no detail view for institution-users
+
+
 class InstitutionRelated(JSONAPIRelationshipSerializer):
     id = ser.CharField(source='_id', required=False, allow_null=True)
     class Meta:
diff --git a/api/institutions/urls.py b/api/institutions/urls.py
index be4f9ca0b43..477fe8d9377 100644
--- a/api/institutions/urls.py
+++ b/api/institutions/urls.py
@@ -13,7 +13,7 @@
     re_path(r'^(?P<institution_id>\w+)/relationships/registrations/$', views.InstitutionRegistrationsRelationship.as_view(), name=views.InstitutionRegistrationsRelationship.view_name),
     re_path(r'^(?P<institution_id>\w+)/relationships/nodes/$', views.InstitutionNodesRelationship.as_view(), name=views.InstitutionNodesRelationship.view_name),
     re_path(r'^(?P<institution_id>\w+)/users/$', views.InstitutionUserList.as_view(), name=views.InstitutionUserList.view_name),
-    re_path(r'^(?P<institution_id>\w+)/metrics/summary/$', views.InstitutionSummaryMetrics.as_view(), name=views.InstitutionSummaryMetrics.view_name),
+    re_path(r'^(?P<institution_id>\w+)/metrics/summary/$',
views.institution_summary_metrics_detail_view, name=views.institution_summary_metrics_detail_view.view_name),
     re_path(r'^(?P<institution_id>\w+)/metrics/departments/$', views.InstitutionDepartmentList.as_view(), name=views.InstitutionDepartmentList.view_name),
-    re_path(r'^(?P<institution_id>\w+)/metrics/users/$', views.InstitutionUserMetricsList.as_view(), name=views.InstitutionUserMetricsList.view_name),
+    re_path(r'^(?P<institution_id>\w+)/metrics/users/$', views.institution_user_metrics_list_view, name=views.institution_user_metrics_list_view.view_name),
 ]
diff --git a/api/institutions/views.py b/api/institutions/views.py
index d21c15e0746..124e523c7e8 100644
--- a/api/institutions/views.py
+++ b/api/institutions/views.py
@@ -8,12 +8,16 @@
 from framework.auth.oauth_scopes import CoreScopes

+import osf.features
 from osf.metrics import InstitutionProjectCounts
 from osf.models import OSFUser, Node, Institution, Registration
 from osf.metrics import UserInstitutionProjectCounts
+from osf.metrics.reports import InstitutionalUserReport, InstitutionMonthlySummaryReport
+from osf.metrics.utils import YearMonth
 from osf.utils import permissions as osf_permissions

 from api.base import permissions as base_permissions
+from api.base.elasticsearch_dsl_views import ElasticsearchListView
 from api.base.filters import ListFilterMixin
 from api.base.views import JSONAPIBaseView
 from api.base.serializers import JSONAPISerializer
@@ -25,9 +29,17 @@
 )
 from api.base.settings import MAX_SIZE_OF_ES_QUERY
 from api.base.exceptions import RelationshipPostMakesNoChanges
-from api.base.utils import MockQueryset
+from api.base.utils import (
+    MockQueryset,
+    toggle_view_by_flag,
+)
 from api.base.settings import DEFAULT_ES_NULL_VALUE
 from api.metrics.permissions import IsInstitutionalMetricsUser
+from api.metrics.renderers import (
+    MetricsReportsCsvRenderer,
+    MetricsReportsTsvRenderer,
+    MetricsReportsJsonRenderer,
+)
 from api.nodes.serializers import NodeSerializer
 from api.nodes.filters import NodesFilterMixin
 from api.users.serializers import UserSerializer
@@ -40,7 +52,9 @@
     InstitutionRegistrationsRelationshipSerializer,
     InstitutionSummaryMetricSerializer,
     InstitutionDepartmentMetricsSerializer,
-    InstitutionUserMetricsSerializer,
+    NewInstitutionUserMetricsSerializer,
+    OldInstitutionUserMetricsSerializer,
+    NewInstitutionSummaryMetricsSerializer,
 )
 from api.institutions.permissions import UserIsAffiliated
 from api.institutions.renderers import InstitutionDepartmentMetricsCSVRenderer, InstitutionUserMetricsCSVRenderer, MetricsCSVRenderer
@@ -384,7 +398,7 @@ def create(self, *args, **kwargs):
         return ret

-class InstitutionSummaryMetrics(JSONAPIBaseView, generics.RetrieveAPIView, InstitutionMixin):
+class _OldInstitutionSummaryMetrics(JSONAPIBaseView, generics.RetrieveAPIView, InstitutionMixin):
     permission_classes = (
         drf_permissions.IsAuthenticatedOrReadOnly,
         base_permissions.TokenHasScope,
@@ -493,10 +507,15 @@ def get_default_queryset(self):
         return self._make_elasticsearch_results_filterable(search, id=institution._id)

-class InstitutionUserMetricsList(InstitutionImpactList):
+class _OldInstitutionUserMetricsList(InstitutionImpactList):
+    '''list view for institution-users metrics
+
+    used only when the INSTITUTIONAL_DASHBOARD_2024 feature flag is NOT active
+    (and should be removed when that flag is permanently active)
+    '''
     view_name = 'institution-user-metrics'

-    serializer_class = InstitutionUserMetricsSerializer
+    serializer_class = OldInstitutionUserMetricsSerializer
     renderer_classes = tuple(api_settings.DEFAULT_RENDERER_CLASSES) +
(InstitutionUserMetricsCSVRenderer,) ordering_fields = ('user_name', 'department') @@ -521,3 +540,120 @@ def get_default_queryset(self): institution = self.get_institution() search = UserInstitutionProjectCounts.get_current_user_metrics(institution) return self._make_elasticsearch_results_filterable(search, id=institution._id, department=DEFAULT_ES_NULL_VALUE) + + +class _NewInstitutionUserMetricsList(InstitutionMixin, ElasticsearchListView): + '''list view for institution-users metrics + + used only when the INSTITUTIONAL_DASHBOARD_2024 feature flag is active + (and should be renamed without "New" when that flag is permanently active) + ''' + permission_classes = ( + drf_permissions.IsAuthenticatedOrReadOnly, + base_permissions.TokenHasScope, + IsInstitutionalMetricsUser, + ) + + required_read_scopes = [CoreScopes.INSTITUTION_METRICS_READ] + required_write_scopes = [CoreScopes.NULL] + + view_category = 'institutions' + view_name = 'institution-user-metrics' + renderer_classes = ( + *api_settings.DEFAULT_RENDERER_CLASSES, + MetricsReportsCsvRenderer, + MetricsReportsTsvRenderer, + MetricsReportsJsonRenderer, + ) + + serializer_class = NewInstitutionUserMetricsSerializer + + default_ordering = '-storage_byte_count' + ordering_fields = frozenset(( + 'user_name', + 'department', + 'month_last_login', + 'month_last_active', + 'account_creation_date', + 'public_projects', + 'private_projects', + 'public_registration_count', + 'embargoed_registration_count', + 'published_preprint_count', + 'public_file_count', + 'storage_byte_count', + )) + + def get_default_search(self): + _yearmonth = InstitutionalUserReport.most_recent_yearmonth() + if _yearmonth is None: + return None + return ( + InstitutionalUserReport.search() + .filter('term', report_yearmonth=str(_yearmonth)) + .filter('term', institution_id=self.get_institution()._id) + ) + + +class _NewInstitutionSummaryMetricsDetail(JSONAPIBaseView, generics.RetrieveAPIView, InstitutionMixin): + '''detail view for institution-summary metrics + + used only when the INSTITUTIONAL_DASHBOARD_2024 feature flag is active + (and should be renamed without "New" when that flag is permanently active) + ''' + permission_classes = ( + drf_permissions.IsAuthenticatedOrReadOnly, + base_permissions.TokenHasScope, + IsInstitutionalMetricsUser, + ) + + required_read_scopes = [CoreScopes.INSTITUTION_METRICS_READ] + required_write_scopes = [CoreScopes.NULL] + + view_category = 'institutions' + view_name = 'institution-summary-metrics' + + serializer_class = NewInstitutionSummaryMetricsSerializer + + def get_object(self): + institution = self.get_institution() + search_object = self.get_default_search() + if search_object: + object = search_object.execute()[0] + object.id = institution._id + return object + + def get_default_search(self): + yearmonth = InstitutionMonthlySummaryReport.most_recent_yearmonth() + if report_date_str := self.request.query_params.get('report_yearmonth'): + try: + yearmonth = YearMonth.from_str(report_date_str) + except ValueError: + pass + + if yearmonth is None: + return None + + return InstitutionMonthlySummaryReport.search().filter( + 'term', + report_yearmonth=str(yearmonth), + ).filter( + 'term', + institution_id=self.get_institution()._id, + ) + + +institution_summary_metrics_detail_view = toggle_view_by_flag( + flag_name=osf.features.INSTITUTIONAL_DASHBOARD_2024, + old_view=_OldInstitutionSummaryMetrics.as_view(), + new_view=_NewInstitutionSummaryMetricsDetail.as_view(), +) +institution_summary_metrics_detail_view.view_name = 
'institution-summary-metrics' + + +institution_user_metrics_list_view = toggle_view_by_flag( + flag_name=osf.features.INSTITUTIONAL_DASHBOARD_2024, + old_view=_OldInstitutionUserMetricsList.as_view(), + new_view=_NewInstitutionUserMetricsList.as_view(), +) +institution_user_metrics_list_view.view_name = 'institution-user-metrics' diff --git a/api/metrics/renderers.py b/api/metrics/renderers.py index fd4bdc78da2..1e33515b68c 100644 --- a/api/metrics/renderers.py +++ b/api/metrics/renderers.py @@ -1,6 +1,6 @@ import csv import io - +import json from django.http import Http404 from rest_framework import renderers @@ -42,11 +42,7 @@ def get_csv_row(keys_list, report_attrs): ] -class MetricsReportsCsvRenderer(renderers.BaseRenderer): - media_type = 'text/csv' - format = 'csv' - CSV_DIALECT = csv.excel - +class MetricsReportsRenderer(renderers.BaseRenderer): def render(self, json_response, accepted_media_type=None, renderer_context=None): serialized_reports = ( jsonapi_resource['attributes'] @@ -67,7 +63,24 @@ def render(self, json_response, accepted_media_type=None, renderer_context=None) return csv_filecontent.getvalue() -class MetricsReportsTsvRenderer(MetricsReportsCsvRenderer): +class MetricsReportsCsvRenderer(MetricsReportsRenderer): + format = 'csv' + extension = 'csv' + media_type = 'text/csv' + CSV_DIALECT = csv.excel + + +class MetricsReportsTsvRenderer(MetricsReportsRenderer): format = 'tsv' + extension = 'tsv' media_type = 'text/tab-separated-values' CSV_DIALECT = csv.excel_tab + + +class MetricsReportsJsonRenderer(renderers.BaseRenderer): + format = 'json_report' + extension = 'json' + media_type = 'application/json' + + def render(self, json_response, accepted_media_type=None, renderer_context=None): + return json.dumps([item['attributes'] for item in json_response['data']]) diff --git a/api/share/utils.py b/api/share/utils.py index 34c9be4609c..4f4137dcf58 100644 --- a/api/share/utils.py +++ b/api/share/utils.py @@ -3,6 +3,7 @@ SHARE/Trove accepts metadata records as "indexcards" in turtle format: https://www.w3.org/TR/turtle/ """ from functools import partial +from http import HTTPStatus import logging import random from urllib.parse import urljoin @@ -17,7 +18,11 @@ from framework.encryption import ensure_bytes from framework.sentry import log_exception from osf import models as osf_db -from osf.metadata.tools import pls_gather_metadata_file +from osf.metadata.osf_gathering import ( + OsfmapPartition, + pls_get_magic_metadata_basket, +) +from osf.metadata.serializers import get_metadata_serializer from website import settings @@ -25,7 +30,7 @@ def shtrove_ingest_url(): - return f'{settings.SHARE_URL}api/v3/ingest' + return f'{settings.SHARE_URL}trove/ingest' def sharev2_push_url(): @@ -69,83 +74,100 @@ def _enqueue_update_share(osfresource): enqueue_task(async_update_resource_share.s(_osfguid_value)) -@celery_app.task(bind=True, max_retries=4, acks_late=True) -def task__update_share(self, guid: str, is_backfill=False): +@celery_app.task( + bind=True, + acks_late=True, + max_retries=4, + retry_backoff=True, +) +def task__update_share(self, guid: str, is_backfill=False, osfmap_partition_name='MAIN'): """ - This function updates share takes Preprints, Projects and Registrations. 
- :param self: - :param guid: - :return: + Send SHARE/trove current metadata record(s) for the osf-guid-identified object """ - resp = _do_update_share(guid, is_backfill=is_backfill) + _osfmap_partition = OsfmapPartition[osfmap_partition_name] + _osfid_instance = apps.get_model('osf.Guid').load(guid) + if _osfid_instance is None: + raise ValueError(f'unknown osfguid "{guid}"') + _resource = _osfid_instance.referent + _is_deletion = _should_delete_indexcard(_resource) + _response = ( + pls_delete_trove_record(_resource, osfmap_partition=_osfmap_partition) + if _is_deletion + else pls_send_trove_record( + _resource, + is_backfill=is_backfill, + osfmap_partition=_osfmap_partition, + ) + ) try: - resp.raise_for_status() + _response.raise_for_status() except Exception as e: - if self.request.retries == self.max_retries: - log_exception(e) - elif resp.status_code >= 500: - try: - self.retry( - exc=e, - countdown=(random.random() + 1) * min(60 + settings.CELERY_RETRY_BACKOFF_BASE ** self.request.retries, 60 * 10), + log_exception(e) + if HTTPStatus(_response.status_code).is_server_error: + raise self.retry(exc=e) + else: # success response + if not _is_deletion: + # enqueue followup task for supplementary metadata + _next_partition = _next_osfmap_partition(_osfmap_partition) + if _next_partition is not None: + task__update_share.delay( + guid, + is_backfill=is_backfill, + osfmap_partition_name=_next_partition.name, ) - except Retry as e: # Retry is only raise after > 5 retries - log_exception(e) - else: - log_exception(e) - - return resp -def pls_send_trove_indexcard(osf_item, *, is_backfill=False): +def pls_send_trove_record(osf_item, *, is_backfill: bool, osfmap_partition: OsfmapPartition): try: _iri = osf_item.get_semantic_iri() except (AttributeError, ValueError): raise ValueError(f'could not get iri for {osf_item}') - _metadata_record = pls_gather_metadata_file(osf_item, 'turtle') + _basket = pls_get_magic_metadata_basket(osf_item) + _serializer = get_metadata_serializer( + format_key='turtle', + basket=_basket, + serializer_config={'osfmap_partition': osfmap_partition}, + ) + _serialized_record = _serializer.serialize() _queryparams = { 'focus_iri': _iri, - 'record_identifier': _shtrove_record_identifier(osf_item), + 'record_identifier': _shtrove_record_identifier(osf_item, osfmap_partition), } if is_backfill: - _queryparams['nonurgent'] = True + _queryparams['nonurgent'] = '' + if osfmap_partition.is_supplementary: + _queryparams['is_supplementary'] = '' + _expiration_date = osfmap_partition.get_expiration_date(_basket) + if _expiration_date is not None: + _queryparams['expiration_date'] = str(_expiration_date) return requests.post( shtrove_ingest_url(), params=_queryparams, headers={ - 'Content-Type': _metadata_record.mediatype, + 'Content-Type': _serializer.mediatype, **_shtrove_auth_headers(osf_item), }, - data=ensure_bytes(_metadata_record.serialized_metadata), + data=ensure_bytes(_serialized_record), ) -def pls_delete_trove_indexcard(osf_item): +def pls_delete_trove_record(osf_item, osfmap_partition: OsfmapPartition): return requests.delete( shtrove_ingest_url(), params={ - 'record_identifier': _shtrove_record_identifier(osf_item), + 'record_identifier': _shtrove_record_identifier(osf_item, osfmap_partition), }, headers=_shtrove_auth_headers(osf_item), ) -def _do_update_share(osfguid: str, *, is_backfill=False): - logger.debug('%s._do_update_share("%s", is_backfill=%s)', __name__, osfguid, is_backfill) - _guid_instance = apps.get_model('osf.Guid').load(osfguid) - if _guid_instance is 
None: - raise ValueError(f'unknown osfguid "{osfguid}"') - _resource = _guid_instance.referent - _response = ( - pls_delete_trove_indexcard(_resource) - if _should_delete_indexcard(_resource) - else pls_send_trove_indexcard(_resource, is_backfill=is_backfill) +def _shtrove_record_identifier(osf_item, osfmap_partition: OsfmapPartition): + _id = osf_item.guids.values_list('_id', flat=True).first() + return ( + f'{_id}/{osfmap_partition.name}' + if osfmap_partition.is_supplementary + else _id ) - return _response - - -def _shtrove_record_identifier(osf_item): - return osf_item.guids.values_list('_id', flat=True).first() def _shtrove_auth_headers(osf_item): @@ -182,6 +204,16 @@ def _is_item_public(guid_referent) -> bool: return getattr(guid_referent, 'is_public', False) # quacks like AbstractNode +def _next_osfmap_partition(partition: OsfmapPartition) -> OsfmapPartition | None: + match partition: + case OsfmapPartition.MAIN: + return OsfmapPartition.SUPPLEMENT + case OsfmapPartition.SUPPLEMENT: + return OsfmapPartition.MONTHLY_SUPPLEMENT + case _: + return None + + ### # BEGIN soon-to-be-deleted (🤞) legacy sharev2 push # (until dust has settled on iri-centric (rdf-based) search) diff --git a/api_tests/base/test_views.py b/api_tests/base/test_views.py index 6d4a35c07e0..212ebed351a 100644 --- a/api_tests/base/test_views.py +++ b/api_tests/base/test_views.py @@ -43,9 +43,9 @@ if hasattr(patt, 'url_patterns'): # Namespaced list of patterns for subpatt in patt.url_patterns: - VIEW_CLASSES.append(subpatt.callback.cls) + VIEW_CLASSES.append(subpatt.callback.view_class) else: - VIEW_CLASSES.append(patt.callback.cls) + VIEW_CLASSES.append(patt.callback.view_class) class TestApiBaseViews(ApiTestCase): diff --git a/api_tests/institutions/views/test_institution_department_list.py b/api_tests/institutions/views/test_institution_department_list.py index 5a22d17fdff..f2a335eed85 100644 --- a/api_tests/institutions/views/test_institution_department_list.py +++ b/api_tests/institutions/views/test_institution_department_list.py @@ -10,7 +10,7 @@ from osf.metrics import UserInstitutionProjectCounts -@pytest.mark.es +@pytest.mark.es_metrics @pytest.mark.django_db class TestInstitutionDepartmentList: diff --git a/api_tests/institutions/views/test_institution_detail.py b/api_tests/institutions/views/test_institution_detail.py index e21e3a7087b..a8d81f7138f 100644 --- a/api_tests/institutions/views/test_institution_detail.py +++ b/api_tests/institutions/views/test_institution_detail.py @@ -1,6 +1,9 @@ import pytest -from osf_tests.factories import InstitutionFactory +from osf_tests.factories import ( + AuthUserFactory, + InstitutionFactory, +) from api.base.settings.defaults import API_BASE from django.core.validators import URLValidator @@ -11,6 +14,8 @@ class TestInstitutionDetail: 'nodes', 'registrations', 'users', + } + expected_metrics_relationships = { 'department_metrics', 'user_metrics', 'summary_metrics' @@ -26,34 +31,55 @@ def institution(self): def url(self, institution): return f'/{API_BASE}institutions/{institution._id}/' - def test_detail_response(self, app, institution, url): - - # 404 on wrong _id - res = app.get(f'/{institution}institutions/1PO/', expect_errors=True) - assert res.status_code == 404 - - res = app.get(url) - assert res.status_code == 200 - attrs = res.json['data']['attributes'] - assert attrs['name'] == institution.name - assert attrs['iri'] == institution.identifier_domain - assert attrs['ror_iri'] == institution.ror_uri - assert set(attrs['iris']) == { - institution.ror_uri, - 
institution.identifier_domain, - institution.absolute_url, - } - assert 'logo_path' in attrs - assert set(attrs['assets'].keys()) == {'logo', 'logo_rounded', 'banner'} - assert res.json['data']['links']['self'].endswith(url) - - relationships = res.json['data']['relationships'] - assert self.expected_relationships == set(relationships.keys()) - for relationships in list(relationships.values()): - # ↓ returns None if url is valid else throws error. - assert self.is_valid_url(relationships['links']['related']['href']) is None - - # test_return_without_logo_path - res = app.get(f'{url}?version=2.14') - assert res.status_code == 200 - assert 'logo_path' not in res.json['data']['attributes'] + @pytest.fixture() + def rando(self): + return AuthUserFactory() + + @pytest.fixture() + def institutional_admin(self, institution): + _admin_user = AuthUserFactory() + institution.get_group('institutional_admins').user_set.add(_admin_user) + return _admin_user + + def test_detail_response(self, app, institution, url, rando, institutional_admin): + + for _user in (None, rando, institutional_admin): + _auth = (None if _user is None else _user.auth) + # 404 on wrong _id + res = app.get(f'/{institution}institutions/1PO/', expect_errors=True, auth=_auth) + assert res.status_code == 404 + + res = app.get(url, auth=_auth) + assert res.status_code == 200 + attrs = res.json['data']['attributes'] + assert attrs['name'] == institution.name + assert attrs['iri'] == institution.identifier_domain + assert attrs['ror_iri'] == institution.ror_uri + assert set(attrs['iris']) == { + institution.ror_uri, + institution.identifier_domain, + institution.absolute_url, + } + assert 'logo_path' in attrs + assert set(attrs['assets'].keys()) == {'logo', 'logo_rounded', 'banner'} + if _user is institutional_admin: + assert attrs['link_to_external_reports_archive'] == institution.link_to_external_reports_archive + else: + assert 'link_to_external_reports_archive' not in attrs + assert res.json['data']['links']['self'].endswith(url) + + relationships = res.json['data']['relationships'] + _expected_relationships = ( + self.expected_relationships | self.expected_metrics_relationships + if _user is institutional_admin + else self.expected_relationships + ) + assert _expected_relationships == set(relationships.keys()) + for relationships in list(relationships.values()): + # ↓ returns None if url is valid else throws error. 
+ assert self.is_valid_url(relationships['links']['related']['href']) is None + + # test_return_without_logo_path + res = app.get(f'{url}?version=2.14', auth=_auth) + assert res.status_code == 200 + assert 'logo_path' not in res.json['data']['attributes'] diff --git a/api_tests/institutions/views/test_institution_summary_metrics.py b/api_tests/institutions/views/test_institution_summary_metrics.py index b29998d5561..d423663ea89 100644 --- a/api_tests/institutions/views/test_institution_summary_metrics.py +++ b/api_tests/institutions/views/test_institution_summary_metrics.py @@ -1,15 +1,19 @@ import pytest import datetime +from waffle.testutils import override_flag +from osf.metrics import InstitutionProjectCounts + from api.base.settings.defaults import API_BASE from osf_tests.factories import ( + InstitutionFactory, AuthUserFactory, - InstitutionFactory ) -from osf.metrics import InstitutionProjectCounts +from osf.metrics.reports import InstitutionMonthlySummaryReport +from osf import features -@pytest.mark.es +@pytest.mark.es_metrics @pytest.mark.django_db class TestInstitutionSummaryMetrics: @@ -92,3 +96,249 @@ def test_get(self, app, url, institution, user, admin): 'self': f'http://localhost:8000/v2/institutions/{institution._id}/metrics/summary/' } } + + +@pytest.mark.es_metrics +@pytest.mark.django_db +class TestNewInstitutionSummaryMetricsList: + @pytest.fixture(autouse=True) + def _waffled(self): + with override_flag(features.INSTITUTIONAL_DASHBOARD_2024, active=True): + yield + + @pytest.fixture() + def institution(self): + return InstitutionFactory() + + @pytest.fixture() + def rando(self): + return AuthUserFactory() + + @pytest.fixture() + def institutional_admin(self, institution): + admin_user = AuthUserFactory() + institution.get_group('institutional_admins').user_set.add(admin_user) + return admin_user + + @pytest.fixture() + def unshown_reports(self, institution): + # Reports that should not be shown in the results + # Report from another institution + another_institution = InstitutionFactory() + _summary_report_factory('2024-08', another_institution) + # Old report from the same institution + _summary_report_factory('2024-07', institution) + _summary_report_factory('2018-02', institution) + + @pytest.fixture() + def reports(self, institution): + return [ + _summary_report_factory( + '2024-08', institution, + user_count=100, + public_project_count=50, + private_project_count=25, + public_registration_count=10, + embargoed_registration_count=5, + published_preprint_count=15, + public_file_count=20, + storage_byte_count=5000000000, + monthly_logged_in_user_count=80, + monthly_active_user_count=60, + ), + _summary_report_factory( + '2024-08', institution, + user_count=200, + public_project_count=150, + private_project_count=125, + public_registration_count=110, + embargoed_registration_count=105, + published_preprint_count=115, + public_file_count=120, + storage_byte_count=15000000000, + monthly_logged_in_user_count=180, + monthly_active_user_count=160, + ), + ] + + @pytest.fixture() + def url(self, institution): + return f'/{API_BASE}institutions/{institution._id}/metrics/summary/' + + def test_anon(self, app, url): + resp = app.get(url, expect_errors=True) + assert resp.status_code == 401 + + def test_rando(self, app, url, rando): + resp = app.get(url, auth=rando.auth, expect_errors=True) + assert resp.status_code == 403 + + def test_get_empty(self, app, url, institutional_admin): + resp = app.get(url, auth=institutional_admin.auth) + assert resp.status_code == 200 + assert 
resp.json['meta'] == {'version': '2.0'} + + def test_get_report(self, app, url, institutional_admin, institution, reports, unshown_reports): + resp = app.get(url, auth=institutional_admin.auth) + assert resp.status_code == 200 + + data = resp.json['data'] + + assert data['id'] == institution._id + assert data['type'] == 'institution-summary-metrics' + + attributes = data['attributes'] + assert attributes['user_count'] == 200 + assert attributes['public_project_count'] == 150 + assert attributes['private_project_count'] == 125 + assert attributes['public_registration_count'] == 110 + assert attributes['embargoed_registration_count'] == 105 + assert attributes['published_preprint_count'] == 115 + assert attributes['public_file_count'] == 120 + assert attributes['storage_byte_count'] == 15000000000 + assert attributes['monthly_logged_in_user_count'] == 180 + assert attributes['monthly_active_user_count'] == 160 + + def test_get_report_with_multiple_months_and_institutions( + self, app, url, institutional_admin, institution + ): + # Create reports for multiple months and institutions + other_institution = InstitutionFactory() + _summary_report_factory( + '2024-09', institution, + user_count=250, + public_project_count=200, + private_project_count=150, + public_registration_count=120, + embargoed_registration_count=110, + published_preprint_count=130, + public_file_count=140, + storage_byte_count=20000000000, + monthly_logged_in_user_count=220, + monthly_active_user_count=200, + ) + _summary_report_factory( + '2024-08', institution, + user_count=200, + public_project_count=150, + private_project_count=125, + public_registration_count=110, + embargoed_registration_count=105, + published_preprint_count=115, + public_file_count=120, + storage_byte_count=15000000000, + monthly_logged_in_user_count=180, + monthly_active_user_count=160, + ) + _summary_report_factory( + '2024-09', other_institution, + user_count=300, + public_project_count=250, + private_project_count=200, + public_registration_count=180, + embargoed_registration_count=170, + published_preprint_count=190, + public_file_count=210, + storage_byte_count=25000000000, + monthly_logged_in_user_count=270, + monthly_active_user_count=260, + ) + + resp = app.get(url, auth=institutional_admin.auth) + assert resp.status_code == 200 + + data = resp.json['data'] + + assert data['id'] == institution._id + assert data['type'] == 'institution-summary-metrics' + + attributes = data['attributes'] + + assert attributes['user_count'] == 250 + assert attributes['public_project_count'] == 200 + assert attributes['private_project_count'] == 150 + assert attributes['public_registration_count'] == 120 + assert attributes['embargoed_registration_count'] == 110 + assert attributes['published_preprint_count'] == 130 + assert attributes['public_file_count'] == 140 + assert attributes['storage_byte_count'] == 20000000000 + assert attributes['monthly_logged_in_user_count'] == 220 + assert attributes['monthly_active_user_count'] == 200 + + def test_get_with_valid_report_dates(self, app, url, institution, institutional_admin): + _summary_report_factory( + '2024-08', + institution, + user_count=0, + ) + _summary_report_factory( + '2024-09', + institution, + user_count=999, + + ) + _summary_report_factory( + '2018-02', + institution, + user_count=4133, + ) + + resp = app.get(f'{url}?report_yearmonth=2024-08', auth=institutional_admin.auth) + assert resp.status_code == 200 + + attributes = resp.json['data']['attributes'] + assert attributes['user_count'] == 0 + + resp = 
app.get(f'{url}?report_yearmonth=2018-02', auth=institutional_admin.auth) + assert resp.status_code == 200 + + attributes = resp.json['data']['attributes'] + assert attributes['user_count'] == 4133 + + def test_get_with_invalid_report_date(self, app, url, institution, institutional_admin): + _summary_report_factory( + '2024-08', + institution, + user_count=0, + ) + _summary_report_factory( + '2024-09', + institution, + user_count=999, + ) + + # Request with an invalid report_date format + resp = app.get(f'{url}?report_yearmonth=invalid-date', auth=institutional_admin.auth) + assert resp.status_code == 200 + + # Verify it defaults to the most recent report data + attributes = resp.json['data']['attributes'] + assert attributes['user_count'] == 999 + + def test_get_without_report_date_uses_most_recent(self, app, url, institution, institutional_admin): + _summary_report_factory( + '2024-08', + institution, + user_count=0, + ) + _summary_report_factory( + '2024-09', + institution, + user_count=999, + ) + + resp = app.get(url, auth=institutional_admin.auth) + assert resp.status_code == 200 + + attributes = resp.json['data']['attributes'] + assert attributes['user_count'] == 999 + + +def _summary_report_factory(yearmonth, institution, **kwargs): + report = InstitutionMonthlySummaryReport( + report_yearmonth=yearmonth, + institution_id=institution._id, + **kwargs, + ) + report.save(refresh=True) + return report diff --git a/api_tests/institutions/views/test_institution_user_metric_list.py b/api_tests/institutions/views/test_institution_user_metric_list.py index dfee4d178f5..f83fd7fc3fa 100644 --- a/api_tests/institutions/views/test_institution_user_metric_list.py +++ b/api_tests/institutions/views/test_institution_user_metric_list.py @@ -1,22 +1,31 @@ -import pytest import datetime import csv +import json from io import StringIO from random import random -import time +from urllib.parse import urlencode + +import pytest +from waffle.testutils import override_flag -from api.base.settings.defaults import API_BASE, DEFAULT_ES_NULL_VALUE +from api.base.settings.defaults import API_BASE, DEFAULT_ES_NULL_VALUE, REPORT_FILENAME_FORMAT +import osf.features from osf_tests.factories import ( InstitutionFactory, AuthUserFactory, ) from osf.metrics import UserInstitutionProjectCounts -from api.base import settings +from osf.metrics.reports import InstitutionalUserReport -@pytest.mark.es +@pytest.mark.es_metrics @pytest.mark.django_db -class TestInstitutionUserMetricList: +class TestOldInstitutionUserMetricList: + + @pytest.fixture(autouse=True) + def _waffled(self): + with override_flag(osf.features.INSTITUTIONAL_DASHBOARD_2024, active=False): + yield # these tests apply only before institution dashboard improvements @pytest.fixture() def institution(self): @@ -52,33 +61,31 @@ def admin(self, institution): @pytest.fixture() def populate_counts(self, institution, user, user2): # Old data that shouldn't appear in responses - UserInstitutionProjectCounts.record( + UserInstitutionProjectCounts( user_id=user._id, institution_id=institution._id, department='Biology dept', public_project_count=4, private_project_count=4, timestamp=datetime.date(2019, 6, 4) - ).save() + ).save(refresh=True) # New data - UserInstitutionProjectCounts.record( + UserInstitutionProjectCounts( user_id=user._id, institution_id=institution._id, department='Biology dept', public_project_count=6, private_project_count=5, - ).save() + ).save(refresh=True) - UserInstitutionProjectCounts.record( + UserInstitutionProjectCounts( user_id=user2._id, 
institution_id=institution._id, department='Psychology dept', public_project_count=3, private_project_count=2, - ).save() - - time.sleep(10) + ).save(refresh=True) @pytest.fixture() def populate_more_counts(self, institution, user, user2, user3, populate_counts): @@ -89,34 +96,30 @@ def populate_more_counts(self, institution, user, user2, user3, populate_counts) users.append(AuthUserFactory()) for test_user in users: - UserInstitutionProjectCounts.record( + UserInstitutionProjectCounts( user_id=test_user._id, institution_id=institution._id, department='Psychology dept', public_project_count=int(10 * random()), private_project_count=int(10 * random()), - ).save() + ).save(refresh=True) - UserInstitutionProjectCounts.record( + UserInstitutionProjectCounts( user_id=user3._id, institution_id=institution._id, department='Psychology dept', public_project_count=int(10 * random()), private_project_count=int(10 * random()), - ).save() - - time.sleep(10) + ).save(refresh=True) @pytest.fixture() def populate_na_department(self, institution, user4): - UserInstitutionProjectCounts.record( + UserInstitutionProjectCounts( user_id=user4._id, institution_id=institution._id, public_project_count=1, private_project_count=1, - ).save() - - time.sleep(10) + ).save(refresh=True) @pytest.fixture() def url(self, institution): @@ -218,7 +221,6 @@ def test_filter(self, app, url, admin, populate_counts): resp = app.get(f'{url}?filter[department]=Psychology dept', auth=admin.auth) assert resp.json['data'][0]['attributes']['department'] == 'Psychology dept' - @pytest.mark.skipif(settings.CI_ENV, reason='Non-deterministic fails on CI') def test_sort_and_pagination(self, app, url, user, user2, user3, admin, populate_counts, populate_more_counts, institution): resp = app.get(f'{url}?sort=user_name&page[size]=1&page=2', auth=admin.auth) assert resp.status_code == 200 @@ -229,7 +231,6 @@ def test_sort_and_pagination(self, app, url, user, user2, user3, admin, populate assert resp.json['links']['meta']['total'] == 11 assert resp.json['data'][-1]['attributes']['user_name'] == 'Zedd' - @pytest.mark.skipif(settings.CI_ENV, reason='Non-deterministic fails on CI') def test_filter_and_pagination(self, app, user, user2, user3, url, admin, populate_counts, populate_more_counts, institution): resp = app.get(f'{url}?page=2', auth=admin.auth) assert resp.json['links']['meta']['total'] == 11 @@ -238,7 +239,6 @@ def test_filter_and_pagination(self, app, user, user2, user3, url, admin, popula assert resp.json['links']['meta']['total'] == 1 assert resp.json['data'][0]['attributes']['user_name'] == 'Zedd' - @pytest.mark.skipif(settings.CI_ENV, reason='Non-deterministic fails on CI') def test_filter_and_sort(self, app, url, user, user2, user3, admin, user4, populate_counts, populate_na_department, institution): """ Testing for bug where sorting and filtering would throw 502. 
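# --- editorial aside: illustrative sketch, not part of the patch ---
# The fixture changes in this file replace `UserInstitutionProjectCounts.record(...)`
# plus a 10-second `time.sleep` with constructing the metric document directly and
# calling `.save(refresh=True)`. In elasticsearch-dsl (which these metric documents
# appear to build on via elasticsearch-metrics), keyword arguments to `Document.save`
# are forwarded to the underlying index call, and `refresh=True` forces an index
# refresh as part of the write so the document is searchable immediately -- which is
# why the sleeps and the `CI_ENV` skip markers for non-determinism can be dropped.
# A minimal sketch of the pattern, using a hypothetical document class:

from elasticsearch_dsl import Document, Integer, Keyword, connections

connections.create_connection(hosts=['localhost:9200'])  # assumed local ES

class ExampleCount(Document):
    # hypothetical metric document, for illustration only
    user_id = Keyword()
    public_project_count = Integer()

    class Index:
        name = 'example-counts'

doc = ExampleCount(user_id='abcde', public_project_count=3)
doc.save(refresh=True)  # write + refresh in one call: searches see `doc` right away
# --- end aside ---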
@@ -265,3 +265,374 @@ def test_filter_and_sort(self, app, url, user, user2, user3, admin, user4, popul assert data[0]['attributes']['department'] == 'Biology dept' assert data[1]['attributes']['department'] == 'N/A' assert data[2]['attributes']['department'] == 'Psychology dept' + + +@pytest.mark.es_metrics +@pytest.mark.django_db +class TestNewInstitutionUserMetricList: + @pytest.fixture(autouse=True) + def _waffled(self): + with override_flag(osf.features.INSTITUTIONAL_DASHBOARD_2024, active=True): + yield # these tests apply only after institution dashboard improvements + + @pytest.fixture() + def institution(self): + return InstitutionFactory() + + @pytest.fixture() + def rando(self): + return AuthUserFactory() + + @pytest.fixture() + def institutional_admin(self, institution): + _admin_user = AuthUserFactory() + institution.get_group('institutional_admins').user_set.add(_admin_user) + return _admin_user + + @pytest.fixture() + def unshown_reports(self, institution): + # unshown because another institution + _another_institution = InstitutionFactory() + _report_factory('2024-08', _another_institution, user_id='nother_inst') + # unshown because old + _report_factory('2024-07', institution, user_id='old') + + @pytest.fixture() + def reports(self, institution): + return [ + _report_factory( + '2024-08', institution, + user_id='u_sparse', + storage_byte_count=53, + ), + _report_factory( + '2024-08', institution, + user_id='u_orc', + orcid_id='5555-4444-3333-2222', + storage_byte_count=8277, + ), + _report_factory( + '2024-08', institution, + user_id='u_blargl', + department_name='blargl', + storage_byte_count=34834834, + ), + _report_factory( + '2024-08', institution, + user_id='u_orcomma', + orcid_id='4444-3333-2222-1111', + department_name='a department, or so, that happens, incidentally, to have commas', + storage_byte_count=736662999298, + ), + ] + + @pytest.fixture() + def url(self, institution): + return f'/{API_BASE}institutions/{institution._id}/metrics/users/' + + def test_anon(self, app, url): + _resp = app.get(url, expect_errors=True) + assert _resp.status_code == 401 + + def test_rando(self, app, url, rando): + _resp = app.get(url, auth=rando.auth, expect_errors=True) + assert _resp.status_code == 403 + + def test_get_empty(self, app, url, institutional_admin): + _resp = app.get(url, auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert _resp.json['data'] == [] + + def test_get_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + _resp = app.get(url, auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert len(_resp.json['data']) == len(reports) + _expected_user_ids = {_report.user_id for _report in reports} + assert set(_user_ids(_resp)) == _expected_user_ids + + def test_filter_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + for _query, _expected_user_ids in ( + ({'filter[department]': 'nunavum'}, set()), + ({'filter[department]': 'incidentally'}, set()), + ({'filter[department]': 'blargl'}, {'u_blargl'}), + ({'filter[department]': 'a department, or so, that happens, incidentally, to have commas'}, {'u_orcomma'}), + ({'filter[department][eq]': 'nunavum'}, set()), + ({'filter[department][eq]': 'blargl'}, {'u_blargl'}), + ({'filter[department][eq]': 'a department, or so, that happens, incidentally, to have commas'}, {'u_orcomma'}), + ({'filter[department][ne]': 'nunavum'}, {'u_sparse', 'u_blargl', 'u_orc', 'u_orcomma'}), + + ({'filter[orcid_id][eq]': 
'5555-4444-3333-2222'}, {'u_orc'}), + ({'filter[orcid_id][ne]': ''}, {'u_orc', 'u_orcomma'}), + ({'filter[orcid_id][eq]': ''}, {'u_sparse', 'u_blargl'}), + ({ + 'filter[orcid_id]': '', + 'filter[department]': 'blargl', + }, {'u_blargl'}), + ({ + 'filter[orcid_id]': '', + 'filter[department][ne]': 'blargl', + }, {'u_sparse'}), + ({ + 'filter[orcid_id]': '5555-4444-3333-2222', + 'filter[department][ne]': 'blargl', + }, {'u_orc'}), + ({ + 'filter[orcid_id]': '5555-4444-3333-2222', + 'filter[department][ne]': '', + }, set()), + ): + _resp = app.get(f'{url}?{urlencode(_query)}', auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert set(_user_ids(_resp)) == _expected_user_ids + + def test_sort_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + for _query, _expected_user_id_list in ( + ({'sort': 'storage_byte_count'}, ['u_sparse', 'u_orc', 'u_blargl', 'u_orcomma']), + ({'sort': '-storage_byte_count'}, ['u_orcomma', 'u_blargl', 'u_orc', 'u_sparse']), + ): + _resp = app.get(f'{url}?{urlencode(_query)}', auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert list(_user_ids(_resp)) == _expected_user_id_list + + def test_paginate_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + for _query, _expected_user_id_list in ( + ({'sort': 'storage_byte_count', 'page[size]': 2}, ['u_sparse', 'u_orc']), + ({'sort': 'storage_byte_count', 'page[size]': 2, 'page': 2}, ['u_blargl', 'u_orcomma']), + ({'sort': '-storage_byte_count', 'page[size]': 3}, ['u_orcomma', 'u_blargl', 'u_orc']), + ({'sort': '-storage_byte_count', 'page[size]': 3, 'page': 2}, ['u_sparse']), + ): + _resp = app.get(f'{url}?{urlencode(_query)}', auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert list(_user_ids(_resp)) == _expected_user_id_list + + @pytest.mark.parametrize('format_type, delimiter, content_type', [ + ('csv', ',', 'text/csv; charset=utf-8'), + ('tsv', '\t', 'text/tab-separated-values; charset=utf-8') + ]) + def test_get_report_formats_csv_tsv(self, app, url, institutional_admin, institution, format_type, delimiter, + content_type): + _report_factory( + '2024-08', + institution, + user_id='u_orcomma', + account_creation_date='2018-02', + user_name='Jason Kelce', + orcid_id='4444-3333-2222-1111', + department_name='Center, \t Greatest Ever', + storage_byte_count=736662999298, + embargoed_registration_count=1, + published_preprint_count=1, + public_registration_count=2, + public_project_count=3, + public_file_count=4, + private_project_count=5, + month_last_active='2018-02', + month_last_login='2018-02', + ) + + resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth) + assert resp.status_code == 200 + assert resp.headers['Content-Type'] == content_type + + current_date = datetime.datetime.now().strftime('%Y-%m') + expected_filename = REPORT_FILENAME_FORMAT.format( + view_name='institution-user-metrics', + date_created=current_date, + extension=format_type + ) + assert resp.headers['Content-Disposition'] == f'attachment; filename="{expected_filename}"' + + response_body = resp.text + expected_response = [ + [ + 'account_creation_date', + 'department', + 'embargoed_registration_count', + 'month_last_active', + 'month_last_login', + 'orcid_id', + 'private_projects', + 'public_file_count', + 'public_projects', + 'public_registration_count', + 'published_preprint_count', + 'storage_byte_count', + 'user_name' + ], + [ + '2018-02', + 'Center, \t Greatest Ever', + '1', + '2018-02', + 
'2018-02', + '4444-3333-2222-1111', + '5', + '4', + '3', + '2', + '1', + '736662999298', + 'Jason Kelce' + ] + ] + + if delimiter: + with StringIO(response_body) as file: + reader = csv.reader(file, delimiter=delimiter) + response_rows = list(reader) + assert response_rows[0] == expected_response[0] + assert sorted(response_rows[1:]) == sorted(expected_response[1:]) + + @pytest.mark.parametrize('format_type, delimiter, content_type', [ + ('csv', ',', 'text/csv; charset=utf-8'), + ('tsv', '\t', 'text/tab-separated-values; charset=utf-8') + ]) + def test_csv_tsv_ignores_pagination(self, app, url, institutional_admin, institution, format_type, delimiter, + content_type): + # Create 15 records, exceeding the default page size of 10 + num_records = 15 + expected_data = [] + for i in range(num_records): + _report_factory( + '2024-08', + institution, + user_id=f'u_orcomma_{i}', + account_creation_date='2018-02', + user_name=f'Jalen Hurts #{i}', + orcid_id=f'4444-3333-2222-111{i}', + department_name='QBatman', + storage_byte_count=736662999298 + i, + embargoed_registration_count=1, + published_preprint_count=1, + public_registration_count=2, + public_project_count=3, + public_file_count=4, + private_project_count=5, + month_last_active='2018-02', + month_last_login='2018-02', + ) + expected_data.append([ + '2018-02', + 'QBatman', + '1', + '2018-02', + '2018-02', + f'4444-3333-2222-111{i}', + '5', + '4', + '3', + '2', + '1', + str(736662999298 + i), + f'Jalen Hurts #{i}', + ]) + + # Make request for CSV format with page[size]=10 + resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth) + assert resp.status_code == 200 + assert resp.headers['Content-Type'] == content_type + + current_date = datetime.datetime.now().strftime('%Y-%m') + expected_filename = REPORT_FILENAME_FORMAT.format( + view_name='institution-user-metrics', + date_created=current_date, + extension=format_type + ) + assert resp.headers['Content-Disposition'] == f'attachment; filename="{expected_filename}"' + + # Validate the CSV content contains all 15 records, ignoring the default pagination of 10 + response_body = resp.text + rows = response_body.splitlines() + + assert len(rows) == num_records + 1 == 16 # 1 header + 15 records + + if delimiter: + with StringIO(response_body) as file: + reader = csv.reader(file, delimiter=delimiter) + response_rows = list(reader) + # Validate header row + expected_header = [ + 'account_creation_date', + 'department', + 'embargoed_registration_count', + 'month_last_active', + 'month_last_login', + 'orcid_id', + 'private_projects', + 'public_file_count', + 'public_projects', + 'public_registration_count', + 'published_preprint_count', + 'storage_byte_count', + 'user_name' + ] + assert response_rows[0] == expected_header + # Sort both expected and actual rows (ignoring the header) before comparison + assert sorted(response_rows[1:]) == sorted(expected_data) + + def test_get_report_format_table_json(self, app, url, institutional_admin, institution): + _report_factory( + '2024-08', + institution, + user_id='u_orcomma', + account_creation_date='2018-02', + user_name='Brian Dawkins', + orcid_id='4444-3333-2222-1111', + department_name='Safety "The Wolverine" Weapon X', + storage_byte_count=736662999298, + embargoed_registration_count=1, + published_preprint_count=1, + public_registration_count=2, + public_project_count=3, + public_file_count=4, + private_project_count=5, + month_last_active='2018-02', + month_last_login='2018-02', + ) + + resp = app.get(f'{url}?format=json_report', 
auth=institutional_admin.auth) + assert resp.status_code == 200 + assert resp.headers['Content-Type'] == 'application/json; charset=utf-8' + + current_date = datetime.datetime.now().strftime('%Y-%m') + expected_filename = REPORT_FILENAME_FORMAT.format( + view_name='institution-user-metrics', + date_created=current_date, + extension='json' + ) + assert resp.headers['Content-Disposition'] == f'attachment; filename="{expected_filename}"' + + # Validate JSON structure and content + response_data = json.loads(resp.body) + expected_data = [ + { + 'account_creation_date': '2018-02', + 'department': 'Safety "The Wolverine" Weapon X', + 'embargoed_registration_count': 1, + 'month_last_active': '2018-02', + 'month_last_login': '2018-02', + 'orcid_id': '4444-3333-2222-1111', + 'private_projects': 5, + 'public_file_count': 4, + 'public_projects': 3, + 'public_registration_count': 2, + 'published_preprint_count': 1, + 'storage_byte_count': 736662999298, + 'user_name': 'Brian Dawkins' + } + ] + assert response_data == expected_data + + +def _user_ids(api_response): + for _datum in api_response.json['data']: + yield _datum['relationships']['user']['data']['id'] + +def _report_factory(yearmonth, institution, **kwargs): + _report = InstitutionalUserReport( + report_yearmonth=yearmonth, + institution_id=institution._id, + **kwargs, + ) + _report.save(refresh=True) + return _report diff --git a/api_tests/metrics/test_composite_query.py b/api_tests/metrics/test_composite_query.py index fd36c0c5f24..0cd0b3bb180 100644 --- a/api_tests/metrics/test_composite_query.py +++ b/api_tests/metrics/test_composite_query.py @@ -29,7 +29,7 @@ def base_url(): return f'/{API_BASE}metrics/preprints/' -@pytest.mark.es +@pytest.mark.es_metrics @pytest.mark.django_db class TestElasticSearch: diff --git a/api_tests/metrics/test_preprint_metrics.py b/api_tests/metrics/test_preprint_metrics.py index 57e31655c40..1bde8719b75 100644 --- a/api_tests/metrics/test_preprint_metrics.py +++ b/api_tests/metrics/test_preprint_metrics.py @@ -116,7 +116,7 @@ def test_custom_metric_malformed_query(self, mock_execute, app, user, base_url): assert res.status_code == 400 assert res.json['errors'][0]['detail'] == 'Malformed elasticsearch query.' 
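# --- editorial aside: illustrative sketch, not part of the patch ---
# Across these test modules, `@pytest.mark.es` is renamed to
# `@pytest.mark.es_metrics`: the conftest fixture later in this patch keys off
# the `es_metrics` marker to build temporary, prefixed metrics indices around
# each marked test instead of wiping every index wholesale. A minimal sketch of
# how a test opts in (hypothetical test body and fixtures):

import pytest

@pytest.mark.es_metrics   # picked up by the autouse marker fixture in conftest.py
@pytest.mark.django_db
def test_example_metric_query(app, user):
    ...  # touches metrics indices; index setup/teardown is handled by the fixture
# --- end aside ---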
- @pytest.mark.es + @pytest.mark.es_metrics def test_agg_query(self, app, user, base_url): post_url = f'{base_url}downloads/' diff --git a/api_tests/metrics/test_raw_metrics.py b/api_tests/metrics/test_raw_metrics.py index c7feb69426b..6a3b9b8f8c5 100644 --- a/api_tests/metrics/test_raw_metrics.py +++ b/api_tests/metrics/test_raw_metrics.py @@ -14,7 +14,7 @@ pytestmark = pytest.mark.django_db -@pytest.mark.es +@pytest.mark.es_metrics class TestRawMetrics: @pytest.fixture(autouse=True) @@ -22,6 +22,12 @@ def enable_elasticsearch_metrics(self): with override_switch(features.ENABLE_RAW_METRICS, active=True): yield + @pytest.fixture(autouse=True) + def teardown_customer_index(self, es6_client): + es6_client.indices.delete(index='customer', ignore_unavailable=True) + yield + es6_client.indices.delete(index='customer', ignore_unavailable=True) + @pytest.fixture def user(self): user = AuthUserFactory() @@ -132,7 +138,7 @@ def test_post_and_get(self, app, user, base_url): time.sleep(3) - get_url = f'{base_url}_search?q=*' + get_url = f'{base_url}customer/_search?q=*' res = app.get(get_url, auth=user.auth) assert res.json['hits']['total'] == 1 diff --git a/api_tests/metrics/test_registries_moderation_metrics.py b/api_tests/metrics/test_registries_moderation_metrics.py index d8b78cdf5ad..93cde9f1121 100644 --- a/api_tests/metrics/test_registries_moderation_metrics.py +++ b/api_tests/metrics/test_registries_moderation_metrics.py @@ -22,7 +22,7 @@ def enable_elasticsearch_metrics(self): with override_switch(features.ELASTICSEARCH_METRICS, active=True): yield - @pytest.mark.es + @pytest.mark.es_metrics def test_record_transitions(self, registration): registration._write_registration_action( RegistrationModerationStates.INITIAL, @@ -70,7 +70,7 @@ def other_user(self): def base_url(self): return '/_/metrics/registries_moderation/transitions/' - @pytest.mark.es + @pytest.mark.es_metrics def test_registries_moderation_view(self, app, user, base_url, registration): registration._write_registration_action( RegistrationModerationStates.INITIAL, diff --git a/api_tests/share/_utils.py b/api_tests/share/_utils.py index 9595aaf1b81..a04808cac3c 100644 --- a/api_tests/share/_utils.py +++ b/api_tests/share/_utils.py @@ -12,6 +12,7 @@ ) from website import settings as website_settings from api.share.utils import shtrove_ingest_url, sharev2_push_url +from osf.metadata.osf_gathering import OsfmapPartition @contextlib.contextmanager @@ -40,36 +41,67 @@ def mock_update_share(): @contextlib.contextmanager -def expect_ingest_request(mock_share_responses, osfguid, *, token=None, delete=False, count=1): +def expect_ingest_request(mock_share_responses, osfguid, *, token=None, delete=False, count=1, error_response=False): mock_share_responses._calls.reset() yield - _double_count = count * 2 # pushing to share two ways - assert len(mock_share_responses.calls) == _double_count, ( - f'expected {_double_count} call(s), got {len(mock_share_responses.calls)}: {list(mock_share_responses.calls)}' + _legacy_count_per_item = 1 + _trove_main_count_per_item = 1 + _trove_supplementary_count_per_item = ( + 0 + if (error_response or delete) + else (len(OsfmapPartition) - 1) ) + _total_count = count * ( + _legacy_count_per_item + + _trove_main_count_per_item + + _trove_supplementary_count_per_item + ) + assert len(mock_share_responses.calls) == _total_count, ( + f'expected {_total_count} call(s), got {len(mock_share_responses.calls)}: {list(mock_share_responses.calls)}' + ) + _trove_ingest_calls = [] + _trove_supp_ingest_calls = [] + 
_legacy_push_calls = [] for _call in mock_share_responses.calls: if _call.request.url.startswith(shtrove_ingest_url()): - assert_ingest_request(_call.request, osfguid, token=token, delete=delete) + if 'is_supplementary' in _call.request.url: + _trove_supp_ingest_calls.append(_call) + else: + _trove_ingest_calls.append(_call) else: - assert _call.request.url.startswith(sharev2_push_url()) + _legacy_push_calls.append(_call) + assert len(_trove_ingest_calls) == count + assert len(_trove_supp_ingest_calls) == count * _trove_supplementary_count_per_item + assert len(_legacy_push_calls) == count + for _call in _trove_ingest_calls: + assert_ingest_request(_call.request, osfguid, token=token, delete=delete) + for _call in _trove_supp_ingest_calls: + assert_ingest_request(_call.request, osfguid, token=token, delete=delete, supp=True) + for _call in _legacy_push_calls: + assert _call.request.url.startswith(sharev2_push_url()) -def assert_ingest_request(request, expected_osfguid, *, token=None, delete=False): +def assert_ingest_request(request, expected_osfguid, *, token=None, delete=False, supp=False): _querydict = QueryDict(urlsplit(request.path_url).query) - assert _querydict['record_identifier'] == expected_osfguid + if supp: + assert _querydict['record_identifier'].startswith(expected_osfguid) + assert _querydict['record_identifier'] != expected_osfguid + else: + assert _querydict['record_identifier'] == expected_osfguid if delete: assert request.method == 'DELETE' else: assert request.method == 'POST' _focus_iri = _querydict['focus_iri'] assert _focus_iri == f'{website_settings.DOMAIN}{expected_osfguid}' - assert _focus_iri in request.body.decode('utf-8') + _request_body = request.body.decode('utf-8') + assert (_focus_iri in _request_body) or (supp and not _request_body.strip()) _token = token or website_settings.SHARE_API_TOKEN assert request.headers['Authorization'] == f'Bearer {_token}' @contextlib.contextmanager -def expect_preprint_ingest_request(mock_share_responses, preprint, *, delete=False, count=1): +def expect_preprint_ingest_request(mock_share_responses, preprint, *, delete=False, count=1, error_response=False): # same as expect_ingest_request, but with convenience for preprint specifics # and postcommit-task handling (so on_preprint_updated actually runs) with expect_ingest_request( @@ -78,6 +110,7 @@ def expect_preprint_ingest_request(mock_share_responses, preprint, *, delete=Fal token=preprint.provider.access_token, delete=delete, count=count, + error_response=error_response, ): # clear out postcommit tasks from factories postcommit_queue().clear() diff --git a/api_tests/share/test_share_preprint.py b/api_tests/share/test_share_preprint.py index aa4d769d1f7..4ab47963bc8 100644 --- a/api_tests/share/test_share_preprint.py +++ b/api_tests/share/test_share_preprint.py @@ -133,7 +133,7 @@ def test_no_call_async_update_on_400_failure(self, mock_share_responses, preprin mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=400) mock_share_responses.replace(responses.POST, sharev2_push_url(), status=400) preprint.set_published(True, auth=auth, save=True) - with expect_preprint_ingest_request(mock_share_responses, preprint, count=1): + with expect_preprint_ingest_request(mock_share_responses, preprint, count=1, error_response=True): preprint.update_search() def test_delete_from_share(self, mock_share_responses): diff --git a/conftest.py b/conftest.py index 2eb51df076e..6f870093ed4 100644 --- a/conftest.py +++ b/conftest.py @@ -1,3 +1,4 @@ +import contextlib from 
unittest import mock import logging import os @@ -5,7 +6,9 @@ from django.core.management import call_command from django.db import transaction +from elasticsearch import exceptions as es_exceptions from elasticsearch_dsl.connections import connections +from elasticsearch_metrics.registry import registry as es_metrics_registry from faker import Factory import pytest import responses @@ -133,22 +136,44 @@ def es6_client(setup_connections): @pytest.fixture(scope='function', autouse=True) -def _es_marker(request): +def _es_metrics_marker(request): """Clear out all indices and index templates before and after - tests marked with ``es``. + tests marked with `es_metrics`. """ - marker = request.node.get_closest_marker('es') + marker = request.node.get_closest_marker('es_metrics') if marker: es6_client = request.getfixturevalue('es6_client') - - def teardown_es(): - es6_client.indices.delete(index='*') - es6_client.indices.delete_template('*') - - teardown_es() - call_command('sync_metrics') - yield - teardown_es() + _temp_prefix = 'temp_metrics_' + _temp_wildcard = f'{_temp_prefix}*' + + def _teardown_es_temps(): + es6_client.indices.delete(index=_temp_wildcard) + try: + es6_client.indices.delete_template(_temp_wildcard) + except es_exceptions.NotFoundError: + pass + + @contextlib.contextmanager + def _mock_metric_names(): + with contextlib.ExitStack() as _exit: + for _metric_class in es_metrics_registry.get_metrics(): + _exit.enter_context(mock.patch.object( + _metric_class, + '_template_name', # also used to construct index names + f'{_temp_prefix}{_metric_class._template_name}', + )) + _exit.enter_context(mock.patch.object( + _metric_class, + '_template', # a wildcard string for indexes and templates + f'{_temp_prefix}{_metric_class._template}', + )) + yield + + _teardown_es_temps() + with _mock_metric_names(): + call_command('sync_metrics') + yield + _teardown_es_temps() else: yield diff --git a/osf/admin.py b/osf/admin.py index 2bfd8c2cc35..71c0ae8172b 100644 --- a/osf/admin.py +++ b/osf/admin.py @@ -6,6 +6,7 @@ from django.db.models import Q, Count from django.http import HttpResponseRedirect from django.urls import reverse +import waffle from osf.external.spam.tasks import reclassify_domain_references from osf.models import OSFUser, Node, NotableDomain, NodeLicense @@ -140,7 +141,24 @@ def get_queryset(self, request): qs = super().get_queryset(request).annotate(number_of_references=Count('domainreference')) return qs + +class _ManygroupWaffleFlagAdmin(waffle.admin.FlagAdmin): + '''customized `waffle.admin.FlagAdmin` to support many groups + + waffle assumes "there are likely not that many" groups [0], + but in osf there are, in fact, that many groups. 
+ + [0]: https://github.com/jazzband/django-waffle/commit/bf36c19ee03baf1c5850ffe0b284900a5c416f53 + ''' + raw_id_fields = (*waffle.admin.FlagAdmin.raw_id_fields, 'groups') + + admin.site.register(OSFUser, OSFUserAdmin) admin.site.register(Node, NodeAdmin) admin.site.register(NotableDomain, NotableDomainAdmin) admin.site.register(NodeLicense, LicenseAdmin) + +# waffle admins, with Flag admin override +admin.site.register(waffle.models.Flag, _ManygroupWaffleFlagAdmin) +admin.site.register(waffle.models.Sample, waffle.admin.SampleAdmin) +admin.site.register(waffle.models.Switch, waffle.admin.SwitchAdmin) diff --git a/osf/features.yaml b/osf/features.yaml index c6f02ce2994..a3f0fcc1f14 100644 --- a/osf/features.yaml +++ b/osf/features.yaml @@ -189,6 +189,10 @@ flags: note: This is not used everyone: true + - flag_name: INSTITUTIONAL_DASHBOARD_2024 + name: institutional_dashboard_2024 + note: whether to surface older or updated (in 2024) institutional metrics + switches: - flag_name: DISABLE_ENGAGEMENT_EMAILS name: disable_engagement_emails diff --git a/osf/management/commands/make_dummy_pageviews_for_metrics.py b/osf/management/commands/make_dummy_pageviews_for_metrics.py index 11ff9ca69c9..09de34bf7a8 100644 --- a/osf/management/commands/make_dummy_pageviews_for_metrics.py +++ b/osf/management/commands/make_dummy_pageviews_for_metrics.py @@ -74,6 +74,8 @@ def _generate_random_countedusage(self, n, max_age): item_guid=ITEM_GUID, session_id='freshen by key', user_is_authenticated=bool(random.randint(0, 1)), + item_public=bool(random.randint(0, 1)), + action_labels=[['view', 'download'][random.randint(0, 1)]], ) def _run_date_query(self, time_range_filter): @@ -103,8 +105,8 @@ def _run_date_query(self, time_range_filter): }, }) return { - 'min': result.aggs['min-timestamp'].value_as_string, - 'max': result.aggs['max-timestamp'].value_as_string, + 'min': result.aggs['min-timestamp'].value, + 'max': result.aggs['max-timestamp'].value, **{ str(bucket.key.date()): bucket.doc_count for bucket in result.aggs['by-date'] diff --git a/osf/management/commands/monthly_reporters_go.py b/osf/management/commands/monthly_reporters_go.py index 8f9854a722b..c467640cd15 100644 --- a/osf/management/commands/monthly_reporters_go.py +++ b/osf/management/commands/monthly_reporters_go.py @@ -44,8 +44,11 @@ def monthly_reporters_go(report_year=None, report_month=None): ) def monthly_reporter_go(task, reporter_key: str, yearmonth: str): _reporter_class = AllMonthlyReporters[reporter_key].value - _parsed_yearmonth = YearMonth.from_str(yearmonth) - _reporter_class().run_and_record_for_month(_parsed_yearmonth) + _reporter = _reporter_class(YearMonth.from_str(yearmonth)) + _reporter.run_and_record_for_month() + _followup = _reporter.followup_task() + if _followup is not None: + _followup.apply_async() class Command(BaseCommand): @@ -58,10 +61,8 @@ def add_arguments(self, parser): ) def handle(self, *args, **options): - errors = monthly_reporters_go( + monthly_reporters_go( report_year=getattr(options.get('yearmonth'), 'year', None), report_month=getattr(options.get('yearmonth'), 'month', None), ) - for error_key, error_val in errors.items(): - self.stdout.write(self.style.ERROR(f'error running {error_key}: ') + error_val) - self.stdout.write(self.style.SUCCESS('done.')) + self.stdout.write(self.style.SUCCESS('reporter tasks scheduled.')) diff --git a/osf/metadata/gather/basket.py b/osf/metadata/gather/basket.py index f28a4dee6d6..eb28a087ad3 100644 --- a/osf/metadata/gather/basket.py +++ b/osf/metadata/gather/basket.py @@ 
-19,15 +19,14 @@ class Basket: def __init__(self, focus: Focus): assert isinstance(focus, Focus) self.focus = focus - self.reset() # start with an empty basket (except the focus itself) + self.reset() # start with an empty basket def reset(self): self._gathertasks_done = set() - self._known_focus_dict = {} + self._known_focus_dict = {self.focus.iri: {self.focus}} self.gathered_metadata = rdfutils.contextualized_graph() - self._add_focus_reference(self.focus) - def pls_gather(self, predicate_map): # TODO: async + def pls_gather(self, predicate_map, *, include_defaults=True): # TODO: async '''go gatherers, go! @predicate_map: dict with rdflib.URIRef keys @@ -48,7 +47,7 @@ def pls_gather(self, predicate_map): # TODO: async }, }) ''' - self._do_gather(self.focus, predicate_map) + self._do_gather(self.focus, predicate_map, include_defaults=include_defaults) def __getitem__(self, slice_or_arg) -> typing.Iterable[rdflib.term.Node]: '''convenience for getting values from the basket @@ -98,14 +97,20 @@ def _maybe_gather_for_predicate_map(self, iri_or_focus, predicate_map): else: raise ValueError(f'expected `iri_or_focus` to be Focus or URIRef (got {iri_or_focus})') - def _do_gather(self, focus, predicate_map): + def _do_gather(self, focus, predicate_map, *, include_defaults=True): + if include_defaults: + self._add_focus_reference(focus) if not isinstance(predicate_map, dict): # allow iterable of predicates with no deeper paths predicate_map = { predicate_iri: None for predicate_iri in predicate_map } - for gatherer in get_gatherers(focus.rdftype, predicate_map.keys()): + for gatherer in get_gatherers( + focus.rdftype, + predicate_map.keys(), + include_focustype_defaults=include_defaults, + ): for (subj, pred, obj) in self._do_a_gathertask(gatherer, focus): if isinstance(obj, Focus): self._add_focus_reference(obj) diff --git a/osf/metadata/gather/gatherer.py b/osf/metadata/gather/gatherer.py index 2a8822c9d2a..0630e6d61ae 100644 --- a/osf/metadata/gather/gatherer.py +++ b/osf/metadata/gather/gatherer.py @@ -61,11 +61,16 @@ def add_gatherer(gatherer, predicate_iris, focustype_iris): ) -def get_gatherers(focustype_iri, predicate_iris): +def get_gatherers(focustype_iri, predicate_iris, *, include_focustype_defaults=True): gatherer_set = set() for focustype in (None, focustype_iri): for_focustype = __gatherer_registry.get(focustype, {}) - for predicate in (None, *predicate_iris): + _predicates = ( + (None, *predicate_iris) + if include_focustype_defaults + else predicate_iris + ) + for predicate in _predicates: gatherer_set.update(for_focustype.get(predicate, ())) return gatherer_set diff --git a/osf/metadata/osf_gathering.py b/osf/metadata/osf_gathering.py index 6e5e25c6d0b..9783f7b0879 100644 --- a/osf/metadata/osf_gathering.py +++ b/osf/metadata/osf_gathering.py @@ -1,11 +1,14 @@ '''gatherers of metadata from the osf database, in particular ''' +import datetime +import enum import logging from django.contrib.contenttypes.models import ContentType from django import db import rdflib +from api.caching.tasks import get_storage_usage_total from osf import models as osfdb from osf.metadata import gather from osf.metadata.rdfutils import ( @@ -19,6 +22,7 @@ OSF, OSFIO, OWL, + PROV, RDF, ROR, SKOS, @@ -27,7 +31,12 @@ without_namespace, smells_like_iri, ) -from osf.utils import workflows as osfworkflows +from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.utils import YearMonth +from osf.utils import ( + workflows as osfworkflows, + permissions as osfpermissions, +) from 
osf.utils.outcomes import ArtifactTypes from website import settings as website_settings @@ -47,13 +56,6 @@ def pls_get_magic_metadata_basket(osf_item) -> gather.Basket: return gather.Basket(focus) -def osfmap_for_type(rdftype_iri: str): - try: - return OSFMAP[rdftype_iri] - except KeyError: - raise ValueError(f'invalid OSFMAP type! expected one of {set(OSFMAP.keys())}, got {rdftype_iri}') - - ##### END "public" api ##### @@ -88,6 +90,7 @@ def osfmap_for_type(rdftype_iri: str): OSF.isContainedBy: OSF_OBJECT_REFERENCE, OSF.fileName: None, OSF.filePath: None, + OSF.hasFileVersion: None, } OSF_OBJECT = { @@ -131,16 +134,7 @@ def osfmap_for_type(rdftype_iri: str): DCTERMS.creator: OSF_AGENT_REFERENCE, }, OWL.sameAs: None, -} - -OSF_FILEVERSION = { - DCTERMS.created: None, - DCTERMS.creator: OSF_AGENT_REFERENCE, - DCTERMS.extent: None, - DCTERMS.modified: None, - DCTERMS.requires: None, - DCTERMS['format']: None, - OSF.versionNumber: None, + PROV.qualifiedAttribution: None, } OSFMAP = { @@ -193,7 +187,7 @@ def osfmap_for_type(rdftype_iri: str): DCTERMS.modified: None, DCTERMS.title: None, DCTERMS.type: None, - OSF.hasFileVersion: OSF_FILEVERSION, + OSF.hasFileVersion: None, OSF.isContainedBy: OSF_OBJECT_REFERENCE, OSF.fileName: None, OSF.filePath: None, @@ -211,6 +205,57 @@ def osfmap_for_type(rdftype_iri: str): }, } +# metadata not included in the core record +OSFMAP_SUPPLEMENT = { + OSF.Project: { + OSF.hasOsfAddon: None, + OSF.storageByteCount: None, + OSF.storageRegion: None, + }, + OSF.ProjectComponent: { + OSF.hasOsfAddon: None, + OSF.storageByteCount: None, + OSF.storageRegion: None, + }, + OSF.Registration: { + OSF.storageByteCount: None, + OSF.storageRegion: None, + }, + OSF.RegistrationComponent: { + OSF.storageByteCount: None, + OSF.storageRegion: None, + }, + OSF.Preprint: { + OSF.storageByteCount: None, + OSF.storageRegion: None, + }, + OSF.File: { + }, +} + +# metadata not included in the core record that expires after a month +OSFMAP_MONTHLY_SUPPLEMENT = { + OSF.Project: { + OSF.usage: None, + }, + OSF.ProjectComponent: { + OSF.usage: None, + }, + OSF.Registration: { + OSF.usage: None, + }, + OSF.RegistrationComponent: { + OSF.usage: None, + }, + OSF.Preprint: { + OSF.usage: None, + }, + OSF.File: { + OSF.usage: None, + }, +} + + OSF_ARTIFACT_PREDICATES = { ArtifactTypes.ANALYTIC_CODE: OSF.hasAnalyticCodeResource, ArtifactTypes.DATA: OSF.hasDataResource, @@ -218,6 +263,11 @@ def osfmap_for_type(rdftype_iri: str): ArtifactTypes.PAPERS: OSF.hasPapersResource, ArtifactTypes.SUPPLEMENTS: OSF.hasSupplementalResource, } +OSF_CONTRIBUTOR_ROLES = { + osfpermissions.READ: OSF['readonly-contributor'], + osfpermissions.WRITE: OSF['write-contributor'], + osfpermissions.ADMIN: OSF['admin-contributor'], +} BEPRESS_SUBJECT_SCHEME_URI = 'https://bepress.com/reference_guide_dc/disciplines/' BEPRESS_SUBJECT_SCHEME_TITLE = 'bepress Digital Commons Three-Tiered Taxonomy' @@ -259,6 +309,37 @@ def osfmap_for_type(rdftype_iri: str): OSF.Registration: 'StudyRegistration', } + +class OsfmapPartition(enum.Enum): + MAIN = OSFMAP + SUPPLEMENT = OSFMAP_SUPPLEMENT + MONTHLY_SUPPLEMENT = OSFMAP_MONTHLY_SUPPLEMENT + + @property + def is_supplementary(self) -> bool: + return self is not OsfmapPartition.MAIN + + def osfmap_for_type(self, rdftype_iri: str): + try: + return self.value[rdftype_iri] + except KeyError: + if self.is_supplementary: + return {} # allow missing types for non-main partitions + raise ValueError(f'invalid OSFMAP type! 
expected one of {set(self.value.keys())}, got {rdftype_iri}') + + def get_expiration_date(self, basket: gather.Basket) -> datetime.date | None: + if self is not OsfmapPartition.MONTHLY_SUPPLEMENT: + return None + # let a monthly report expire two months after its reporting period ends + # (this allows the *next* monthly report up to a month to compute, which + # aligns with COUNTER https://www.countermetrics.org/code-of-practice/ ) + # (HACK: entangled with `gather_last_month_usage` implementation, below) + _report_yearmonth_str = next(basket[OSF.usage / DCTERMS.temporal], None) + if _report_yearmonth_str is None: + return None + _report_yearmonth = YearMonth.from_str(_report_yearmonth_str) + return _report_yearmonth.next().next().month_end().date() + ##### END osfmap ##### @@ -619,6 +700,8 @@ def _gather_fileversion(fileversion, fileversion_iri): version_sha256 = (fileversion.metadata or {}).get('sha256') if version_sha256: yield (fileversion_iri, DCTERMS.requires, checksum_iri('sha-256', version_sha256)) + if fileversion.region is not None: + yield from _storage_region_triples(fileversion.region, subject_ref=fileversion_iri) @gather.er(OSF.contains) @@ -819,11 +902,24 @@ def gather_agents(focus): # TODO: preserve order via rdflib.Seq +@gather.er(PROV.qualifiedAttribution) +def gather_qualified_attributions(focus): + _contributor_set = getattr(focus.dbmodel, 'contributor_set', None) + if _contributor_set is not None: + for _contributor in _contributor_set.filter(visible=True).select_related('user'): + _osfrole_ref = OSF_CONTRIBUTOR_ROLES.get(_contributor.permission) + if _osfrole_ref is not None: + _attribution_ref = rdflib.BNode() + yield (PROV.qualifiedAttribution, _attribution_ref) + yield (_attribution_ref, PROV.agent, OsfFocus(_contributor.user)) + yield (_attribution_ref, DCAT.hadRole, _osfrole_ref) + + @gather.er(OSF.affiliation) def gather_affiliated_institutions(focus): if hasattr(focus.dbmodel, 'get_affiliated_institutions'): # like OSFUser institution_qs = focus.dbmodel.get_affiliated_institutions() - elif hasattr(focus.dbmodel, 'affiliated_institutions'): # like AbstractNode + elif hasattr(focus.dbmodel, 'affiliated_institutions'): # like AbstractNode or Preprint institution_qs = focus.dbmodel.affiliated_institutions.all() else: institution_qs = () @@ -1029,3 +1125,63 @@ def gather_cedar_templates(focus): template_iri = rdflib.URIRef(record.get_template_semantic_iri()) yield (OSF.hasCedarTemplate, template_iri) yield (template_iri, DCTERMS.title, record.get_template_name()) + + +@gather.er(OSF.usage) +def gather_last_month_usage(focus): + _usage_report = PublicItemUsageReport.for_last_month( + item_osfid=osfguid_from_iri(focus.iri), + ) + if _usage_report is not None: + _usage_report_ref = rdflib.BNode() + yield (OSF.usage, _usage_report_ref) + yield (_usage_report_ref, DCAT.accessService, rdflib.URIRef(website_settings.DOMAIN.rstrip('/'))) + yield (_usage_report_ref, FOAF.primaryTopic, focus.iri) + yield (_usage_report_ref, DCTERMS.temporal, rdflib.Literal( + str(_usage_report.report_yearmonth), + datatype=rdflib.XSD.gYearMonth, + )) + yield (_usage_report_ref, OSF.viewCount, _usage_report.view_count) + yield (_usage_report_ref, OSF.viewSessionCount, _usage_report.view_session_count) + yield (_usage_report_ref, OSF.downloadCount, _usage_report.download_count) + yield (_usage_report_ref, OSF.downloadSessionCount, _usage_report.download_session_count) + + +@gather.er(OSF.hasOsfAddon) +def gather_addons(focus): + # note: when gravyvalet exists, use `iterate_addons_for_resource` 
+ # from osf.external.gravy_valet.request_helpers and get urls like + # "https://addons.osf.example/v1/addon-imps/..." instead of a urn + for _addon_settings in focus.dbmodel.get_addons(): + if not _addon_settings.config.added_default: # skip always-on addons + _addon_ref = rdflib.URIRef(f'urn:osf.io:addons:{_addon_settings.short_name}') + yield (OSF.hasOsfAddon, _addon_ref) + yield (_addon_ref, RDF.type, OSF.AddonImplementation) + yield (_addon_ref, DCTERMS.identifier, _addon_settings.short_name) + yield (_addon_ref, SKOS.prefLabel, _addon_settings.config.full_name) + + +@gather.er(OSF.storageRegion) +def gather_storage_region(focus): + _region = getattr(focus.dbmodel, 'osfstorage_region', None) + if _region is not None: + yield from _storage_region_triples(_region) + + +def _storage_region_triples(region, *, subject_ref=None): + _region_ref = rdflib.URIRef(region.absolute_api_v2_url) + if subject_ref is None: + yield (OSF.storageRegion, _region_ref) + else: + yield (subject_ref, OSF.storageRegion, _region_ref) + yield (_region_ref, SKOS.prefLabel, rdflib.Literal(region.name, lang='en')) + + +@gather.er( + OSF.storageByteCount, + focustype_iris=[OSF.Project, OSF.ProjectComponent, OSF.Registration, OSF.RegistrationComponent, OSF.Preprint] +) +def gather_storage_byte_count(focus): + _storage_usage_total = get_storage_usage_total(focus.dbmodel) + if _storage_usage_total is not None: + yield (OSF.storageByteCount, _storage_usage_total) diff --git a/osf/metadata/rdfutils.py b/osf/metadata/rdfutils.py index cd944169e20..d2596ad344e 100644 --- a/osf/metadata/rdfutils.py +++ b/osf/metadata/rdfutils.py @@ -23,6 +23,7 @@ RDF = rdflib.Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#') # "resource description framework" SKOS = rdflib.Namespace('http://www.w3.org/2004/02/skos/core#') # "simple knowledge organization system" DCAT = rdflib.Namespace('http://www.w3.org/ns/dcat#') # "data catalog (vocabulary)" +PROV = rdflib.Namespace('http://www.w3.org/ns/prov#') # "provenance" # non-standard namespace for datacite terms (resolves to datacite docs) DATACITE = rdflib.Namespace('https://schema.datacite.org/meta/kernel-4/#') @@ -38,6 +39,7 @@ 'skos': SKOS, 'dcmitype': DCMITYPE, 'dcat': DCAT, + 'prov': PROV, } diff --git a/osf/metadata/serializers/turtle.py b/osf/metadata/serializers/turtle.py index 649614b0bfa..e90db45f2f6 100644 --- a/osf/metadata/serializers/turtle.py +++ b/osf/metadata/serializers/turtle.py @@ -1,4 +1,4 @@ -from osf.metadata.osf_gathering import osfmap_for_type +from osf.metadata.osf_gathering import OsfmapPartition from osf.metadata.serializers import _base @@ -9,5 +9,9 @@ def filename_for_itemid(self, itemid: str): return f'{itemid}-metadata.ttl' def serialize(self) -> str: - self.basket.pls_gather(osfmap_for_type(self.basket.focus.rdftype)) + _partition = self.serializer_config.get('osfmap_partition', OsfmapPartition.MAIN) + self.basket.pls_gather( + _partition.osfmap_for_type(self.basket.focus.rdftype), + include_defaults=(_partition is OsfmapPartition.MAIN), + ) return self.basket.gathered_metadata.serialize(format='turtle') diff --git a/osf/metrics/counted_usage.py b/osf/metrics/counted_usage.py index e6a3abf9cd5..c3c6d4cc1aa 100644 --- a/osf/metrics/counted_usage.py +++ b/osf/metrics/counted_usage.py @@ -10,7 +10,6 @@ import pytz from osf.metrics.utils import stable_key -from osf.models import Guid logger = logging.getLogger(__name__) @@ -87,6 +86,7 @@ def _autofill_fields(sender, instance, **kwargs): _fill_pageview_info(instance) item_guid = getattr(instance, 'item_guid', 
None) if item_guid: + from osf.models import Guid guid_instance = Guid.load(item_guid) if guid_instance and guid_instance.referent: _fill_osfguid_info(instance, guid_instance.referent) @@ -104,10 +104,10 @@ def _fill_pageview_info(counted_usage): def _fill_osfguid_info(counted_usage, guid_referent): counted_usage.item_public = _get_ispublic(guid_referent) - counted_usage.item_type = type(guid_referent).__name__.lower() + counted_usage.item_type = get_item_type(guid_referent) counted_usage.surrounding_guids = _get_surrounding_guids(guid_referent) if not counted_usage.provider_id: - counted_usage.provider_id = _get_provider_id(guid_referent) + counted_usage.provider_id = get_provider_id(guid_referent) def _fill_document_id(counted_usage): @@ -153,7 +153,7 @@ def _get_ispublic(guid_referent): return getattr(maybe_public, 'is_public', None) # quacks like AbstractNode -def _get_provider_id(guid_referent): +def get_provider_id(guid_referent): provider = getattr(guid_referent, 'provider', None) if isinstance(provider, str): return provider # quacks like BaseFileNode @@ -162,6 +162,10 @@ def _get_provider_id(guid_referent): return 'osf' # quacks like Node, Comment, WikiPage +def get_item_type(guid_referent): + return type(guid_referent).__name__.lower() + + def _get_immediate_wrapper(guid_referent): if hasattr(guid_referent, 'verified_publishable'): return None # quacks like Preprint diff --git a/osf/metrics/reporters/__init__.py b/osf/metrics/reporters/__init__.py index 1f8e0fba862..412b1c2bf90 100644 --- a/osf/metrics/reporters/__init__.py +++ b/osf/metrics/reporters/__init__.py @@ -4,10 +4,13 @@ from .storage_addon_usage import StorageAddonUsageReporter from .download_count import DownloadCountReporter from .institution_summary import InstitutionSummaryReporter +from .institutional_users import InstitutionalUsersReporter +from .institution_summary_monthly import InstitutionalSummaryMonthlyReporter from .new_user_domain import NewUserDomainReporter from .node_count import NodeCountReporter from .osfstorage_file_count import OsfstorageFileCountReporter from .preprint_count import PreprintCountReporter +from .public_item_usage import PublicItemUsageReporter from .user_count import UserCountReporter from .spam_count import SpamCountReporter @@ -26,3 +29,6 @@ class AllDailyReporters(enum.Enum): class AllMonthlyReporters(enum.Enum): SPAM_COUNT = SpamCountReporter + INSTITUTIONAL_USERS = InstitutionalUsersReporter + INSTITUTIONAL_SUMMARY = InstitutionalSummaryMonthlyReporter + ITEM_USAGE = PublicItemUsageReporter diff --git a/osf/metrics/reporters/_base.py b/osf/metrics/reporters/_base.py index d3bf1722523..931afe23fd0 100644 --- a/osf/metrics/reporters/_base.py +++ b/osf/metrics/reporters/_base.py @@ -1,23 +1,34 @@ +from collections import abc +import dataclasses import logging +import celery + +from osf.metrics.reports import MonthlyReport from osf.metrics.utils import YearMonth logger = logging.getLogger(__name__) +@dataclasses.dataclass class MonthlyReporter: - def report(self, report_yearmonth: YearMonth): + yearmonth: YearMonth + + def report(self) -> abc.Iterable[MonthlyReport] | abc.Iterator[MonthlyReport]: """build a report for the given month """ raise NotImplementedError(f'{self.__name__} must implement `report`') - def run_and_record_for_month(self, report_yearmonth: YearMonth): - reports = self.report(report_yearmonth) + def run_and_record_for_month(self) -> None: + reports = self.report() for report in reports: - assert report.report_yearmonth == str(report_yearmonth) + 
report.report_yearmonth = self.yearmonth report.save() + def followup_task(self) -> celery.Signature | None: + return None + class DailyReporter: def report(self, report_date): diff --git a/osf/metrics/reporters/institution_summary_monthly.py b/osf/metrics/reporters/institution_summary_monthly.py new file mode 100644 index 00000000000..998cc056298 --- /dev/null +++ b/osf/metrics/reporters/institution_summary_monthly.py @@ -0,0 +1,105 @@ +from django.contrib.contenttypes.models import ContentType +from django.db.models import Q, F, Sum, OuterRef, Exists + +from osf.models import Institution, Preprint, AbstractNode, FileVersion, NodeLog, PreprintLog +from osf.models.spam import SpamStatus +from addons.osfstorage.models import OsfStorageFile +from osf.metrics.reports import InstitutionMonthlySummaryReport +from ._base import MonthlyReporter + + +class InstitutionalSummaryMonthlyReporter(MonthlyReporter): + """Generate an InstitutionMonthlySummaryReport for each institution.""" + + def report(self): + for institution in Institution.objects.all(): + yield self.generate_report(institution) + + def generate_report(self, institution): + node_queryset = institution.nodes.filter( + deleted__isnull=True, + created__lt=self.yearmonth.month_end() + ).exclude( + spam_status=SpamStatus.SPAM, + ) + + preprint_queryset = self.get_published_preprints(institution, self.yearmonth) + + return InstitutionMonthlySummaryReport( + institution_id=institution._id, + user_count=institution.get_institution_users().count(), + private_project_count=self._get_count(node_queryset, 'osf.node', is_public=False), + public_project_count=self._get_count(node_queryset, 'osf.node', is_public=True), + public_registration_count=self._get_count(node_queryset, 'osf.registration', is_public=True), + embargoed_registration_count=self._get_count(node_queryset, 'osf.registration', is_public=False), + published_preprint_count=preprint_queryset.count(), + storage_byte_count=self.get_storage_size(node_queryset, preprint_queryset), + public_file_count=self.get_files(node_queryset, preprint_queryset, is_public=True).count(), + monthly_logged_in_user_count=self.get_monthly_logged_in_user_count(institution, self.yearmonth), + monthly_active_user_count=self.get_monthly_active_user_count(institution, self.yearmonth), + ) + + def _get_count(self, node_queryset, node_type, is_public): + return node_queryset.filter(type=node_type, is_public=is_public, root_id=F('pk')).count() + + def get_published_preprints(self, institution, yearmonth): + queryset = Preprint.objects.can_view().filter( + affiliated_institutions=institution, + created__lte=yearmonth.month_end() + ).exclude( + spam_status=SpamStatus.SPAM + ) + + return queryset + + def get_files(self, node_queryset, preprint_queryset, is_public=None): + public_kwargs = {} + if is_public: + public_kwargs = {'is_public': is_public} + + target_node_q = Q( + target_object_id__in=node_queryset.filter(**public_kwargs).values('pk'), + target_content_type=ContentType.objects.get_for_model(AbstractNode), + ) + target_preprint_q = Q( + target_object_id__in=preprint_queryset.values('pk'), + target_content_type=ContentType.objects.get_for_model(Preprint), + ) + return OsfStorageFile.objects.filter( + deleted__isnull=True, purged__isnull=True + ).filter(target_node_q | target_preprint_q) + + def get_storage_size(self, node_queryset, preprint_queryset): + files = self.get_files(node_queryset, preprint_queryset) + return FileVersion.objects.filter( + size__gt=0, + purged__isnull=True, + basefilenode__in=files + 
).aggregate(storage_bytes=Sum('size', default=0))['storage_bytes'] + + def get_monthly_logged_in_user_count(self, institution, yearmonth): + return institution.get_institution_users().filter( + date_last_login__gte=yearmonth.month_start(), + date_last_login__lt=yearmonth.month_end() + ).count() + + def get_monthly_active_user_count(self, institution, yearmonth): + start_date = yearmonth.month_start() + end_date = yearmonth.month_end() + + nodelogs = NodeLog.objects.filter( + user=OuterRef('pk'), + created__gte=start_date, + created__lt=end_date + ) + preprintlogs = PreprintLog.objects.filter( + user=OuterRef('pk'), + created__gte=start_date, + created__lt=end_date + ) + + return institution.get_institution_users().filter( + date_disabled__isnull=True + ).annotate( + has_logs=Exists(nodelogs) | Exists(preprintlogs) + ).filter(has_logs=True).count() diff --git a/osf/metrics/reporters/institutional_users.py b/osf/metrics/reporters/institutional_users.py new file mode 100644 index 00000000000..e0f7f42a156 --- /dev/null +++ b/osf/metrics/reporters/institutional_users.py @@ -0,0 +1,161 @@ +import dataclasses +import datetime + +from django.contrib.contenttypes.models import ContentType +from django.db.models import Q, F, Sum + +from osf import models as osfdb +from osf.models.spam import SpamStatus +from addons.osfstorage.models import OsfStorageFile +from osf.metrics.reports import InstitutionalUserReport +from osf.metrics.utils import YearMonth +from ._base import MonthlyReporter + + +_CHUNK_SIZE = 500 + + +class InstitutionalUsersReporter(MonthlyReporter): + '''build an InstitutionalUserReport for each institution-user affiliation + + built for the institution dashboard at ://osf.example/institutions//dashboard/, + which offers institutional admins insight into how people at their institution are + using osf, based on their explicitly-affiliated osf objects + ''' + def report(self): + _before_datetime = self.yearmonth.month_end() + for _institution in osfdb.Institution.objects.filter(created__lt=_before_datetime): + _user_qs = _institution.get_institution_users().filter(created__lt=_before_datetime) + for _user in _user_qs.iterator(chunk_size=_CHUNK_SIZE): + _helper = _InstiUserReportHelper(_institution, _user, self.yearmonth, _before_datetime) + yield _helper.report + + +# helper +@dataclasses.dataclass +class _InstiUserReportHelper: + institution: osfdb.Institution + user: osfdb.OSFUser + yearmonth: YearMonth + before_datetime: datetime.datetime + report: InstitutionalUserReport = dataclasses.field(init=False) + + def __post_init__(self): + _affiliation = self.user.get_institution_affiliation(self.institution._id) + self.report = InstitutionalUserReport( + institution_id=self.institution._id, + user_id=self.user._id, + user_name=self.user.fullname, + department_name=(_affiliation.sso_department or None), + month_last_login=( + YearMonth.from_date(self.user.date_last_login) + if self.user.date_last_login is not None + else None + ), + month_last_active=self._get_last_active(), + account_creation_date=YearMonth.from_date(self.user.created), + orcid_id=self.user.get_verified_external_id('ORCID', verified_only=True), + public_project_count=self._public_project_queryset().count(), + private_project_count=self._private_project_queryset().count(), + public_registration_count=self._public_registration_queryset().count(), + embargoed_registration_count=self._embargoed_registration_queryset().count(), + public_file_count=self._public_osfstorage_file_queryset().count(), + 
published_preprint_count=self._published_preprint_queryset().count(), + storage_byte_count=self._storage_byte_count(), + ) + + def _node_queryset(self): + _institution_node_qs = self.institution.nodes.filter( + created__lt=self.before_datetime, + is_deleted=False, + ).exclude(spam_status=SpamStatus.SPAM) + return osfdb.Node.objects.get_nodes_for_user( + user=self.user, + base_queryset=_institution_node_qs, + ) + + def _public_project_queryset(self): + return self._node_queryset().filter( + type='osf.node', # `type` field from TypedModel + is_public=True, + root_id=F('pk'), # only root nodes + ) + + def _private_project_queryset(self): + return self._node_queryset().filter( + type='osf.node', # `type` field from TypedModel + is_public=False, + root_id=F('pk'), # only root nodes + ) + + def _public_registration_queryset(self): + return self._node_queryset().filter( + type='osf.registration', # `type` field from TypedModel + is_public=True, + root_id=F('pk'), # only root nodes + ) + + def _embargoed_registration_queryset(self): + return self._node_queryset().filter( + type='osf.registration', # `type` field from TypedModel + is_public=False, + root_id=F('pk'), # only root nodes + embargo__end_date__gte=self.before_datetime, + ) + + def _published_preprint_queryset(self): + return ( + osfdb.Preprint.objects.can_view() # published/publicly-viewable + .filter( + affiliated_institutions=self.institution, + _contributors=self.user, + date_published__lt=self.before_datetime, + ) + .exclude(spam_status=SpamStatus.SPAM) + ) + + def _public_osfstorage_file_queryset(self): + _target_node_q = Q( + # any public project, registration, project component, or registration component + target_object_id__in=self._node_queryset().filter(is_public=True).values('pk'), + target_content_type=ContentType.objects.get_for_model(osfdb.AbstractNode), + ) + _target_preprint_q = Q( + target_object_id__in=self._published_preprint_queryset().values('pk'), + target_content_type=ContentType.objects.get_for_model(osfdb.Preprint), + ) + return ( + OsfStorageFile.objects + .filter( + created__lt=self.before_datetime, + deleted__isnull=True, + purged__isnull=True, + ) + .filter(_target_node_q | _target_preprint_q) + ) + + def _storage_byte_count(self): + return osfdb.FileVersion.objects.filter( + size__gt=0, + created__lt=self.before_datetime, + purged__isnull=True, + basefilenode__in=self._public_osfstorage_file_queryset(), + ).aggregate(storage_bytes=Sum('size', default=0))['storage_bytes'] + + def _get_last_active(self): + end_date = self.yearmonth.month_end() + + node_logs = self.user.logs.filter(created__lt=end_date).order_by('-created') + preprint_logs = self.user.preprint_logs.filter(created__lt=end_date).order_by('-created') + + dates = filter(bool, [ + node_logs.values_list('created', flat=True).first(), + preprint_logs.values_list('created', flat=True).first(), + ]) + + latest_activity_date = max(dates, default=None) + + if latest_activity_date: + return YearMonth.from_date(latest_activity_date) + else: + return None diff --git a/osf/metrics/reporters/public_item_usage.py b/osf/metrics/reporters/public_item_usage.py new file mode 100644 index 00000000000..ecc34a5d9c7 --- /dev/null +++ b/osf/metrics/reporters/public_item_usage.py @@ -0,0 +1,286 @@ +from __future__ import annotations +import typing + +import celery +if typing.TYPE_CHECKING: + import elasticsearch_dsl as edsl + +from osf.metadata.osf_gathering import OsfmapPartition +from osf.metrics.counted_usage import ( + CountedAuthUsage, + get_item_type, + 
get_provider_id, +) +from osf.metrics.reports import PublicItemUsageReport +from osf import models as osfdb +from website import settings as website_settings +from ._base import MonthlyReporter + + +_CHUNK_SIZE = 500 + +_MAX_CARDINALITY_PRECISION = 40000 # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html#_precision_control + + +class _SkipItem(Exception): + pass + + +class PublicItemUsageReporter(MonthlyReporter): + '''build a PublicItemUsageReport for each public item + + includes projects, project components, registrations, registration components, and preprints + ''' + + def report(self): + # use two composite aggregations in parallel to page thru every + # public item viewed or downloaded this month, counting: + # - views and downloads for each item (using `CountedAuthUsage.item_guid`) + # - views for each item's components and files (using `CountedAuthUsage.surrounding_guids`) + for _exact_bucket, _contained_views_bucket in _zip_composite_aggs( + self._exact_item_search(), 'agg_osfid', + self._contained_item_views_search(), 'agg_surrounding_osfid', + ): + try: + _report = self._report_from_buckets(_exact_bucket, _contained_views_bucket) + yield _report + except _SkipItem: + pass + + def followup_task(self): + return task__update_monthly_metadatas.signature( + args=[str(self.yearmonth)], + countdown=30, # give index time to settle + ) + + def _report_from_buckets(self, exact_bucket, contained_views_bucket): + # either exact_bucket or contained_views_bucket may be None, but not both + assert (exact_bucket is not None) or (contained_views_bucket is not None) + _report = ( + self._init_report_from_exact_bucket(exact_bucket) + if exact_bucket is not None + else self._init_report_from_osfid(contained_views_bucket.key.osfid) + ) + # view counts include views on contained items (components, files) + _report.view_count, _report.view_session_count = self._get_view_counts(_report.item_osfid) + return _report + + def _init_report_from_exact_bucket(self, exact_bucket) -> PublicItemUsageReport: + # in the (should-be common) case of an item that has been directly viewed in + # this month, the stored metrics already have the data required + _report = PublicItemUsageReport( + item_osfid=exact_bucket.key.osfid, + item_type=_agg_keys(exact_bucket.agg_item_type), + provider_id=_agg_keys(exact_bucket.agg_provider_id), + platform_iri=_agg_keys(exact_bucket.agg_platform_iri), + # default counts to zero, will be updated if non-zero + view_count=0, + view_session_count=0, + download_count=0, + download_session_count=0, + ) + for _actionbucket in exact_bucket.agg_action: + # note: view counts computed separately to avoid double-counting + if _actionbucket.key == CountedAuthUsage.ActionLabel.DOWNLOAD.value: + _report.download_count = _actionbucket.doc_count + _report.download_session_count = _actionbucket.agg_session_count.value + return _report + + def _init_report_from_osfid(self, osfid: str) -> PublicItemUsageReport: + # for the (should-be unusual) case where the components/files contained by + # an item have views in this month, but the item itself does not -- + # load necessary info via django models, instead + _osfguid = osfdb.Guid.load(osfid) + if _osfguid is None or not getattr(_osfguid.referent, 'is_public', False): + raise _SkipItem + return PublicItemUsageReport( + item_osfid=osfid, + item_type=[get_item_type(_osfguid.referent)], + provider_id=[get_provider_id(_osfguid.referent)], + platform_iri=[website_settings.DOMAIN], + # 
default counts to zero, will be updated if non-zero + view_count=0, + view_session_count=0, + download_count=0, + download_session_count=0, + ) + + def _base_usage_search(self): + return ( + CountedAuthUsage.search() + .filter('term', item_public=True) + .filter('range', timestamp={ + 'gte': self.yearmonth.month_start(), + 'lt': self.yearmonth.month_end(), + }) + .update_from_dict({'size': 0}) # only aggregations, no hits + ) + + def _exact_item_search(self) -> edsl.Search: + '''aggregate views and downloads on each osfid (not including components/files)''' + _search = self._base_usage_search() + # the main agg: use a composite aggregation to page thru *every* item + _agg_osfid = _search.aggs.bucket( + 'agg_osfid', + 'composite', + sources=[{'osfid': {'terms': {'field': 'item_guid'}}}], + size=_CHUNK_SIZE, + ) + # nested agg: for each item, get platform_iri values + _agg_osfid.bucket('agg_platform_iri', 'terms', field='platform_iri') + # nested agg: for each item, get provider_id values + _agg_osfid.bucket('agg_provider_id', 'terms', field='provider_id') + # nested agg: for each item, get item_type values + _agg_osfid.bucket('agg_item_type', 'terms', field='item_type') + # nested agg: for each item, get download count + _agg_action = _agg_osfid.bucket( + 'agg_action', + 'terms', + field='action_labels', + include=[ + CountedAuthUsage.ActionLabel.DOWNLOAD.value, + ], + ) + # nested nested agg: get download session count + _agg_action.metric( + 'agg_session_count', + 'cardinality', + field='session_id', + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + return _search + + def _contained_item_views_search(self) -> edsl.Search: + '''iterate osfids with views on contained components and files''' + _search = ( + self._base_usage_search() + .filter('term', action_labels=CountedAuthUsage.ActionLabel.VIEW.value) + ) + # the main agg: use a composite aggregation to page thru *every* item + _search.aggs.bucket( + 'agg_surrounding_osfid', + 'composite', + sources=[{'osfid': {'terms': {'field': 'surrounding_guids'}}}], + size=_CHUNK_SIZE, + ) + return _search + + def _get_view_counts(self, osfid: str) -> tuple[int, int]: + '''compute view_session_count separately to avoid double-counting + + (the same session may be represented in both the composite agg on `item_guid` + and that on `surrounding_guids`) + ''' + _search = ( + self._base_usage_search() + .query( + 'bool', + filter=[ + {'term': {'action_labels': CountedAuthUsage.ActionLabel.VIEW.value}}, + ], + should=[ + {'term': {'item_guid': osfid}}, + {'term': {'surrounding_guids': osfid}}, + ], + minimum_should_match=1, + ) + ) + _search.aggs.metric( + 'agg_session_count', + 'cardinality', + field='session_id', + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _view_count = _response.hits.total + _view_session_count = _response.aggregations.agg_session_count.value + return (_view_count, _view_session_count) + + +### +# followup celery task +@celery.shared_task +def task__update_monthly_metadatas(yearmonth: str): + from api.share.utils import task__update_share + _report_search = ( + PublicItemUsageReport.search() + .filter('term', report_yearmonth=yearmonth) + .source(['item_osfid']) # return only the 'item_osfid' field + ) + for _hit in _report_search.scan(): + task__update_share.delay( + _hit.item_osfid, + is_backfill=True, + osfmap_partition_name=OsfmapPartition.MONTHLY_SUPPLEMENT.name, + ) + + +### +# local helpers + +def _agg_keys(bucket_agg_result) -> list: + return [_bucket.key for _bucket in 
bucket_agg_result] + + +def _zip_composite_aggs( + search_a: edsl.Search, + composite_agg_name_a: str, + search_b: edsl.Search, + composite_agg_name_b: str, +): + '''iterate thru two composite aggregations, yielding pairs of buckets matched by key + + the composite aggregations must have matching names in `sources` so their keys can be compared + ''' + _iter_a = _iter_composite_buckets(search_a, composite_agg_name_a) + _iter_b = _iter_composite_buckets(search_b, composite_agg_name_b) + _next_a = next(_iter_a, None) + _next_b = next(_iter_b, None) + while True: + if _next_a is None and _next_b is None: + return # both done + elif _next_a is None or _next_b is None: + # one is done but not the other -- no matching needed + yield (_next_a, _next_b) + _next_a = next(_iter_a, None) + _next_b = next(_iter_b, None) + elif _next_a.key == _next_b.key: + # match -- yield and increment both + yield (_next_a, _next_b) + _next_a = next(_iter_a, None) + _next_b = next(_iter_b, None) + elif _orderable_key(_next_a) < _orderable_key(_next_b): + # mismatch -- yield and increment a (but not b) + yield (_next_a, None) + _next_a = next(_iter_a, None) + else: + # mismatch -- yield and increment b (but not a) + yield (None, _next_b) + _next_b = next(_iter_b, None) + + +def _iter_composite_buckets(search: edsl.Search, composite_agg_name: str): + '''iterate thru *all* buckets of a composite aggregation, requesting new pages as needed + + assumes the given search has a composite aggregation of the given name + + updates the search in-place for subsequent pages + ''' + while True: + _page_response = search.execute(ignore_cache=True) # reused search object has the previous page cached + try: + _agg_result = _page_response.aggregations[composite_agg_name] + except KeyError: + return # no data; all done + yield from _agg_result.buckets + # update the search for the next page + try: + _next_after = _agg_result.after_key + except AttributeError: + return # all done + else: + search.aggs[composite_agg_name].after = _next_after + + +def _orderable_key(composite_bucket) -> list: + return sorted(composite_bucket.key.to_dict().items()) diff --git a/osf/metrics/reporters/spam_count.py b/osf/metrics/reporters/spam_count.py index 54feae8bee5..94290f96203 100644 --- a/osf/metrics/reporters/spam_count.py +++ b/osf/metrics/reporters/spam_count.py @@ -8,12 +8,11 @@ class SpamCountReporter(MonthlyReporter): - def report(self, report_yearmonth): - target_month = report_yearmonth.target_month() - next_month = report_yearmonth.next_month() + def report(self): + target_month = self.yearmonth.month_start() + next_month = self.yearmonth.month_end() report = SpamSummaryReport( - report_yearmonth=str(report_yearmonth), # Node Log entries node_confirmed_spam=NodeLog.objects.filter( action=NodeLog.CONFIRM_SPAM, diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py index 609e79fc324..43bdd0fabd1 100644 --- a/osf/metrics/reports.py +++ b/osf/metrics/reports.py @@ -1,3 +1,5 @@ +from __future__ import annotations +from collections import abc import datetime from django.dispatch import receiver @@ -20,10 +22,14 @@ class DailyReport(metrics.Metric): There's something we'd like to know about every so often, so let's regularly run a report and stash the results here. """ - DAILY_UNIQUE_FIELD = None # set in subclasses that expect multiple reports per day + UNIQUE_TOGETHER_FIELDS: tuple[str, ...] 
= ('report_date',) # override in subclasses for multiple reports per day report_date = metrics.Date(format='strict_date', required=True) + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + assert 'report_date' in cls.UNIQUE_TOGETHER_FIELDS, f'DailyReport subclasses must have "report_date" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' + class Meta: abstract = True dynamic = metrics.MetaField('strict') @@ -32,7 +38,7 @@ class Meta: class YearmonthField(metrics.Date): def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs, format='strict_year_month', required=True) + super().__init__(*args, **kwargs, format='strict_year_month') def deserialize(self, data): if isinstance(data, YearMonth): @@ -41,8 +47,14 @@ def deserialize(self, data): return YearMonth.from_str(data) elif isinstance(data, (datetime.datetime, datetime.date)): return YearMonth.from_date(data) + elif isinstance(data, int): + # elasticsearch stores dates in milliseconds since the unix epoch + _as_datetime = datetime.datetime.fromtimestamp(data // 1000) + return YearMonth.from_date(_as_datetime) + elif data is None: + return None else: - raise ValueError('unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth') + raise ValueError(f'unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth') def serialize(self, data): if isinstance(data, str): @@ -51,6 +63,8 @@ def serialize(self, data): return str(data) elif isinstance(data, (datetime.datetime, datetime.date)): return str(YearMonth.from_date(data)) + elif data is None: + return None else: raise ValueError(f'unsure how to serialize "{data}" (of type {type(data)}) as YYYY-MM') @@ -58,34 +72,57 @@ def serialize(self, data): class MonthlyReport(metrics.Metric): """MonthlyReport (abstract base for report-based metrics that run monthly) """ + UNIQUE_TOGETHER_FIELDS: tuple[str, ...] = ('report_yearmonth',) # override in subclasses for multiple reports per month - report_yearmonth = YearmonthField() + report_yearmonth = YearmonthField(required=True) class Meta: abstract = True dynamic = metrics.MetaField('strict') source = metrics.MetaField(enabled=True) + @classmethod + def most_recent_yearmonth(cls, base_search=None) -> YearMonth | None: + _search = base_search or cls.search() + _search = _search.update_from_dict({'size': 0}) # omit hits + _search.aggs.bucket( + 'agg_most_recent_yearmonth', + 'terms', + field='report_yearmonth', + order={'_key': 'desc'}, + size=1, + ) + _response = _search.execute() + if not _response.aggregations: + return None + (_bucket,) = _response.aggregations.agg_most_recent_yearmonth.buckets + return _bucket.key + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + assert 'report_yearmonth' in cls.UNIQUE_TOGETHER_FIELDS, f'MonthlyReport subclasses must have "report_yearmonth" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' + @receiver(metrics_pre_save) def set_report_id(sender, instance, **kwargs): - # Set the document id to a hash of "unique together" - # values (just `report_date` by default) to get - # "ON CONFLICT UPDATE" behavior -- if the document - # already exists, it will be updated rather than duplicated. - # Cannot detect/avoid conflicts this way, but that's ok. 
- - if issubclass(sender, DailyReport): - duf_name = instance.DAILY_UNIQUE_FIELD - if duf_name is None: - instance.meta.id = stable_key(instance.report_date) - else: - duf_value = getattr(instance, duf_name) - if not duf_value or not isinstance(duf_value, str): - raise ReportInvalid(f'{sender.__name__}.{duf_name} MUST have a non-empty string value (got {duf_value})') - instance.meta.id = stable_key(instance.report_date, duf_value) - elif issubclass(sender, MonthlyReport): - instance.meta.id = stable_key(instance.report_yearmonth) + try: + _unique_together_fields = instance.UNIQUE_TOGETHER_FIELDS + except AttributeError: + pass + else: + # Set the document id to a hash of "unique together" fields + # for "ON CONFLICT UPDATE" behavior -- if the document + # already exists, it will be updated rather than duplicated. + # Cannot detect/avoid conflicts this way, but that's ok. + _key_values = [] + for _field_name in _unique_together_fields: + _field_value = getattr(instance, _field_name) + if not _field_value or ( + isinstance(_field_value, abc.Iterable) and not isinstance(_field_value, str) + ): + raise ReportInvalid(f'because "{_field_name}" is in {sender.__name__}.UNIQUE_TOGETHER_FIELDS, {sender.__name__}.{_field_name} MUST have a non-empty scalar value (got {_field_value} of type {type(_field_value)})') + _key_values.append(_field_value) + instance.meta.id = stable_key(*_key_values) #### BEGIN reusable inner objects ##### @@ -157,7 +194,7 @@ class DownloadCountReport(DailyReport): class InstitutionSummaryReport(DailyReport): - DAILY_UNIQUE_FIELD = 'institution_id' + UNIQUE_TOGETHER_FIELDS = ('report_date', 'institution_id',) institution_id = metrics.Keyword() institution_name = metrics.Keyword() @@ -169,7 +206,7 @@ class InstitutionSummaryReport(DailyReport): class NewUserDomainReport(DailyReport): - DAILY_UNIQUE_FIELD = 'domain_name' + UNIQUE_TOGETHER_FIELDS = ('report_date', 'domain_name',) domain_name = metrics.Keyword() new_user_count = metrics.Integer() @@ -187,7 +224,7 @@ class OsfstorageFileCountReport(DailyReport): class PreprintSummaryReport(DailyReport): - DAILY_UNIQUE_FIELD = 'provider_key' + UNIQUE_TOGETHER_FIELDS = ('report_date', 'provider_key',) provider_key = metrics.Keyword() preprint_count = metrics.Integer() @@ -214,3 +251,75 @@ class SpamSummaryReport(MonthlyReport): preprint_flagged = metrics.Integer() user_marked_as_spam = metrics.Integer() user_marked_as_ham = metrics.Integer() + + +class InstitutionalUserReport(MonthlyReport): + UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', 'user_id',) + institution_id = metrics.Keyword() + # user info: + user_id = metrics.Keyword() + user_name = metrics.Keyword() + department_name = metrics.Keyword() + month_last_login = YearmonthField() + month_last_active = YearmonthField() + account_creation_date = YearmonthField() + orcid_id = metrics.Keyword() + # counts: + public_project_count = metrics.Integer() + private_project_count = metrics.Integer() + public_registration_count = metrics.Integer() + embargoed_registration_count = metrics.Integer() + published_preprint_count = metrics.Integer() + public_file_count = metrics.Long() + storage_byte_count = metrics.Long() + + +class InstitutionMonthlySummaryReport(MonthlyReport): + UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', ) + institution_id = metrics.Keyword() + user_count = metrics.Integer() + public_project_count = metrics.Integer() + private_project_count = metrics.Integer() + public_registration_count = metrics.Integer() + 
embargoed_registration_count = metrics.Integer()
+    published_preprint_count = metrics.Integer()
+    storage_byte_count = metrics.Long()
+    public_file_count = metrics.Long()
+    monthly_logged_in_user_count = metrics.Long()
+    monthly_active_user_count = metrics.Long()
+
+
+class PublicItemUsageReport(MonthlyReport):
+    UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'item_osfid')
+
+    # where noted, fields are meant to correspond to defined terms from COUNTER
+    # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html
+    # https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html
+    item_osfid = metrics.Keyword()  # counter:Item (or Dataset)
+    item_type = metrics.Keyword(multi=True)  # counter:Data-Type
+    provider_id = metrics.Keyword(multi=True)  # counter:Database(?)
+    platform_iri = metrics.Keyword(multi=True)  # counter:Platform
+
+    # view counts include views on components or files contained by this item
+    view_count = metrics.Long()  # counter:Total Investigations
+    view_session_count = metrics.Long()  # counter:Unique Investigations
+
+    # download counts of this item only (not including contained components or files)
+    download_count = metrics.Long()  # counter:Total Requests
+    download_session_count = metrics.Long()  # counter:Unique Requests
+
+    @classmethod
+    def for_last_month(cls, item_osfid: str) -> PublicItemUsageReport | None:
+        _search = (
+            PublicItemUsageReport.search()
+            .filter('term', item_osfid=item_osfid)
+            # only last month's report
+            .filter('range', report_yearmonth={
+                'gte': 'now-2M/M',
+                'lt': 'now/M',
+            })
+            .sort('-report_yearmonth')
+            [:1]
+        )
+        _response = _search.execute()
+        return _response[0] if _response else None
diff --git a/osf/metrics/utils.py b/osf/metrics/utils.py
index 5ea397fef39..910b1f3104c 100644
--- a/osf/metrics/utils.py
+++ b/osf/metrics/utils.py
@@ -1,9 +1,10 @@
+from __future__ import annotations
+import calendar
+import dataclasses
 import re
 import datetime
-import typing
 from hashlib import sha256
-
-import pytz
+from typing import ClassVar
 
 
 def stable_key(*key_parts):
@@ -20,19 +21,22 @@ def stable_key(*key_parts):
     return sha256(bytes(plain_key, encoding='utf')).hexdigest()
 
 
-class YearMonth(typing.NamedTuple):
+@dataclasses.dataclass(frozen=True)
+class YearMonth:
+    """YearMonth: represents a specific month in a specific year"""
     year: int
    month: int
 
-    YEARMONTH_RE = re.compile(r'(?P<year>\d{4})-(?P<month>\d{2})')
+    YEARMONTH_RE: ClassVar[re.Pattern] = re.compile(r'(?P<year>\d{4})-(?P<month>\d{2})')
 
     @classmethod
-    def from_date(cls, date):
-        assert isinstance(date, (datetime.datetime, datetime.date))
+    def from_date(cls, date: datetime.date) -> YearMonth:
+        """construct a YearMonth from a `datetime.date` (or `datetime.datetime`)"""
         return cls(date.year, date.month)
 
     @classmethod
-    def from_str(cls, input_str):
+    def from_str(cls, input_str: str) -> YearMonth:
+        """construct a YearMonth from a string in "YYYY-MM" format"""
         match = cls.YEARMONTH_RE.fullmatch(input_str)
         if match:
             return cls(
@@ -43,12 +47,21 @@ def from_str(cls, input_str):
             raise ValueError(f'expected YYYY-MM format, got "{input_str}"')
 
     def __str__(self):
+        """convert to string of "YYYY-MM" format"""
         return f'{self.year}-{self.month:0>2}'
 
-    def target_month(self):
-        return datetime.datetime(self.year, self.month, 1, tzinfo=pytz.utc)
+    def next(self) -> YearMonth:
+        """get a new YearMonth for the month after this one"""
+        return (
+            YearMonth(self.year + 1, int(calendar.JANUARY))
+            if self.month == calendar.DECEMBER
+            else YearMonth(self.year, self.month + 1)
+        )
+
+    def month_start(self) 
-> datetime.datetime: + """get a datetime (in UTC timezone) when this YearMonth starts""" + return datetime.datetime(self.year, self.month, 1, tzinfo=datetime.UTC) - def next_month(self): - if self.month == 12: - return datetime.datetime(self.year + 1, 1, 1, tzinfo=pytz.utc) - return datetime.datetime(self.year, self.month + 1, 1, tzinfo=pytz.utc) + def month_end(self) -> datetime.datetime: + """get a datetime (in UTC timezone) when this YearMonth ends (the start of next month)""" + return self.next().month_start() diff --git a/osf/migrations/0024_institution_link_to_external_reports_archive.py b/osf/migrations/0024_institution_link_to_external_reports_archive.py new file mode 100644 index 00000000000..8e1a47fcffb --- /dev/null +++ b/osf/migrations/0024_institution_link_to_external_reports_archive.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.15 on 2024-08-16 15:21 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('osf', '0023_preprint_affiliated_institutions'), + ] + + operations = [ + migrations.AddField( + model_name='institution', + name='link_to_external_reports_archive', + field=models.URLField(blank=True, default='', help_text='Full URL where institutional admins can access archived metrics reports.', max_length=2048), + ), + ] diff --git a/osf/models/institution.py b/osf/models/institution.py index 0c3a9780ac2..d0ce38eacf4 100644 --- a/osf/models/institution.py +++ b/osf/models/institution.py @@ -118,6 +118,12 @@ class Institution(DirtyFieldsMixin, Loggable, ObjectIDMixin, BaseModel, Guardian blank=True, help_text='The full domain this institutions that will appear in DOI metadata.' ) + link_to_external_reports_archive = models.URLField( + max_length=2048, + blank=True, + default='', + help_text='Full URL where institutional admins can access archived metrics reports.', + ) class Meta: # custom permissions for use in the OSF Admin App diff --git a/osf/models/node.py b/osf/models/node.py index 9e342308f44..62925966e2e 100644 --- a/osf/models/node.py +++ b/osf/models/node.py @@ -80,7 +80,6 @@ from api.caching.tasks import update_storage_usage from api.caching import settings as cache_settings from api.caching.utils import storage_usage_cache -from api.share.utils import update_share logger = logging.getLogger(__name__) @@ -711,6 +710,7 @@ def should_request_identifiers(self): @classmethod def bulk_update_search(cls, nodes, index=None): + from api.share.utils import update_share for _node in nodes: update_share(_node) from website import search @@ -722,6 +722,7 @@ def bulk_update_search(cls, nodes, index=None): log_exception(e) def update_search(self): + from api.share.utils import update_share update_share(self) from website import search try: diff --git a/osf/models/user.py b/osf/models/user.py index 22bbfc5baf9..bb0f97f91a9 100644 --- a/osf/models/user.py +++ b/osf/models/user.py @@ -34,7 +34,6 @@ MergeConflictError) from framework.exceptions import PermissionsError from framework.sessions.utils import remove_sessions_for_user -from api.share.utils import update_share from osf.utils.requests import get_current_request from osf.exceptions import reraise_django_validation_errors, UserStateError from .base import BaseModel, GuidMixin, GuidMixinQuerySet @@ -1469,6 +1468,7 @@ def is_assumed_ham(self): return user_has_trusted_email def update_search(self): + from api.share.utils import update_share update_share(self) from website.search.search import update_user update_user(self) diff --git a/osf_tests/factories.py 
b/osf_tests/factories.py index 860dd967e5e..0bd1664977d 100644 --- a/osf_tests/factories.py +++ b/osf_tests/factories.py @@ -188,7 +188,7 @@ class BaseNodeFactory(DjangoModelFactory): title = factory.Faker('catch_phrase') description = factory.Faker('sentence') created = factory.LazyFunction(timezone.now) - creator = factory.SubFactory(AuthUserFactory) + creator = factory.LazyAttribute(lambda o: AuthUserFactory()) class Meta: model = models.Node diff --git a/osf_tests/metadata/_utils.py b/osf_tests/metadata/_utils.py index df5ed2b7ac7..fb23bdb16c5 100644 --- a/osf_tests/metadata/_utils.py +++ b/osf_tests/metadata/_utils.py @@ -3,23 +3,23 @@ from osf.metadata import gather from osf.metadata.rdfutils import contextualized_graph -def assert_triples(actual_triples, expected_triples): +def assert_triples(actual_triples, expected_triples, label=''): _expected_graph, _expected_focuses = _get_graph_and_focuses(expected_triples) _actual_graph, _actual_focuses = _get_graph_and_focuses(actual_triples) - assert_graphs_equal(_actual_graph, _expected_graph) + assert_graphs_equal(_actual_graph, _expected_graph, label=label) assert _expected_focuses == _actual_focuses -def assert_graphs_equal(actual_rdflib_graph, expected_rdflib_graph): +def assert_graphs_equal(actual_rdflib_graph, expected_rdflib_graph, label=''): (_overlap, _expected_but_absent, _unexpected_but_present) = rdflib.compare.graph_diff( expected_rdflib_graph, actual_rdflib_graph, ) assert not _expected_but_absent and not _unexpected_but_present, '\n\t'.join(( - 'unequal triple-sets!', + (f'unequal triplesets for "{label}"!' if label else 'unequal triple-sets!'), f'overlap size: {len(_overlap)}', - f'expected (but absent): {_friendly_graph(_expected_but_absent)}', - f'unexpected (but present): {_friendly_graph(_unexpected_but_present)}', + f'expected (but absent): {_indented_graph(_expected_but_absent)}', + f'unexpected (but present): {_indented_graph(_unexpected_but_present)}', )) @@ -35,10 +35,9 @@ def _get_graph_and_focuses(triples): return _graph, _focuses -def _friendly_graph(rdfgraph) -> str: +def _indented_graph(rdfgraph) -> str: _graph_to_print = contextualized_graph(rdfgraph) _delim = '\n\t\t' return _delim + _delim.join( - ' '.join(_term.n3() for _term in triple) - for triple in _graph_to_print + _graph_to_print.serialize(format='turtle').strip().split('\n') ) diff --git a/osf_tests/metadata/expected_metadata_files/file_basic.turtle b/osf_tests/metadata/expected_metadata_files/file_basic.turtle index 14a78c46c88..3f430b22521 100644 --- a/osf_tests/metadata/expected_metadata_files/file_basic.turtle +++ b/osf_tests/metadata/expected_metadata_files/file_basic.turtle @@ -3,6 +3,7 @@ @prefix foaf: . @prefix osf: . @prefix owl: . +@prefix skos: . a osf:File ; dcat:accessService ; @@ -33,7 +34,8 @@ dcterms:extent "0.000007 MB" ; dcterms:format "img/png" ; dcterms:modified "2123-05-04" ; - dcterms:requires ; + dcterms:requires ; + osf:storageRegion ; osf:versionNumber "1" . a dcterms:Agent, @@ -45,3 +47,5 @@ foaf:Organization ; dcterms:identifier "http://localhost:5000" ; foaf:name "OSF" . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/file_full.turtle b/osf_tests/metadata/expected_metadata_files/file_full.turtle index 37dd3c537f0..175ccfb042f 100644 --- a/osf_tests/metadata/expected_metadata_files/file_full.turtle +++ b/osf_tests/metadata/expected_metadata_files/file_full.turtle @@ -4,6 +4,7 @@ @prefix osf: . @prefix owl: . @prefix rdfs: . +@prefix skos: . 
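+# skos prefix is newly needed here for the storage region's prefLabel (added at the end of this file)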
a osf:File ; dcat:accessService ; @@ -39,7 +40,8 @@ dcterms:extent "0.000007 MB" ; dcterms:format "img/png" ; dcterms:modified "2123-05-04" ; - dcterms:requires ; + dcterms:requires ; + osf:storageRegion ; osf:versionNumber "1" . a osf:FundingAward ; @@ -76,3 +78,5 @@ foaf:name "OSF" . rdfs:label "Dataset"@en . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/file_monthly_supplement.turtle b/osf_tests/metadata/expected_metadata_files/file_monthly_supplement.turtle new file mode 100644 index 00000000000..845bd149f37 --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/file_monthly_supplement.turtle @@ -0,0 +1,13 @@ +@prefix dcat: . +@prefix dcterms: . +@prefix foaf: . +@prefix osf: . +@prefix xsd: . + + osf:usage [ dcterms:temporal "2123-05"^^xsd:gYearMonth ; + dcat:accessService ; + foaf:primaryTopic ; + osf:downloadCount 3 ; + osf:downloadSessionCount 2 ; + osf:viewCount 7 ; + osf:viewSessionCount 5 ] . diff --git a/osf_tests/metadata/expected_metadata_files/file_supplement.turtle b/osf_tests/metadata/expected_metadata_files/file_supplement.turtle new file mode 100644 index 00000000000..662c197699d --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/file_supplement.turtle @@ -0,0 +1 @@ +# correctly empty (for now) diff --git a/osf_tests/metadata/expected_metadata_files/preprint_basic.turtle b/osf_tests/metadata/expected_metadata_files/preprint_basic.turtle index f6db59e6e24..ee7e866827b 100644 --- a/osf_tests/metadata/expected_metadata_files/preprint_basic.turtle +++ b/osf_tests/metadata/expected_metadata_files/preprint_basic.turtle @@ -3,6 +3,7 @@ @prefix foaf: . @prefix osf: . @prefix owl: . +@prefix prov: . @prefix rdfs: . @prefix skos: . @@ -25,7 +26,9 @@ dcat:accessService ; osf:hostingInstitution ; osf:isSupplementedBy ; - osf:statedConflictOfInterest osf:no-conflict-of-interest . + osf:statedConflictOfInterest osf:no-conflict-of-interest ; + prov:qualifiedAttribution [ dcat:hadRole osf:admin-contributor ; + prov:agent ] . a dcterms:Agent, foaf:Organization ; diff --git a/osf_tests/metadata/expected_metadata_files/preprint_full.turtle b/osf_tests/metadata/expected_metadata_files/preprint_full.turtle index 93c69fa4e8c..cdf665fd5fe 100644 --- a/osf_tests/metadata/expected_metadata_files/preprint_full.turtle +++ b/osf_tests/metadata/expected_metadata_files/preprint_full.turtle @@ -3,6 +3,7 @@ @prefix foaf: . @prefix osf: . @prefix owl: . +@prefix prov: . @prefix rdfs: . @prefix skos: . @@ -25,7 +26,9 @@ dcat:accessService ; osf:hostingInstitution ; osf:isSupplementedBy ; - osf:statedConflictOfInterest osf:no-conflict-of-interest . + osf:statedConflictOfInterest osf:no-conflict-of-interest ; + prov:qualifiedAttribution [ dcat:hadRole osf:admin-contributor ; + prov:agent ] . a dcterms:Agent, foaf:Organization ; diff --git a/osf_tests/metadata/expected_metadata_files/preprint_monthly_supplement.turtle b/osf_tests/metadata/expected_metadata_files/preprint_monthly_supplement.turtle new file mode 100644 index 00000000000..8e6d6fb9331 --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/preprint_monthly_supplement.turtle @@ -0,0 +1,13 @@ +@prefix dcat: . +@prefix dcterms: . +@prefix foaf: . +@prefix osf: . +@prefix xsd: . + + osf:usage [ dcterms:temporal "2123-05"^^xsd:gYearMonth ; + dcat:accessService ; + foaf:primaryTopic ; + osf:downloadCount 3 ; + osf:downloadSessionCount 2 ; + osf:viewCount 7 ; + osf:viewSessionCount 5 ] . 
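A quick sketch of the YearMonth arithmetic that the monthly-supplement fixtures above rely on; this is a hypothetical usage example, assuming the patched osf/metrics/utils.py is importable (the '2123-05' literal mirrors the report_yearmonth in the fixtures):

    import datetime
    from osf.metrics.utils import YearMonth

    _ym = YearMonth.from_str('2123-05')
    assert str(_ym.next()) == '2123-06'
    # month_end() is the start of the next month, so month ranges are half-open
    assert _ym.month_end() == _ym.next().month_start()
    assert _ym.month_end() == datetime.datetime(2123, 6, 1, tzinfo=datetime.UTC)
    # OsfmapPartition.MONTHLY_SUPPLEMENT expires each report two months after its
    # reporting period ends (see get_expiration_date above), giving the *next*
    # monthly report up to a month to compute, per COUNTER
    assert _ym.next().next().month_end().date() == datetime.date(2123, 8, 1)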
diff --git a/osf_tests/metadata/expected_metadata_files/preprint_supplement.turtle b/osf_tests/metadata/expected_metadata_files/preprint_supplement.turtle new file mode 100644 index 00000000000..9ff0732a509 --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/preprint_supplement.turtle @@ -0,0 +1,7 @@ +@prefix osf: . +@prefix skos: . + + osf:storageByteCount 1337 ; + osf:storageRegion . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/project_basic.turtle b/osf_tests/metadata/expected_metadata_files/project_basic.turtle index c3846782273..aa8244da1fd 100644 --- a/osf_tests/metadata/expected_metadata_files/project_basic.turtle +++ b/osf_tests/metadata/expected_metadata_files/project_basic.turtle @@ -3,7 +3,9 @@ @prefix foaf: . @prefix osf: . @prefix owl: . +@prefix prov: . @prefix rdfs: . +@prefix skos: . a osf:Project ; dcterms:created "2123-05-04" ; @@ -23,7 +25,9 @@ dcat:accessService ; osf:contains ; osf:hostingInstitution ; - osf:supplements . + osf:supplements ; + prov:qualifiedAttribution [ dcat:hadRole osf:admin-contributor ; + prov:agent ] . a osf:Preprint ; dcterms:created "2123-05-04" ; @@ -53,8 +57,19 @@ dcterms:modified "2123-05-04" ; osf:fileName "my-file.blarg" ; osf:filePath "/my-file.blarg" ; + osf:hasFileVersion ; osf:isContainedBy . + a osf:FileVersion ; + dcterms:created "2123-05-04" ; + dcterms:creator ; + dcterms:extent "0.000007 MB" ; + dcterms:format "img/png" ; + dcterms:modified "2123-05-04" ; + dcterms:requires ; + osf:storageRegion ; + osf:versionNumber "1" . + a dcterms:Agent, foaf:Organization ; dcterms:identifier "https://cos.io/", @@ -85,3 +100,5 @@ rdfs:label "Preprint"@en . rdfs:label "StudyRegistration"@en . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/project_full.turtle b/osf_tests/metadata/expected_metadata_files/project_full.turtle index 6a84d141440..63946b2f80b 100644 --- a/osf_tests/metadata/expected_metadata_files/project_full.turtle +++ b/osf_tests/metadata/expected_metadata_files/project_full.turtle @@ -3,7 +3,9 @@ @prefix foaf: . @prefix osf: . @prefix owl: . +@prefix prov: . @prefix rdfs: . +@prefix skos: . a osf:Project ; dcterms:created "2123-05-04" ; @@ -29,7 +31,9 @@ osf:hasFunding , ; osf:hostingInstitution ; - osf:supplements . + osf:supplements ; + prov:qualifiedAttribution [ dcat:hadRole osf:admin-contributor ; + prov:agent ] . a osf:Preprint ; dcterms:created "2123-05-04" ; @@ -59,8 +63,19 @@ dcterms:modified "2123-05-04" ; osf:fileName "my-file.blarg" ; osf:filePath "/my-file.blarg" ; + osf:hasFileVersion ; osf:isContainedBy . + a osf:FileVersion ; + dcterms:created "2123-05-04" ; + dcterms:creator ; + dcterms:extent "0.000007 MB" ; + dcterms:format "img/png" ; + dcterms:modified "2123-05-04" ; + dcterms:requires ; + osf:storageRegion ; + osf:versionNumber "1" . + a osf:FundingAward ; dcterms:contributor ; dcterms:identifier "https://moneypockets.example/millions" ; @@ -116,3 +131,5 @@ rdfs:label "Dataset"@en . rdfs:label "StudyRegistration"@en . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/project_monthly_supplement.turtle b/osf_tests/metadata/expected_metadata_files/project_monthly_supplement.turtle new file mode 100644 index 00000000000..dd9c54b1f93 --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/project_monthly_supplement.turtle @@ -0,0 +1,13 @@ +@prefix dcat: . +@prefix dcterms: . +@prefix foaf: . +@prefix osf: . +@prefix xsd: . 
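+# last month's usage for this project, as gathered by gather_last_month_usage;
+# per COUNTER, viewCount ~ Total Investigations and downloadCount ~ Total Requests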
+ + osf:usage [ dcterms:temporal "2123-05"^^xsd:gYearMonth ; + dcat:accessService ; + foaf:primaryTopic ; + osf:downloadCount 3 ; + osf:downloadSessionCount 2 ; + osf:viewCount 7 ; + osf:viewSessionCount 5 ] . diff --git a/osf_tests/metadata/expected_metadata_files/project_supplement.turtle b/osf_tests/metadata/expected_metadata_files/project_supplement.turtle new file mode 100644 index 00000000000..d055e97554f --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/project_supplement.turtle @@ -0,0 +1,13 @@ +@prefix dcterms: . +@prefix osf: . +@prefix skos: . + + osf:hasOsfAddon ; + osf:storageByteCount 7 ; + osf:storageRegion . + + a osf:AddonImplementation ; + dcterms:identifier "gitlab" ; + skos:prefLabel "GitLab" . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/registration_basic.turtle b/osf_tests/metadata/expected_metadata_files/registration_basic.turtle index eae4a92336c..9601477944f 100644 --- a/osf_tests/metadata/expected_metadata_files/registration_basic.turtle +++ b/osf_tests/metadata/expected_metadata_files/registration_basic.turtle @@ -3,7 +3,9 @@ @prefix foaf: . @prefix osf: . @prefix owl: . +@prefix prov: . @prefix rdfs: . +@prefix skos: . a osf:Registration ; dcterms:conformsTo ; @@ -21,7 +23,10 @@ dcterms:title "this is a project title!" ; dcterms:type ; dcat:accessService ; - osf:hostingInstitution . + osf:contains ; + osf:hostingInstitution ; + prov:qualifiedAttribution [ dcat:hadRole osf:admin-contributor ; + prov:agent ] . a osf:Project ; dcterms:created "2123-05-04" ; @@ -36,6 +41,25 @@ dcterms:title "this is a project title!" ; owl:sameAs . + a osf:File ; + dcterms:created "2123-05-04" ; + dcterms:identifier "http://localhost:5000/w6ibb" ; + dcterms:modified "2123-05-04" ; + osf:fileName "my-reg-file.blarg" ; + osf:filePath "/my-reg-file.blarg" ; + osf:hasFileVersion ; + osf:isContainedBy . + + a osf:FileVersion ; + dcterms:created "2123-05-04" ; + dcterms:creator ; + dcterms:extent "0.000016 MB" ; + dcterms:format "img/png" ; + dcterms:modified "2123-05-04" ; + dcterms:requires ; + osf:storageRegion ; + osf:versionNumber "1" . + a dcterms:Agent, foaf:Organization ; dcterms:identifier "https://cos.io/", @@ -61,3 +85,5 @@ dcterms:title "Open-Ended Registration" . rdfs:label "StudyRegistration"@en . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/registration_full.turtle b/osf_tests/metadata/expected_metadata_files/registration_full.turtle index d30c4594bbe..4ab508c2f17 100644 --- a/osf_tests/metadata/expected_metadata_files/registration_full.turtle +++ b/osf_tests/metadata/expected_metadata_files/registration_full.turtle @@ -3,7 +3,9 @@ @prefix foaf: . @prefix osf: . @prefix owl: . +@prefix prov: . @prefix rdfs: . +@prefix skos: . a osf:Registration ; dcterms:conformsTo ; @@ -21,7 +23,10 @@ dcterms:title "this is a project title!" ; dcterms:type ; dcat:accessService ; - osf:hostingInstitution . + osf:contains ; + osf:hostingInstitution ; + prov:qualifiedAttribution [ dcat:hadRole osf:admin-contributor ; + prov:agent ] . a osf:Project ; dcterms:created "2123-05-04" ; @@ -41,6 +46,25 @@ osf:hasFunding , . + a osf:File ; + dcterms:created "2123-05-04" ; + dcterms:identifier "http://localhost:5000/w6ibb" ; + dcterms:modified "2123-05-04" ; + osf:fileName "my-reg-file.blarg" ; + osf:filePath "/my-reg-file.blarg" ; + osf:hasFileVersion ; + osf:isContainedBy . 
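+# the file-version stanza below carries its storage region (osf:storageRegion)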
+ + a osf:FileVersion ; + dcterms:created "2123-05-04" ; + dcterms:creator ; + dcterms:extent "0.000016 MB" ; + dcterms:format "img/png" ; + dcterms:modified "2123-05-04" ; + dcterms:requires ; + osf:storageRegion ; + osf:versionNumber "1" . + a osf:FundingAward ; dcterms:contributor ; dcterms:identifier "https://moneypockets.example/millions" ; @@ -91,3 +115,5 @@ dcterms:title "Open-Ended Registration" . rdfs:label "StudyRegistration"@en . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/registration_monthly_supplement.turtle b/osf_tests/metadata/expected_metadata_files/registration_monthly_supplement.turtle new file mode 100644 index 00000000000..435f7f4f921 --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/registration_monthly_supplement.turtle @@ -0,0 +1,13 @@ +@prefix dcat: . +@prefix dcterms: . +@prefix foaf: . +@prefix osf: . +@prefix xsd: . + + osf:usage [ dcterms:temporal "2123-05"^^xsd:gYearMonth ; + dcat:accessService ; + foaf:primaryTopic ; + osf:downloadCount 3 ; + osf:downloadSessionCount 2 ; + osf:viewCount 7 ; + osf:viewSessionCount 5 ] . diff --git a/osf_tests/metadata/expected_metadata_files/registration_supplement.turtle b/osf_tests/metadata/expected_metadata_files/registration_supplement.turtle new file mode 100644 index 00000000000..9e8201b7915 --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/registration_supplement.turtle @@ -0,0 +1,7 @@ +@prefix osf: . +@prefix skos: . + + osf:storageByteCount 17 ; + osf:storageRegion . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/user_monthly_supplement.turtle b/osf_tests/metadata/expected_metadata_files/user_monthly_supplement.turtle new file mode 100644 index 00000000000..662c197699d --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/user_monthly_supplement.turtle @@ -0,0 +1 @@ +# correctly empty (for now) diff --git a/osf_tests/metadata/expected_metadata_files/user_supplement.turtle b/osf_tests/metadata/expected_metadata_files/user_supplement.turtle new file mode 100644 index 00000000000..662c197699d --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/user_supplement.turtle @@ -0,0 +1 @@ +# correctly empty (for now) diff --git a/osf_tests/metadata/test_basket.py b/osf_tests/metadata/test_basket.py index 1fa8381cf08..c34ded3e2c5 100644 --- a/osf_tests/metadata/test_basket.py +++ b/osf_tests/metadata/test_basket.py @@ -34,7 +34,7 @@ def test_goodbasket(): basket = gather.Basket(focus) assert basket.focus == focus assert isinstance(basket.gathered_metadata, rdflib.Graph) - assert len(basket.gathered_metadata) == 1 + assert len(basket.gathered_metadata) == 0 assert len(basket._gathertasks_done) == 0 assert len(basket._known_focus_dict) == 1 # no repeat gathertasks: @@ -78,5 +78,6 @@ def test_goodbasket(): # reset basket.reset() - assert len(basket.gathered_metadata) == 1 + assert len(basket.gathered_metadata) == 0 assert len(basket._gathertasks_done) == 0 + assert len(basket._known_focus_dict) == 1 diff --git a/osf_tests/metadata/test_gatherer_registry.py b/osf_tests/metadata/test_gatherer_registry.py index fda28eaf680..c139946ab80 100644 --- a/osf_tests/metadata/test_gatherer_registry.py +++ b/osf_tests/metadata/test_gatherer_registry.py @@ -74,6 +74,10 @@ def gather_agent_name(focus): gather_preprint_or_project_creator, gather_special_preprint_creator, } + assert get_gatherers(BAZ.Preprint, [BAZ.creator], include_focustype_defaults=False) == { + gather_preprint_or_project_creator, + 
gather_special_preprint_creator, + } assert get_gatherers(BAZ.Agent, [FOO.name, FOO.identifier, FOO.unknown]) == { gather_agent_name, gather_identifiers, diff --git a/osf_tests/metadata/test_osf_gathering.py b/osf_tests/metadata/test_osf_gathering.py index 7bd72770aba..4c064c8a690 100644 --- a/osf_tests/metadata/test_osf_gathering.py +++ b/osf_tests/metadata/test_osf_gathering.py @@ -1,4 +1,5 @@ import datetime +from unittest import mock from django.test import TestCase import rdflib @@ -11,15 +12,19 @@ FOAF, OSF, OSFIO, + DCAT, DCTERMS, DCMITYPE, DOI, OWL, + PROV, RDF, SKOS, checksum_iri, ) from osf import models as osfdb +from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.utils import YearMonth from osf.utils import permissions, workflows from osf_tests import factories from website import settings as website_settings @@ -36,12 +41,13 @@ def setUpTestData(cls): external_identity={'ORCID': {'1234-4321-5678-8765': 'VERIFIED'}}, ) cls.user__readonly = factories.UserFactory( - external_identity={'ORCID': {'1234-4321-6789-9876': 'CREATE'}}, + external_identity={'ORCID': {'1234-4321-6789-9876': 'CREATE'}}, # unverified orcid social={ 'profileWebsites': ['http://mysite.example', 'http://myothersite.example/foo'], 'baiduScholar': 'blarg', }, ) + cls.user__invisible = factories.UserFactory() # cedar metadata template cls.cedar_template = factories.CedarMetadataTemplateFactory( cedar_id='https://repo.metadatacenter.org/templates/this-is-a-cedar-id', @@ -51,8 +57,11 @@ def setUpTestData(cls): ) # project (with components): cls.project = factories.ProjectFactory(creator=cls.user__admin, is_public=True) + cls.project.add_addon('box', auth=None) + cls.project.add_addon('gitlab', auth=None) cls.project.add_contributor(cls.user__readwrite, permissions=permissions.WRITE) - cls.project.add_contributor(cls.user__readonly, permissions=permissions.READ, visible=False) + cls.project.add_contributor(cls.user__readonly, permissions=permissions.READ) + cls.project.add_contributor(cls.user__invisible, permissions=permissions.WRITE, visible=False) cls.component = factories.ProjectFactory(parent=cls.project, creator=cls.user__admin, is_public=True) cls.sibcomponent = factories.ProjectFactory(parent=cls.project, creator=cls.user__admin, is_public=True) cls.subcomponent = factories.ProjectFactory(parent=cls.component, creator=cls.user__admin, is_public=True) @@ -89,7 +98,8 @@ def setUpTestData(cls): is_public=True, ) cls.preprint.add_contributor(cls.user__readwrite, permissions=permissions.WRITE) - cls.preprint.add_contributor(cls.user__readonly, permissions=permissions.READ, visible=False) + cls.preprint.add_contributor(cls.user__readonly, permissions=permissions.READ) + cls.preprint.add_contributor(cls.user__invisible, permissions=permissions.WRITE, visible=False) cls.registration_cedar_record = factories.CedarMetadataRecordFactory( template=cls.cedar_template, is_published=True, @@ -453,6 +463,7 @@ def test_gather_versions(self): # focus: file fileversion = self.file.versions.first() fileversion_iri = URIRef(f'{self.filefocus.iri}?revision={fileversion.identifier}') + storageregion_iri = URIRef(f'{website_settings.API_DOMAIN}v2/regions/us/') assert_triples(osf_gathering.gather_versions(self.filefocus), { (self.filefocus.iri, OSF.hasFileVersion, fileversion_iri), (fileversion_iri, RDF.type, OSF.FileVersion), @@ -462,7 +473,9 @@ def test_gather_versions(self): (fileversion_iri, DCTERMS['format'], Literal(fileversion.content_type)), (fileversion_iri, DCTERMS.extent, Literal('0.118 MB')), 
(fileversion_iri, OSF.versionNumber, Literal(fileversion.identifier)), - (fileversion_iri, DCTERMS.requires, checksum_iri('sha-256', self.file_sha256)) + (fileversion_iri, DCTERMS.requires, checksum_iri('sha-256', self.file_sha256)), + (fileversion_iri, OSF.storageRegion, storageregion_iri), + (storageregion_iri, SKOS.prefLabel, Literal('United States', lang='en')), }) def test_gather_files(self): @@ -521,11 +534,19 @@ def test_gather_agents(self): assert_triples(osf_gathering.gather_agents(self.projectfocus), { (self.projectfocus.iri, DCTERMS.creator, self.userfocus__admin), (self.projectfocus.iri, DCTERMS.creator, self.userfocus__readwrite), + (self.projectfocus.iri, DCTERMS.creator, self.userfocus__readonly), }) # focus: registration assert_triples(osf_gathering.gather_agents(self.registrationfocus), { (self.registrationfocus.iri, DCTERMS.creator, self.userfocus__admin), (self.registrationfocus.iri, DCTERMS.creator, self.userfocus__readwrite), + (self.registrationfocus.iri, DCTERMS.creator, self.userfocus__readonly), + }) + # focus: preprint + assert_triples(osf_gathering.gather_agents(self.preprintfocus), { + (self.preprintfocus.iri, DCTERMS.creator, self.userfocus__admin), + (self.preprintfocus.iri, DCTERMS.creator, self.userfocus__readwrite), + (self.preprintfocus.iri, DCTERMS.creator, self.userfocus__readonly), }) # focus: file assert_triples(osf_gathering.gather_agents(self.filefocus), set()) @@ -750,3 +771,116 @@ def test_gather_cedar_templates(self): (self.filefocus.iri, OSF.hasCedarTemplate, cedar_template_iri), (cedar_template_iri, DCTERMS.title, Literal(self.cedar_template.schema_name)) }) + + def test_gather_last_month_usage(self): + # no usage report: + with mock.patch( + 'osf.metrics.reports.PublicItemUsageReport.for_last_month', + return_value=None, + ): + assert_triples(osf_gathering.gather_last_month_usage(self.projectfocus), set()) + # yes usage report: + _ym = YearMonth.from_date(datetime.datetime.now(tz=datetime.UTC)) + with mock.patch( + 'osf.metrics.reports.PublicItemUsageReport.for_last_month', + return_value=PublicItemUsageReport( + item_osfid=self.project._id, + report_yearmonth=_ym, + view_count=71, + view_session_count=13, + download_count=43, + download_session_count=11, + ), + ): + _usage_bnode = rdflib.BNode() + assert_triples(osf_gathering.gather_last_month_usage(self.projectfocus), { + (self.projectfocus.iri, OSF.usage, _usage_bnode), + (_usage_bnode, DCTERMS.temporal, Literal(str(_ym), datatype=rdflib.XSD.gYearMonth)), + (_usage_bnode, DCAT.accessService, rdflib.URIRef(website_settings.DOMAIN.rstrip('/'))), + (_usage_bnode, FOAF.primaryTopic, self.projectfocus.iri), + (_usage_bnode, OSF.viewCount, Literal(71)), + (_usage_bnode, OSF.viewSessionCount, Literal(13)), + (_usage_bnode, OSF.downloadCount, Literal(43)), + (_usage_bnode, OSF.downloadSessionCount, Literal(11)), + }) + + def test_gather_addons(self): + # registration (without non-default addon) + assert_triples(osf_gathering.gather_addons(self.registrationfocus), set()) + # project (with non-default addons) + _box_ref = rdflib.URIRef('urn:osf.io:addons:box') + _gitlab_ref = rdflib.URIRef('urn:osf.io:addons:gitlab') + assert_triples(osf_gathering.gather_addons(self.projectfocus), { + (self.projectfocus.iri, OSF.hasOsfAddon, _box_ref), + (_box_ref, RDF.type, OSF.AddonImplementation), + (_box_ref, DCTERMS.identifier, Literal('box')), + (_box_ref, SKOS.prefLabel, Literal('Box')), + (self.projectfocus.iri, OSF.hasOsfAddon, _gitlab_ref), + (_gitlab_ref, RDF.type, OSF.AddonImplementation), + (_gitlab_ref, 
DCTERMS.identifier, Literal('gitlab')), + (_gitlab_ref, SKOS.prefLabel, Literal('GitLab')), + }) + + def test_gather_storage_region(self): + _default_region_ref = rdflib.URIRef(f'{website_settings.API_DOMAIN}v2/regions/us/') + assert_triples(osf_gathering.gather_storage_region(self.projectfocus), { + (self.projectfocus.iri, OSF.storageRegion, _default_region_ref), + (_default_region_ref, SKOS.prefLabel, Literal('United States', lang='en')), + }) + assert_triples(osf_gathering.gather_storage_region(self.registrationfocus), { + (self.registrationfocus.iri, OSF.storageRegion, _default_region_ref), + (_default_region_ref, SKOS.prefLabel, Literal('United States', lang='en')), + }) + assert_triples(osf_gathering.gather_storage_region(self.preprintfocus), { + (self.preprintfocus.iri, OSF.storageRegion, _default_region_ref), + (_default_region_ref, SKOS.prefLabel, Literal('United States', lang='en')), + }) + + def test_gather_qualified_attributions(self): + _attribution_admin = rdflib.BNode() + _attribution_readwrite = rdflib.BNode() + _attribution_readonly = rdflib.BNode() + assert_triples(osf_gathering.gather_qualified_attributions(self.projectfocus), { + (self.projectfocus.iri, PROV.qualifiedAttribution, _attribution_admin), + (_attribution_admin, PROV.agent, self.userfocus__admin), + (_attribution_admin, DCAT.hadRole, OSF['admin-contributor']), + (self.projectfocus.iri, PROV.qualifiedAttribution, _attribution_readwrite), + (_attribution_readwrite, PROV.agent, self.userfocus__readwrite), + (_attribution_readwrite, DCAT.hadRole, OSF['write-contributor']), + (self.projectfocus.iri, PROV.qualifiedAttribution, _attribution_readonly), + (_attribution_readonly, PROV.agent, self.userfocus__readonly), + (_attribution_readonly, DCAT.hadRole, OSF['readonly-contributor']), + }) + assert_triples(osf_gathering.gather_qualified_attributions(self.registrationfocus), { + (self.registrationfocus.iri, PROV.qualifiedAttribution, _attribution_admin), + (_attribution_admin, PROV.agent, self.userfocus__admin), + (_attribution_admin, DCAT.hadRole, OSF['admin-contributor']), + (self.registrationfocus.iri, PROV.qualifiedAttribution, _attribution_readwrite), + (_attribution_readwrite, PROV.agent, self.userfocus__readwrite), + (_attribution_readwrite, DCAT.hadRole, OSF['write-contributor']), + (self.registrationfocus.iri, PROV.qualifiedAttribution, _attribution_readonly), + (_attribution_readonly, PROV.agent, self.userfocus__readonly), + (_attribution_readonly, DCAT.hadRole, OSF['readonly-contributor']), + }) + assert_triples(osf_gathering.gather_qualified_attributions(self.preprintfocus), { + (self.preprintfocus.iri, PROV.qualifiedAttribution, _attribution_admin), + (_attribution_admin, PROV.agent, self.userfocus__admin), + (_attribution_admin, DCAT.hadRole, OSF['admin-contributor']), + (self.preprintfocus.iri, PROV.qualifiedAttribution, _attribution_readwrite), + (_attribution_readwrite, PROV.agent, self.userfocus__readwrite), + (_attribution_readwrite, DCAT.hadRole, OSF['write-contributor']), + (self.preprintfocus.iri, PROV.qualifiedAttribution, _attribution_readonly), + (_attribution_readonly, PROV.agent, self.userfocus__readonly), + (_attribution_readonly, DCAT.hadRole, OSF['readonly-contributor']), + }) + + def test_gather_storage_byte_count(self): + assert_triples(osf_gathering.gather_storage_byte_count(self.projectfocus), { + (self.projectfocus.iri, OSF.storageByteCount, Literal(123456)), + }) + assert_triples(osf_gathering.gather_storage_byte_count(self.registrationfocus), { + (self.registrationfocus.iri, 
OSF.storageByteCount, Literal(0)), + }) + assert_triples(osf_gathering.gather_storage_byte_count(self.preprintfocus), { + (self.preprintfocus.iri, OSF.storageByteCount, Literal(1337)), + }) diff --git a/osf_tests/metadata/test_serialized_metadata.py b/osf_tests/metadata/test_serialized_metadata.py index 0c74961778a..c8a0eee95ac 100644 --- a/osf_tests/metadata/test_serialized_metadata.py +++ b/osf_tests/metadata/test_serialized_metadata.py @@ -5,8 +5,11 @@ import rdflib from osf import models as osfdb +from osf.metadata.osf_gathering import OsfmapPartition from osf.metadata.rdfutils import OSF, DCTERMS from osf.metadata.tools import pls_gather_metadata_file +from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.utils import YearMonth from osf.models.licenses import NodeLicense from api_tests.utils import create_test_file from osf_tests import factories @@ -22,53 +25,103 @@ BASIC_METADATA_SCENARIO = { OSF.Project: { - 'turtle': 'project_basic.turtle', - 'datacite-xml': 'project_basic.datacite.xml', - 'datacite-json': 'project_basic.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 'project_basic.turtle', + 'datacite-xml': 'project_basic.datacite.xml', + 'datacite-json': 'project_basic.datacite.json', + }, }, OSF.Preprint: { - 'turtle': 'preprint_basic.turtle', - 'datacite-xml': 'preprint_basic.datacite.xml', - 'datacite-json': 'preprint_basic.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 'preprint_basic.turtle', + 'datacite-xml': 'preprint_basic.datacite.xml', + 'datacite-json': 'preprint_basic.datacite.json', + }, }, OSF.Registration: { - 'turtle': 'registration_basic.turtle', - 'datacite-xml': 'registration_basic.datacite.xml', - 'datacite-json': 'registration_basic.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 'registration_basic.turtle', + 'datacite-xml': 'registration_basic.datacite.xml', + 'datacite-json': 'registration_basic.datacite.json', + }, }, OSF.File: { - 'turtle': 'file_basic.turtle', - 'datacite-xml': 'file_basic.datacite.xml', - 'datacite-json': 'file_basic.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 'file_basic.turtle', + 'datacite-xml': 'file_basic.datacite.xml', + 'datacite-json': 'file_basic.datacite.json', + }, }, DCTERMS.Agent: { - 'turtle': 'user_basic.turtle', + OsfmapPartition.MAIN: { + 'turtle': 'user_basic.turtle', + }, }, } FULL_METADATA_SCENARIO = { OSF.Project: { - 'turtle': 'project_full.turtle', - 'datacite-xml': 'project_full.datacite.xml', - 'datacite-json': 'project_full.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 'project_full.turtle', + 'datacite-xml': 'project_full.datacite.xml', + 'datacite-json': 'project_full.datacite.json', + }, + OsfmapPartition.SUPPLEMENT: { + 'turtle': 'project_supplement.turtle', + }, + OsfmapPartition.MONTHLY_SUPPLEMENT: { + 'turtle': 'project_monthly_supplement.turtle', + }, }, OSF.Preprint: { - 'turtle': 'preprint_full.turtle', - 'datacite-xml': 'preprint_full.datacite.xml', - 'datacite-json': 'preprint_full.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 'preprint_full.turtle', + 'datacite-xml': 'preprint_full.datacite.xml', + 'datacite-json': 'preprint_full.datacite.json', + }, + OsfmapPartition.SUPPLEMENT: { + 'turtle': 'preprint_supplement.turtle', + }, + OsfmapPartition.MONTHLY_SUPPLEMENT: { + 'turtle': 'preprint_monthly_supplement.turtle', + }, }, OSF.Registration: { - 'turtle': 'registration_full.turtle', - 'datacite-xml': 'registration_full.datacite.xml', - 'datacite-json': 'registration_full.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 
'registration_full.turtle', + 'datacite-xml': 'registration_full.datacite.xml', + 'datacite-json': 'registration_full.datacite.json', + }, + OsfmapPartition.SUPPLEMENT: { + 'turtle': 'registration_supplement.turtle', + }, + OsfmapPartition.MONTHLY_SUPPLEMENT: { + 'turtle': 'registration_monthly_supplement.turtle', + }, }, OSF.File: { - 'turtle': 'file_full.turtle', - 'datacite-xml': 'file_full.datacite.xml', - 'datacite-json': 'file_full.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 'file_full.turtle', + 'datacite-xml': 'file_full.datacite.xml', + 'datacite-json': 'file_full.datacite.json', + }, + OsfmapPartition.SUPPLEMENT: { + 'turtle': 'file_supplement.turtle', + }, + OsfmapPartition.MONTHLY_SUPPLEMENT: { + 'turtle': 'file_monthly_supplement.turtle', + }, }, DCTERMS.Agent: { - 'turtle': 'user_full.turtle', + OsfmapPartition.MAIN: { + 'turtle': 'user_full.turtle', + }, + OsfmapPartition.SUPPLEMENT: { + 'turtle': 'user_supplement.turtle', + }, + OsfmapPartition.MONTHLY_SUPPLEMENT: { + 'turtle': 'user_monthly_supplement.turtle', + }, }, } @@ -124,8 +177,7 @@ def setUp(self): mock.patch('django.utils.timezone.now', new=forever_now), mock.patch('osf.models.metaschema.RegistrationSchema.absolute_api_v2_url', new='http://fake.example/schema/for/test'), ): - patcher.start() - self.addCleanup(patcher.stop) + self.enterContext(patcher) # build test objects self.user = factories.AuthUserFactory( fullname='Person McNamington', @@ -147,12 +199,13 @@ def setUp(self): category='doi', value=f'10.70102/FK2osf.io/{self.project._id}', ) + self.project.add_addon('gitlab', auth=None) self.file = create_test_file( self.project, self.user, filename='my-file.blarg', size=7, - sha256='6ac3c336e4094835293a3fed8a4b5fedde1b5e2626d9838fed50693bba00af0e', + sha256='shashasha', ) osf_preprint_provider = factories.PreprintProviderFactory(_id='osf') another_provider = factories.PreprintProviderFactory( @@ -208,9 +261,26 @@ def setUp(self): doi_prefix='11.rp', ), ) + self.reg_file = create_test_file( + self.registration, + self.user, + filename='my-reg-file.blarg', + size=17, + sha256='shashasha', + ) osfdb.GuidMetadataRecord.objects.for_guid(self.registration._id).update({ 'resource_type_general': 'StudyRegistration', }, auth=self.user) + self.enterContext(mock.patch( + 'osf.metrics.reports.PublicItemUsageReport.for_last_month', + return_value=PublicItemUsageReport( + report_yearmonth=YearMonth.from_date(forever_now()), + view_count=7, + view_session_count=5, + download_count=3, + download_session_count=2, + ), + )) self.guid_dict = { OSF.Project: self.project._id, OSF.Preprint: self.preprint._id, @@ -261,27 +331,37 @@ def test_serialized_metadata(self): self._assert_scenario(FULL_METADATA_SCENARIO) def _assert_scenario(self, scenario_dict): - for focus_type, expected_files in scenario_dict.items(): - for format_key, filename in expected_files.items(): - osfguid = self.guid_dict[focus_type] - gathered_file = pls_gather_metadata_file(osfguid, format_key) - with self.subTest(focus_type=focus_type, format_key=format_key, testpath='pls_gather_metadata_file'): - self.assertEqual(gathered_file.mediatype, EXPECTED_MEDIATYPE[format_key]) - # to update expected metadata, uncomment `_write_expected_file` and this - # next line (being careful not to leave it uncommented...) 
and run tests - # self._write_expected_file(filename, gathered_file.serialized_metadata) - self._assert_expected_file(filename, gathered_file.serialized_metadata) + for focus_type, by_partition in scenario_dict.items(): + for osfmap_partition, expected_files in by_partition.items(): + for format_key, filename in expected_files.items(): + self._assert_scenario_file(focus_type, osfmap_partition, format_key, filename) - with self.subTest(focus_type=focus_type, format_key=format_key, testpath='metadata download'): - resp = self.app.get(f'/{osfguid}/metadata/?format={format_key}') - assert resp.status_code == 200 - self.assertEqual(resp.status_code, 200) - self.assertEqual(resp.headers['Content-Type'], EXPECTED_MEDIATYPE[format_key]) - self.assertEqual( - resp.headers['Content-Disposition'], - f'attachment; filename={gathered_file.filename}', - ) - self._assert_expected_file(filename, resp.text) + def _assert_scenario_file( + self, + focus_type: str, + osfmap_partition: OsfmapPartition, + format_key: str, + filename: str, + ): + osfguid = self.guid_dict[focus_type] + gathered_file = pls_gather_metadata_file(osfguid, format_key, {'osfmap_partition': osfmap_partition}) + with self.subTest(focus_type=focus_type, format_key=format_key, testpath='pls_gather_metadata_file'): + self.assertEqual(gathered_file.mediatype, EXPECTED_MEDIATYPE[format_key]) + # to update expected metadata, uncomment `_write_expected_file` and this + # next line (being careful not to leave it uncommented...) and run tests + # self._write_expected_file(filename, gathered_file.serialized_metadata) + self._assert_expected_file(filename, gathered_file.serialized_metadata) + if not osfmap_partition.is_supplementary: + with self.subTest(focus_type=focus_type, format_key=format_key, testpath='metadata download'): + resp = self.app.get(f'/{osfguid}/metadata/?format={format_key}') + assert resp.status_code == 200 + self.assertEqual(resp.status_code, 200) + self.assertEqual(resp.headers['Content-Type'], EXPECTED_MEDIATYPE[format_key]) + self.assertEqual( + resp.headers['Content-Disposition'], + f'attachment; filename={gathered_file.filename}', + ) + self._assert_expected_file(filename, resp.text) def _assert_expected_file(self, filename, actual_metadata): _open_mode = ('rb' if isinstance(actual_metadata, bytes) else 'r') @@ -290,16 +370,16 @@ def _assert_expected_file(self, filename, actual_metadata): if filename.endswith('.turtle'): # HACK: because the turtle serializer may output things in different order # TODO: stable turtle serializer (or another primitive rdf serialization) - self._assert_equivalent_turtle(actual_metadata, _expected_metadata) + self._assert_equivalent_turtle(actual_metadata, _expected_metadata, filename) else: self.assertEqual(actual_metadata, _expected_metadata) - def _assert_equivalent_turtle(self, actual_turtle, expected_turtle): + def _assert_equivalent_turtle(self, actual_turtle, expected_turtle, filename): _actual = rdflib.Graph() _actual.parse(data=actual_turtle, format='turtle') _expected = rdflib.Graph() _expected.parse(data=expected_turtle, format='turtle') - assert_graphs_equal(_actual, _expected) + assert_graphs_equal(_actual, _expected, label=filename) # def _write_expected_file(self, filename, expected_metadata): # '''for updating expected metadata files from current serializers diff --git a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py new file mode 100644 index 00000000000..715a2cd1553 --- /dev/null +++ 
b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py @@ -0,0 +1,286 @@ +import time +import datetime +import logging +from django.test import TestCase +from osf.metrics.reporters import InstitutionalSummaryMonthlyReporter +from osf.metrics.utils import YearMonth +from osf_tests.factories import ( + InstitutionFactory, + ProjectFactory, + RegistrationFactory, + PreprintFactory, + AuthUserFactory, +) + + +class TestInstiSummaryMonthlyReporter(TestCase): + + @classmethod + def setUpTestData(cls): + cls._yearmonth = YearMonth(2018, 2) # February 2018 + cls._institution = InstitutionFactory() + cls._now = datetime.datetime(2018, 2, 4, tzinfo=datetime.UTC) + + # Existing data for the primary institution + cls._public_project = cls._create_affiliated_project(cls._institution, is_public=True, created=cls._now) + cls._private_project = cls._create_affiliated_project(cls._institution, is_public=False, created=cls._now) + cls._public_registration = cls._create_affiliated_registration(cls._institution, is_public=True, created=cls._now) + cls._embargoed_registration = cls._create_affiliated_registration(cls._institution, is_public=False, created=cls._now) + + cls._published_preprint = cls._create_affiliated_preprint(cls._institution, is_public=True, created=cls._now) + + cls._logged_in_user = cls._create_logged_in_user(cls._institution, date_last_login=cls._now) + cls._active_user = cls._create_active_user(cls._institution, date_confirmed=cls._now - datetime.timedelta(days=1)) + + @classmethod + def _create_affiliated_preprint(cls, institution, is_public, created): + published_preprint = PreprintFactory(is_public=is_public) + published_preprint.affiliated_institutions.add(institution) + published_preprint.created = created + published_preprint.save() + return published_preprint + + @classmethod + def _create_affiliated_project(cls, institution, is_public, created): + project = ProjectFactory(is_public=is_public) + project.affiliated_institutions.add(institution) + project.created = created + project.save() + return project + + @classmethod + def _create_affiliated_registration(cls, institution, is_public, created): + registration = RegistrationFactory(is_public=is_public) + registration.affiliated_institutions.add(institution) + registration.created = created + registration.save() + return registration + + @classmethod + def _create_logged_in_user(cls, institution, date_last_login): + user = AuthUserFactory() + user.add_or_update_affiliated_institution(institution) + user.date_last_login = date_last_login + user.save() + return user + + @classmethod + def _create_active_user(cls, institution, date_confirmed): + user = AuthUserFactory() + user.add_or_update_affiliated_institution(institution) + user.date_confirmed = date_confirmed + ProjectFactory(creator=user) # adds log to make active + log = user.logs.get() + log.created = date_confirmed + log.save() + user.save() + return user + + def test_report_generation(self): + reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) + reports = list(reporter.report()) + self.assertEqual(len(reports), 1) + + report = reports[0] + self.assertEqual(report.institution_id, self._institution._id) + self.assertEqual(report.user_count, 2) # _logged_in_user and _active_user + self.assertEqual(report.public_project_count, 1) + self.assertEqual(report.private_project_count, 1) + self.assertEqual(report.public_registration_count, 1) + self.assertEqual(report.embargoed_registration_count, 1) + self.assertEqual(report.published_preprint_count, 1) + 
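# file counts and bytes come from the published preprint's primary file;
# the projects and registrations created above have no files attached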
self.assertEqual(report.storage_byte_count, 1337) # test value for one file + self.assertEqual(report.public_file_count, 1) + self.assertEqual(report.monthly_logged_in_user_count, 1) + self.assertEqual(report.monthly_active_user_count, 1) + + def test_report_generation_multiple_institutions(self): + institution2 = InstitutionFactory() + institution3 = InstitutionFactory() + + # Set up dates for different months + last_month = datetime.datetime(2018, 1, 15, tzinfo=datetime.UTC) + next_month = datetime.datetime(2018, 3, 10, tzinfo=datetime.UTC) + + self._create_affiliated_project(institution2, is_public=True, created=self._now) + self._create_affiliated_project(institution3, is_public=True, created=last_month) + + # Create future projects for self._institution (should not be counted) + self._create_affiliated_project(self._institution, is_public=True, created=next_month) + + # Create users affiliated with different institutions + self._create_active_user(institution2, date_confirmed=self._now) + self._create_active_user(institution3, date_confirmed=last_month) + + # Run the reporter for the current month (February 2018) + reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) + reports = list(reporter.report()) + self.assertEqual(len(reports), 3) # Reports for self._institution, institution2, institution3 + + # Extract reports by institution + report_institution = next(r for r in reports if r.institution_id == self._institution._id) + report_institution2 = next(r for r in reports if r.institution_id == institution2._id) + + # Validate report for self._institution + self.assertEqual(report_institution.public_project_count, 1) + self.assertEqual(report_institution.private_project_count, 1) + self.assertEqual(report_institution.user_count, 2) + self.assertEqual(report_institution.monthly_active_user_count, 1) + self.assertEqual(report_institution.monthly_logged_in_user_count, 1) + + # Validate report for institution2 + self.assertEqual(report_institution2.public_project_count, 1) + self.assertEqual(report_institution2.private_project_count, 0) + self.assertEqual(report_institution2.user_count, 1) + self.assertEqual(report_institution2.monthly_active_user_count, 1) + self.assertEqual(report_institution2.monthly_logged_in_user_count, 0) # No logged-in users + + +class TestSummaryMonthlyReporterBenchmarker(TestCase): + + @classmethod + def setUpTestData(cls): + cls.logger = logging.getLogger(__name__) + logging.basicConfig(level=logging.INFO) + cls._yearmonth = YearMonth(2018, 2) # February 2018 + cls._institution = InstitutionFactory() + cls._now = datetime.datetime(2018, 2, 4, tzinfo=datetime.UTC) + cls.enable_benchmarking = True + + @classmethod + def _create_affiliated_preprint(cls, institution, is_public, created, creator=None): + published_preprint = PreprintFactory(is_public=is_public, creator=creator) + published_preprint.affiliated_institutions.add(institution) + published_preprint.created = created + published_preprint.save() + return published_preprint + + @classmethod + def _create_affiliated_project(cls, institution, is_public, created, creator=None): + project = ProjectFactory(is_public=is_public, creator=creator) + project.affiliated_institutions.add(institution) + project.created = created + project.save() + return project + + @classmethod + def _create_affiliated_registration(cls, institution, is_public, created, creator=None): + registration = RegistrationFactory(is_public=is_public, creator=creator) + registration.affiliated_institutions.add(institution) + 
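# overwrite the factory-stamped `created` so the object falls inside the
# month being reported on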
registration.created = created + registration.save() + return registration + + @classmethod + def _create_logged_in_user(cls, institution, date_last_login): + user = AuthUserFactory() + user.add_or_update_affiliated_institution(institution) + user.date_last_login = date_last_login + user.save() + return user + + @classmethod + def _create_active_user(cls, institution, date_confirmed): + user = AuthUserFactory() + user.add_or_update_affiliated_institution(institution) + user.date_confirmed = date_confirmed + ProjectFactory(creator=user) # adds log to make active + log = user.logs.get() + log.created = date_confirmed + log.save() + user.save() + return user + + def test_high_counts_multiple_institutions(self): + """ + Test the report generation with configurable high counts for institutions, users, and their objects. + Benchmarking can be enabled by setting the 'enable_benchmarking' attribute to True. + """ + # Check if benchmarking is enabled + enable_benchmarking = self.enable_benchmarking + + # Configure counts (adjust these numbers as needed) + additional_institution_count = 1 # Number of institutions (adjust as needed) + users_per_institution = 3 # Number of users per institution (adjust as needed) + objects_per_user = 3 # Number of objects per user (adjust as needed) + + # Timing variables + if enable_benchmarking: + total_start_time = time.time() + data_creation_start_time = time.time() + + # Create institutions + institutions = [self._institution] + institutions += [InstitutionFactory() for _ in range(additional_institution_count)] + + if enable_benchmarking: + institutions_creation_time = time.time() + self.logger.info( + f"Time taken to create {additional_institution_count + 1} institutions: {institutions_creation_time - data_creation_start_time:.2f} seconds") + + # Generate data for each institution + if enable_benchmarking: + users_creation_start_time = time.time() + institution_users = {} + for institution in institutions: + # Create users for the institution + users = [] + for _ in range(users_per_institution): + user = AuthUserFactory() + user.add_or_update_affiliated_institution(institution) + user.date_last_login = self._now + user.date_confirmed = self._now - datetime.timedelta(days=1) + user.save() + users.append(user) + institution_users[institution] = users + + if enable_benchmarking: + users_creation_time = time.time() + self.logger.info(f"Time taken to create users: {users_creation_time - users_creation_start_time:.2f} seconds") + + # Create projects, registrations, and preprints for each user + if enable_benchmarking: + objects_creation_start_time = time.time() + for institution in institutions: + users = institution_users[institution] + for user in users: + for _ in range(objects_per_user): + self._create_affiliated_project(institution, is_public=True, created=self._now, creator=user) + self._create_affiliated_project(institution, is_public=False, created=self._now, creator=user) + self._create_affiliated_registration(institution, is_public=True, created=self._now, creator=user) + self._create_affiliated_registration(institution, is_public=False, created=self._now, creator=user) + self._create_affiliated_preprint(institution, is_public=True, created=self._now, creator=user) + + if enable_benchmarking: + objects_creation_time = time.time() + self.logger.info( + f"Time taken to create objects: {objects_creation_time - objects_creation_start_time:.2f} seconds") + data_creation_end_time = time.time() + self.logger.info( + f"Total time taken to create data: 
{data_creation_end_time - data_creation_start_time:.2f} seconds") + + # Run the reporter + if enable_benchmarking: + reporter_start_time = time.time() + reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) + reports = list(reporter.report()) + assert len(reports) == additional_institution_count + 1 + + if enable_benchmarking: + reporter_end_time = time.time() + self.logger.info(f"Time taken to run the reporter: {reporter_end_time - reporter_start_time:.2f} seconds") + total_end_time = time.time() + self.logger.info(f"Total test execution time: {total_end_time - total_start_time:.2f} seconds") + + self.assertEqual(len(reports), additional_institution_count + 1) + + # Validate counts for each institution + expected_count = users_per_institution * objects_per_user + for report in reports: + self.assertEqual(report.public_project_count, expected_count) + self.assertEqual(report.private_project_count, expected_count) + self.assertEqual(report.public_registration_count, expected_count) + self.assertEqual(report.embargoed_registration_count, expected_count) + self.assertEqual(report.published_preprint_count, expected_count) + self.assertEqual(report.user_count, users_per_institution) + self.assertEqual(report.monthly_logged_in_user_count, users_per_institution) diff --git a/osf_tests/metrics/reporters/test_institutional_users_reporter.py b/osf_tests/metrics/reporters/test_institutional_users_reporter.py new file mode 100644 index 00000000000..876fd08cf9b --- /dev/null +++ b/osf_tests/metrics/reporters/test_institutional_users_reporter.py @@ -0,0 +1,262 @@ +from __future__ import annotations +import dataclasses +import datetime +import unittest + +from django.test import TestCase + +from api_tests.utils import create_test_file +from osf import models as osfdb +from osf.metrics.reports import InstitutionalUserReport +from osf.metrics.reporters import InstitutionalUsersReporter +from osf.metrics.utils import YearMonth +from osf_tests.factories import ( + InstitutionFactory, + PreprintFactory, + ProjectFactory, + RegistrationFactory, + UserFactory, + EmbargoFactory, +) + + +def _patch_now(fakenow: datetime.datetime): + return unittest.mock.patch('django.utils.timezone.now', return_value=fakenow) + + +class TestInstiUsersReporter(TestCase): + @classmethod + def setUpTestData(cls): + cls._yearmonth = YearMonth(2012, 7) + cls._now = datetime.datetime( + cls._yearmonth.year, + cls._yearmonth.month, + 13, # just some day in the month + tzinfo=datetime.UTC, + ) + with _patch_now(cls._now): + cls._institution = InstitutionFactory() + cls._user_setup_with_nothing = _InstiUserSetup(0, 0, 0, 0, 0, cls._institution, cls._now) + cls._user_setup_with_ones = _InstiUserSetup(1, 1, 1, 1, 1, cls._institution, cls._now) + cls._user_setup_with_stuff = _InstiUserSetup( + 2, 3, 5, 3, 2, cls._institution, cls._now, + orcid_id='1111-2222-3333-4444', + department_name='blargl studies', + ) + cls._user_setup_with_stuff.fill_uncounted_objects() + + def _assert_report_matches_setup(self, report: InstitutionalUserReport, setup: _InstiUserSetup): + self.assertEqual(report.institution_id, setup.institution._id) + # user info: + self.assertEqual(report.user_id, setup.user._id) + self.assertEqual(report.user_name, setup.user.fullname) + self.assertEqual(report.department_name, setup.department_name) + self.assertEqual(report.month_last_login, YearMonth.from_date(setup.user.date_last_login)) + if setup.month_last_active: + self.assertEqual(report.month_last_active, YearMonth.from_date(setup.month_last_active)) + else: + 
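# month_last_active stays None for users with no node or preprint logs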
self.assertEqual(report.month_last_active, setup.month_last_active) + + self.assertEqual(report.account_creation_date, YearMonth.from_date(setup.user.created)) + self.assertEqual(report.orcid_id, setup.orcid_id) + # counts (NOTE: report.public_file_count and report.storage_byte_count tested separately) + self.assertEqual(report.public_project_count, setup.public_project_count) + self.assertEqual(report.private_project_count, setup.private_project_count) + self.assertEqual(report.public_registration_count, setup.public_registration_count) + self.assertEqual(report.embargoed_registration_count, setup.embargoed_registration_count) + self.assertEqual(report.published_preprint_count, setup.published_preprint_count) + + def test_no_users(self): + _actual_reports = list(InstitutionalUsersReporter(self._yearmonth).report()) + self.assertEqual(_actual_reports, []) + + def test_one_user_with_nothing(self): + self._user_setup_with_nothing.affiliate_user() + _reports = list(InstitutionalUsersReporter(self._yearmonth).report()) + self.assertEqual(len(_reports), 1) + self._assert_report_matches_setup(_reports[0], self._user_setup_with_nothing) + + def test_one_user_with_ones(self): + self._user_setup_with_ones.affiliate_user() + _reports = list(InstitutionalUsersReporter(self._yearmonth).report()) + self.assertEqual(len(_reports), 1) + self._assert_report_matches_setup(_reports[0], self._user_setup_with_ones) + + def test_one_user_with_stuff_and_no_files(self): + self._user_setup_with_stuff.affiliate_user() + _reports = list(InstitutionalUsersReporter(self._yearmonth).report()) + self.assertEqual(len(_reports), 1) + self._assert_report_matches_setup(_reports[0], self._user_setup_with_stuff) + self.assertEqual(_reports[0].public_file_count, 2) # preprint 2 files + self.assertEqual(_reports[0].storage_byte_count, 2674) # preprint bytes + + def test_one_user_with_stuff_and_a_file(self): + self._user_setup_with_stuff.affiliate_user() + _user = self._user_setup_with_stuff.user + _project = _user.nodes.first() + with _patch_now(self._now): + create_test_file(target=_project, user=_user, size=37) + (_report,) = InstitutionalUsersReporter(self._yearmonth).report() + self._assert_report_matches_setup(_report, self._user_setup_with_stuff) + self.assertEqual(_report.public_file_count, 3) # 2 preprint files + self.assertEqual(_report.storage_byte_count, 2711) # 2 preprint files + + def test_one_user_with_stuff_and_multiple_files(self): + self._user_setup_with_stuff.affiliate_user() + _user = self._user_setup_with_stuff.user + _project = _user.nodes.first() + with _patch_now(self._now): + create_test_file(target=_project, user=_user, size=37, filename='b') + create_test_file(target=_project, user=_user, size=73, filename='bl') + _component = ProjectFactory(parent=_project, creator=_user, is_public=True) + _component.affiliated_institutions.add(self._institution) + create_test_file(target=_component, user=_user, size=53, filename='bla') + create_test_file(target=_component, user=_user, size=51, filename='blar') + create_test_file(target=_component, user=_user, size=47, filename='blarg') + (_report,) = InstitutionalUsersReporter(self._yearmonth).report() + self._assert_report_matches_setup(_report, self._user_setup_with_stuff) + self.assertEqual(_report.public_file_count, 7) # 2 preprint files + self.assertEqual(_report.storage_byte_count, 2935) # 2 preprint files + 37 + 73 + 53 + 51 + 47 + + def test_several_users(self): + _setups = [ + self._user_setup_with_nothing, + self._user_setup_with_ones, + 
self._user_setup_with_stuff, + ] + for _setup in _setups: + _setup.affiliate_user() + _setup_by_userid = { + _setup.user._id: _setup + for _setup in _setups + } + _reports = list(InstitutionalUsersReporter(self._yearmonth).report()) + self.assertEqual(len(_reports), len(_setup_by_userid)) + for _actual_report in _reports: + _setup = _setup_by_userid[_actual_report.user_id] + self._assert_report_matches_setup(_actual_report, _setup) + + +@dataclasses.dataclass +class _InstiUserSetup: + '''helper class to simplify database setup for a test-case + + (note: public_file_count and storage_byte_count set up separately) + ''' + public_project_count: int + private_project_count: int + public_registration_count: int + embargoed_registration_count: int + published_preprint_count: int + institution: osfdb.Institution + now: datetime.datetime + department_name: str | None = None + orcid_id: str | None = None + user: osfdb.OSFUser = dataclasses.field(init=False) + month_last_active: datetime.datetime | None = dataclasses.field(init=False) + + def __post_init__(self): + self.user = UserFactory( + date_last_login=self.now, + external_identity=( + {'ORCID': {self.orcid_id: 'VERIFIED'}} + if self.orcid_id + else {} + ), + ) + self._add_affiliations(self._generate_counted_objects()) + node_logs = self.user.logs.order_by('-created') + preprint_logs = self.user.preprint_logs.order_by('-created') + + dates = filter(bool, [ + node_logs.values_list('created', flat=True).first(), + preprint_logs.values_list('created', flat=True).first(), + ]) + + self.month_last_active = max(dates, default=None) + + def affiliate_user(self): + self.user.add_or_update_affiliated_institution( + self.institution, + sso_department=self.department_name, + ) + + @property + def future_timestamp(self): + return self.now + datetime.timedelta(days=123) + + def fill_uncounted_objects(self): + # uncounted because not affiliated: + self._add_public_project() + self._add_private_project() + self._add_public_registration() + self._add_embargoed_registration() + self._add_published_preprint() + # uncounted because affiliated with another institution: + self._add_affiliations(( + self._add_public_project(), + self._add_private_project(), + self._add_public_registration(), + self._add_embargoed_registration(), + self._add_published_preprint(), + ), institution=InstitutionFactory()) + # uncounted because created after the report's time range: + with _patch_now(self.future_timestamp): + self._add_affiliations(( + self._add_public_project(), + self._add_private_project(), + self._add_public_registration(), + self._add_embargoed_registration(), + self._add_published_preprint(), + )) + + def _add_affiliations(self, objs, institution=None): + for _obj in objs: + if _obj is not None: + _obj.affiliated_institutions.add(institution or self.institution) + + def _generate_counted_objects(self): + for _ in range(self.public_project_count): + yield self._add_public_project() + for _ in range(self.private_project_count): + yield self._add_private_project() + for _ in range(self.public_registration_count): + yield self._add_public_registration() + for _ in range(self.embargoed_registration_count): + yield self._add_embargoed_registration() + for _ in range(self.published_preprint_count): + yield self._add_published_preprint() + + def _add_public_project(self) -> osfdb.Node: + return ProjectFactory( + creator=self.user, + is_public=True, + ) + + def _add_private_project(self) -> osfdb.Node: + return ProjectFactory( + creator=self.user, + is_public=False, + ) + + 
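# registration helpers: public ones count toward public_registration_count;
# embargoed ones are private with an embargo ending in the future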
def _add_public_registration(self) -> osfdb.Registration: + return RegistrationFactory( + creator=self.user, + is_public=True, + ) + + def _add_embargoed_registration(self) -> osfdb.Registration: + return RegistrationFactory( + creator=self.user, + is_public=False, + embargo=EmbargoFactory( + user=self.user, + end_date=self.future_timestamp, + ), + ) + + def _add_published_preprint(self) -> osfdb.Preprint | None: + return PreprintFactory( + creator=self.user, + is_public=True, + ) diff --git a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py new file mode 100644 index 00000000000..454b8d6700d --- /dev/null +++ b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py @@ -0,0 +1,238 @@ +from datetime import timedelta +from operator import attrgetter +from unittest import mock + +import pytest + +from osf.metrics.counted_usage import CountedAuthUsage +from osf.metrics.reporters.public_item_usage import PublicItemUsageReporter +from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.utils import YearMonth + + +@pytest.mark.es_metrics +class TestPublicItemUsageReporter: + @pytest.fixture(autouse=True) + def _mocks(self): + with ( + # set a tiny page size to force aggregation pagination: + mock.patch('osf.metrics.reporters.public_item_usage._CHUNK_SIZE', 1), + # HACK: skip auto-filling fields from the database: + mock.patch('osf.models.base.Guid.load', return_value=None), + ): + yield + + @pytest.fixture + def ym_empty(self) -> YearMonth: + return YearMonth(2012, 7) + + @pytest.fixture + def ym_sparse(self) -> YearMonth: + return YearMonth(2017, 7) + + @pytest.fixture + def ym_busy(self) -> YearMonth: + return YearMonth(2023, 7) + + @pytest.fixture + def sparse_month_usage(self, ym_sparse): + # "sparse" month: + # item0: 3 views, 0 downloads, 2 sessions + # item1: 1 views, 1 download, 1 session (plus 1 view from child item2) + # item2: 1 views, 0 downloads, 1 session + _month_start = ym_sparse.month_start() + _save_usage( + timestamp=_month_start, + item_guid='item0', + session_id='sesh0', + action_labels=['view'], + ) + _save_usage( + timestamp=_month_start + timedelta(minutes=2), + item_guid='item0', + session_id='sesh0', + action_labels=['view'], + ) + _save_usage( + timestamp=_month_start + timedelta(minutes=3), + item_guid='item1', + session_id='sesh0', + action_labels=['download'], + ) + _save_usage( + timestamp=_month_start + timedelta(days=17), + item_guid='item0', + session_id='sesh1', + action_labels=['view'], + ) + _save_usage( + timestamp=_month_start + timedelta(days=17, minutes=3), + item_guid='item1', + session_id='sesh1', + action_labels=['view'], + ) + _save_usage( + timestamp=_month_start + timedelta(days=17, minutes=5), + item_guid='item2', + surrounding_guids=['item1'], + session_id='sesh1', + action_labels=['view'], + ) + _save_usage( + timestamp=_month_start + timedelta(days=17, minutes=11), + item_guid='item2', + surrounding_guids=['item1'], + session_id='sesh1', + action_labels=['download'], + ) + + @pytest.fixture + def busy_month_item0(self, ym_busy): + # item0: 4 sessions, 4*7 views, 4*5 downloads + _month_start = ym_busy.month_start() + for _sesh in range(0, 4): + _sesh_start = _month_start + timedelta(days=_sesh) + for _minute in range(0, 7): + _save_usage( + timestamp=_sesh_start + timedelta(minutes=_minute), + item_guid='item0', + session_id=f'sesh0{_sesh}', + action_labels=['view'], + ) + for _minute in range(10, 15): + _save_usage( + timestamp=_sesh_start + 
timedelta(minutes=_minute), + item_guid='item0', + session_id=f'sesh0{_sesh}', + action_labels=['download'], + ) + + @pytest.fixture + def busy_month_item1(self, ym_busy): + # item1: 10 sessions, 6*9 views, 5*7 downloads, 2 providers + # (plus 11 views in 11 sessions from child item2) + _month_start = ym_busy.month_start() + for _sesh in range(0, 6): + _sesh_start = _month_start + timedelta(days=_sesh) + for _minute in range(0, 9): + _save_usage( + timestamp=_sesh_start + timedelta(minutes=_minute), + item_guid='item1', + session_id=f'sesh1{_sesh}', + action_labels=['view'], + ) + for _sesh in range(5, 10): + _sesh_start = _month_start + timedelta(days=_sesh) + for _minute in range(10, 17): + _save_usage( + timestamp=_sesh_start + timedelta(minutes=_minute), + item_guid='item1', + session_id=f'sesh1{_sesh}', + action_labels=['download'], + provider_id='prov1', # additional provider_id + ) + + @pytest.fixture + def busy_month_item2(self, ym_busy): + # item2: 11 sessions, 11 views, 11 downloads (child of item1) + _month_start = ym_busy.month_start() + for _sesh in range(1, 12): + _save_usage( + timestamp=_month_start + timedelta(days=_sesh), + item_guid='item2', + surrounding_guids=['item1'], + session_id=f'sesh2{_sesh}', + action_labels=['view'], + ) + _save_usage( + timestamp=_month_start + timedelta(days=_sesh, hours=_sesh), + item_guid='item2', + surrounding_guids=['item1'], + session_id=f'sesh2{_sesh}', + action_labels=['download'], + ) + + def test_no_data(self, ym_empty): + _reporter = PublicItemUsageReporter(ym_empty) + _empty = list(_reporter.report()) + assert _empty == [] + + def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_month_item0, busy_month_item1, busy_month_item2): + _empty = list(PublicItemUsageReporter(ym_empty).report()) + _sparse = list(PublicItemUsageReporter(ym_sparse).report()) + _busy = list(PublicItemUsageReporter(ym_busy).report()) + + # empty month: + assert _empty == [] + + # sparse month: + assert len(_sparse) == 3 + _sparse_item0, _sparse_item1, _sparse_item2 = sorted(_sparse, key=attrgetter('item_osfid')) + # sparse-month item0 + assert isinstance(_sparse_item0, PublicItemUsageReport) + assert _sparse_item0.item_osfid == 'item0' + assert _sparse_item0.provider_id == ['prov0'] + assert _sparse_item0.platform_iri == ['http://osf.example'] + assert _sparse_item0.view_count == 3 + assert _sparse_item0.view_session_count == 2 + assert _sparse_item0.download_count == 0 + assert _sparse_item0.download_session_count == 0 + # sparse-month item1 + assert isinstance(_sparse_item1, PublicItemUsageReport) + assert _sparse_item1.item_osfid == 'item1' + assert _sparse_item1.provider_id == ['prov0'] + assert _sparse_item1.platform_iri == ['http://osf.example'] + assert _sparse_item1.view_count == 2 # including item2 + assert _sparse_item1.view_session_count == 1 # including item2 + assert _sparse_item1.download_count == 1 # NOT including item2 + assert _sparse_item1.download_session_count == 1 # NOT including item2 + # sparse-month item2 + assert isinstance(_sparse_item1, PublicItemUsageReport) + assert _sparse_item2.item_osfid == 'item2' + assert _sparse_item2.provider_id == ['prov0'] + assert _sparse_item2.platform_iri == ['http://osf.example'] + assert _sparse_item2.view_count == 1 + assert _sparse_item2.view_session_count == 1 + assert _sparse_item2.download_count == 1 + assert _sparse_item2.download_session_count == 1 + + # busy month: + assert len(_busy) == 3 + _busy_item0, _busy_item1, _busy_item2 = sorted(_busy, 
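# reporter output order isn't guaranteed, so sort by osfid before unpacking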
key=attrgetter('item_osfid')) + # busy-month item0 + assert isinstance(_busy_item0, PublicItemUsageReport) + assert _busy_item0.item_osfid == 'item0' + assert _busy_item0.provider_id == ['prov0'] + assert _busy_item0.platform_iri == ['http://osf.example'] + assert _busy_item0.view_count == 4 * 7 + assert _busy_item0.view_session_count == 4 + assert _busy_item0.download_count == 4 * 5 + assert _busy_item0.download_session_count == 4 + # busy-month item1 + assert isinstance(_busy_item1, PublicItemUsageReport) + assert _busy_item1.item_osfid == 'item1' + assert _busy_item1.provider_id == ['prov0', 'prov1'] + assert _busy_item1.platform_iri == ['http://osf.example'] + assert _busy_item1.view_count == 6 * 9 + 11 + assert _busy_item1.view_session_count == 6 + 11 + assert _busy_item1.download_count == 5 * 7 + assert _busy_item1.download_session_count == 5 + # busy-month item2 + assert isinstance(_busy_item2, PublicItemUsageReport) + assert _busy_item2.item_osfid == 'item2' + assert _busy_item2.provider_id == ['prov0'] + assert _busy_item2.platform_iri == ['http://osf.example'] + assert _busy_item2.view_count == 11 + assert _busy_item2.view_session_count == 11 + assert _busy_item2.download_count == 11 + assert _busy_item2.download_session_count == 11 + + +def _save_usage(**kwargs): + _kwargs = { # overridable defaults: + 'platform_iri': 'http://osf.example', + 'item_public': True, + 'provider_id': 'prov0', + **kwargs, + } + CountedAuthUsage(**_kwargs).save(refresh=True) diff --git a/osf_tests/metrics/test_daily_report.py b/osf_tests/metrics/test_daily_report.py index 2089e7279c9..3840f5dba21 100644 --- a/osf_tests/metrics/test_daily_report.py +++ b/osf_tests/metrics/test_daily_report.py @@ -37,11 +37,11 @@ class Meta: assert report.meta.id == expected_key mock_save.reset_mock() - def test_with_duf(self, mock_save): + def test_with_unique_together(self, mock_save): # multiple reports of this type per day, unique by given field class UniqueByDateAndField(DailyReport): - DAILY_UNIQUE_FIELD = 'duf' - duf = metrics.Keyword() + UNIQUE_TOGETHER_FIELDS = ('report_date', 'uniquefield',) + uniquefield = metrics.Keyword() class Meta: app_label = 'osf' @@ -49,7 +49,7 @@ class Meta: today = date(2022, 5, 18) expected_blah = 'dca57e6cde89b19274ea24bc713971dab137a896b8e06d43a11a3f437cd1d151' - blah_report = UniqueByDateAndField(report_date=today, duf='blah') + blah_report = UniqueByDateAndField(report_date=today, uniquefield='blah') blah_report.save() assert mock_save.call_count == 1 assert mock_save.call_args[0][0] is blah_report @@ -57,13 +57,16 @@ class Meta: mock_save.reset_mock() expected_fleh = 'e7dd5ff6b087807efcfa958077dc713878f21c65af79b3ccdb5dc2409bf5ad99' - fleh_report = UniqueByDateAndField(report_date=today, duf='fleh') + fleh_report = UniqueByDateAndField(report_date=today, uniquefield='fleh') fleh_report.save() assert mock_save.call_count == 1 assert mock_save.call_args[0][0] is fleh_report assert fleh_report.meta.id == expected_fleh mock_save.reset_mock() - bad_report = UniqueByDateAndField(report_date=today) - with pytest.raises(ReportInvalid): - bad_report.save() + for _bad_report in ( + UniqueByDateAndField(report_date=today), + UniqueByDateAndField(report_date=today, uniquefield=['list', 'of', 'things']), + ): + with pytest.raises(ReportInvalid): + _bad_report.save() diff --git a/osf_tests/metrics/test_monthly_report.py b/osf_tests/metrics/test_monthly_report.py new file mode 100644 index 00000000000..23546eb1fb3 --- /dev/null +++ b/osf_tests/metrics/test_monthly_report.py @@ -0,0 +1,146 @@ 
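The expected report keys in these tests are stable sha-256 hex digests. A hedged sketch of how such a deterministic document id might be derived from a report's UNIQUE_TOGETHER_FIELDS values — the hashing scheme and field encoding here are illustrative assumptions, not the actual osf.metrics implementation:

import hashlib

def stable_report_key(report_type_name: str, *unique_values: str) -> str:
    # join the report type with its unique-together values and hash them,
    # yielding the same elasticsearch doc id for the same logical report
    _blob = '|'.join([report_type_name, *unique_values])
    return hashlib.sha256(_blob.encode()).hexdigest()

# e.g. repeated saves of UniqueByMonthAndField(report_yearmonth=..., uniquefield='blah')
# would map onto a single document id:
# stable_report_key('UniqueByMonthAndField', '2022-05', 'blah')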
+import datetime +from unittest import mock + +import pytest +from elasticsearch_metrics import metrics + +from osf.metrics.reports import MonthlyReport, ReportInvalid, PublicItemUsageReport +from osf.metrics.utils import YearMonth + + +class TestMonthlyReportKey: + @pytest.fixture + def mock_save(self): + with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: + yield mock_save + + def test_default(self, mock_save): + # only one of this type of report per month + class UniqueByMonth(MonthlyReport): + blah = metrics.Keyword() + + class Meta: + app_label = 'osf' + + yearmonth = YearMonth(2022, 5) + + reports = [ + UniqueByMonth(report_yearmonth=yearmonth), + UniqueByMonth(report_yearmonth=yearmonth, blah='blah'), + UniqueByMonth(report_yearmonth=yearmonth, blah='fleh'), + ] + expected_key = '8463aac67c1e5a038049196781d8f100f069225352d1829651892cf3fbfc50e2' + + for report in reports: + report.save() + assert mock_save.call_count == 1 + assert mock_save.call_args[0][0] is report + assert report.meta.id == expected_key + mock_save.reset_mock() + + def test_with_unique_together(self, mock_save): + # multiple reports of this type per day, unique by given field + class UniqueByMonthAndField(MonthlyReport): + UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'uniquefield',) + uniquefield = metrics.Keyword() + + class Meta: + app_label = 'osf' + + yearmonth = YearMonth(2022, 5) + + expected_blah = '62ebf38317cd8402e27a50ce99f836d1734b3f545adf7d144d0e1cf37a0d9d08' + blah_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='blah') + blah_report.save() + assert mock_save.call_count == 1 + assert mock_save.call_args[0][0] is blah_report + assert blah_report.meta.id == expected_blah + mock_save.reset_mock() + + expected_fleh = '385700db282f6d6089a0d21836db5ee8423f548615e515b6e034bcc90a14500f' + fleh_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='fleh') + fleh_report.save() + assert mock_save.call_count == 1 + assert mock_save.call_args[0][0] is fleh_report + assert fleh_report.meta.id == expected_fleh + mock_save.reset_mock() + + for _bad_report in ( + UniqueByMonthAndField(report_yearmonth=yearmonth), + UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield=['list']), + ): + with pytest.raises(ReportInvalid): + _bad_report.save() + + +@pytest.mark.es_metrics +class TestLastMonthReport: + @pytest.fixture + def osfid(self): + return 'abced' + + @pytest.fixture + def this_month(self): + return YearMonth.from_date(datetime.date.today()) + + @pytest.fixture + def last_month(self, this_month): + return _prior_yearmonth(this_month) + + @pytest.fixture + def two_months_back(self, last_month): + return _prior_yearmonth(last_month) + + @pytest.fixture + def three_months_back(self, two_months_back): + return _prior_yearmonth(two_months_back) + + @pytest.fixture + def this_month_report(self, osfid, this_month): + return _item_usage_report(this_month, osfid, view_count=77) + + @pytest.fixture + def last_month_report(self, osfid, last_month): + return _item_usage_report(last_month, osfid, view_count=57) + + @pytest.fixture + def diff_last_month_report(self, last_month): + return _item_usage_report(last_month, 'zyxvt', view_count=17) + + @pytest.fixture + def two_months_back_report(self, osfid, two_months_back): + return _item_usage_report(two_months_back, osfid, view_count=27) + + @pytest.fixture + def three_months_back_report(self, osfid, three_months_back): + return _item_usage_report(three_months_back, osfid, view_count=37) + + def 
test_with_none(self, osfid): + assert PublicItemUsageReport.for_last_month(osfid) is None + + def test_with_others(self, osfid, this_month_report, three_months_back_report, diff_last_month_report): + assert PublicItemUsageReport.for_last_month(osfid) is None + + def test_with_prior_month(self, osfid, this_month_report, two_months_back_report, three_months_back_report, diff_last_month_report): + assert PublicItemUsageReport.for_last_month(osfid) == two_months_back_report + + def test_with_last_month(self, osfid, this_month_report, last_month_report, two_months_back_report, three_months_back_report, diff_last_month_report): + assert PublicItemUsageReport.for_last_month(osfid) == last_month_report + + +def _prior_yearmonth(ym: YearMonth) -> YearMonth: + return ( + YearMonth(ym.year - 1, 1) + if ym.month == 1 + else YearMonth(ym.year, ym.month - 1) + ) + + +def _item_usage_report(ym: YearMonth, osfid: str, **kwargs): + _report = PublicItemUsageReport( + report_yearmonth=ym, + item_osfid=osfid, + **kwargs + ) + _report.save(refresh=True) + return _report diff --git a/osf_tests/metrics/test_yearmonth.txt b/osf_tests/metrics/test_yearmonth.txt new file mode 100644 index 00000000000..646c73c42f9 --- /dev/null +++ b/osf_tests/metrics/test_yearmonth.txt @@ -0,0 +1,48 @@ +YearMonth tests +(doctest-style, in a way pytest will run; see https://docs.pytest.org/en/stable/how-to/doctest.html ) +>>> from osf.metrics.utils import YearMonth + +basic dataclass behavior: +>>> YearMonth(2000, 2) +YearMonth(year=2000, month=2) +>>> YearMonth(1999, 9) +YearMonth(year=1999, month=9) +>>> ym = YearMonth(2050, 2) +>>> ym.year +2050 +>>> ym.month +2 + +`from_date` constructor, accepts either `datetime.date` or `datetime.datetime`: +>>> import datetime +>>> YearMonth.from_date(datetime.date(1973, 1, 1)) +YearMonth(year=1973, month=1) +>>> YearMonth.from_date(datetime.datetime(1974, 3, 2)) +YearMonth(year=1974, month=3) + +`from_str` constructor, accepts "YYYY-MM" format: +>>> YearMonth.from_str('2000-12') +YearMonth(year=2000, month=12) + +`__str__` method gives "YYYY-MM" format: +>>> str(YearMonth(1491, 7)) +'1491-07' + +`next` method gives the next year-month: +>>> ym = YearMonth(1491, 11) +>>> ym.next() +YearMonth(year=1491, month=12) +>>> ym.next().next() +YearMonth(year=1492, month=1) + +`month_start` method: +>>> YearMonth(3333, 3).month_start() +datetime.datetime(3333, 3, 1, 0, 0, tzinfo=datetime.timezone.utc) +>>> YearMonth(1999, 12).month_start().isoformat() +'1999-12-01T00:00:00+00:00' + +`month_end` method: +>>> YearMonth(3333, 3).month_end() +datetime.datetime(3333, 4, 1, 0, 0, tzinfo=datetime.timezone.utc) +>>> YearMonth(1999, 12).month_end().isoformat() +'2000-01-01T00:00:00+00:00' diff --git a/osf_tests/test_management_commands.py b/osf_tests/test_management_commands.py index 8f29e72bc93..26e34601648 100644 --- a/osf_tests/test_management_commands.py +++ b/osf_tests/test_management_commands.py @@ -265,7 +265,7 @@ def test_data_storage_usage_command(self): assert (key, expected_summary_data[key]) == (key, actual_summary_data[key]) -@pytest.mark.es +@pytest.mark.es_metrics @pytest.mark.django_db class TestInstitutionMetricsUpdate: From d8e34ab08913b63fa32de79923b9e07b8cbf4199 Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Thu, 14 Nov 2024 13:51:43 -0500 Subject: [PATCH 12/35] Update CHANGELOG, bump version --- CHANGELOG | 5 +++++ package.json | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 94705b38a79..32a02066ce0 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ 
-2,6 +2,11 @@ We follow the CalVer (https://calver.org/) versioning scheme: YY.MINOR.MICRO. +24.09.0 (2024-11-14) +==================== + +- Institutional Dashboard Project BE Release + 24.08.0 (2024-10-30) ==================== diff --git a/package.json b/package.json index 8b0edd12961..7fcf0590044 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "OSF", - "version": "24.08.0", + "version": "24.09.0", "description": "Facilitating Open Science", "repository": "https://github.com/CenterForOpenScience/osf.io", "author": "Center for Open Science", From 30b2df79de142490bd796d5fa755351e8c4ff8be Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 14 Nov 2024 13:54:52 -0500 Subject: [PATCH 13/35] skip deleted/private items in recatalog by default add `--also-decatalog` param to decatalog all deleted/private items --- osf/management/commands/recatalog_metadata.py | 18 ++++++ .../test_recatalog_metadata.py | 60 +++++++++++++++---- 2 files changed, 68 insertions(+), 10 deletions(-) diff --git a/osf/management/commands/recatalog_metadata.py b/osf/management/commands/recatalog_metadata.py index 43c647e5861..d46face0ba6 100644 --- a/osf/management/commands/recatalog_metadata.py +++ b/osf/management/commands/recatalog_metadata.py @@ -144,6 +144,11 @@ def add_arguments(self, parser): default=int(9e9), help='maximum number of chunks (default all/enough/lots)', ) + parser.add_argument( + '--also-decatalog', + action='store_true', + help='also remove private and deleted items from the catalog', + ) def handle(self, *args, **options): pls_all_types = options['all_types'] @@ -157,6 +162,7 @@ def handle(self, *args, **options): chunk_size = options['chunk_size'] chunk_count = options['chunk_count'] datacite_custom_types = options['datacite_custom_types'] + also_decatalog = options['also_decatalog'] if datacite_custom_types: # temporary arg for datacite 4.5 migration assert not start_id, 'oh no, cannot resume with `--datacite-custom-types`' @@ -185,4 +191,16 @@ def handle(self, *args, **options): _queryset = _queryset.filter( provider__in=AbstractProvider.objects.filter(_id__in=provider_ids), ) + if not also_decatalog: + if provided_model is OsfStorageFile: + _queryset = _queryset.filter(deleted__isnull=True) + elif provided_model is OSFUser: + _queryset = _queryset.filter( + deleted__isnull=True, + is_active=True, + ).exclude(allow_indexing=False) + elif provided_model is Preprint: + _queryset = _queryset.filter(is_public=True, is_published=True, deleted__isnull=True) + else: + _queryset = _queryset.filter(is_public=True, deleted__isnull=True) recatalog(_queryset, start_id, chunk_count, chunk_size) diff --git a/osf_tests/management_commands/test_recatalog_metadata.py b/osf_tests/management_commands/test_recatalog_metadata.py index 85742b76094..4354a54048e 100644 --- a/osf_tests/management_commands/test_recatalog_metadata.py +++ b/osf_tests/management_commands/test_recatalog_metadata.py @@ -1,3 +1,4 @@ +import datetime import pytest from unittest import mock from operator import attrgetter @@ -12,6 +13,7 @@ ProjectFactory, RegistrationProviderFactory, RegistrationFactory, + UserFactory, ) @@ -41,18 +43,15 @@ def registration_provider(self): @pytest.fixture def registrations(self, registration_provider): return sorted_by_id([ - RegistrationFactory(provider=registration_provider) + RegistrationFactory(provider=registration_provider, is_public=True) for _ in range(7) ]) @pytest.fixture def projects(self, registrations): return sorted_by_id([ - ProjectFactory() + 
ProjectFactory(is_public=True) for _ in range(7) - ] + [ - registration.registered_from - for registration in registrations ]) @pytest.fixture @@ -93,6 +92,23 @@ def items_with_custom_datacite_type(self, preprints, registrations, projects, fi *_nonpreprint_sample, } + @pytest.fixture + def decatalog_items(self, registrations): + _user = UserFactory(allow_indexing=False) + _registration = RegistrationFactory(is_public=False, creator=_user) + _implicit_projects = [ + _registration.registered_from, + *(_reg.registered_from for _reg in registrations), + ] + return [ + _user, + _registration, + *_implicit_projects, + PreprintFactory(is_published=False, creator=_user), + ProjectFactory(is_public=False, creator=_user), + ProjectFactory(deleted=datetime.datetime.now(), creator=_user), + ] + def test_recatalog_metadata( self, mock_update_share_task, @@ -104,7 +120,14 @@ def test_recatalog_metadata( files, users, items_with_custom_datacite_type, + decatalog_items, ): + def _actual_osfids() -> set[str]: + return { + _call[-1]['kwargs']['guid'] + for _call in mock_update_share_task.apply_async.mock_calls + } + # test preprints call_command( 'recatalog_metadata', @@ -189,11 +212,28 @@ def test_recatalog_metadata( '--datacite-custom-types', ) _expected_osfids = set(_iter_osfids(items_with_custom_datacite_type)) - _actual_osfids = { - _call[-1]['kwargs']['guid'] - for _call in mock_update_share_task.apply_async.mock_calls - } - assert _expected_osfids == _actual_osfids + assert _expected_osfids == _actual_osfids() + + mock_update_share_task.reset_mock() + + # all types + _all_public_items = [*preprints, *registrations, *projects, *files, *users] + call_command( + 'recatalog_metadata', + '--all-types', + ) + _expected_osfids = set(_iter_osfids(_all_public_items)) + assert _expected_osfids == _actual_osfids() + + # also decatalog private/deleted items + _all_items = [*_all_public_items, *decatalog_items] + call_command( + 'recatalog_metadata', + '--all-types', + '--also-decatalog', + ) + _expected_osfids = set(_iter_osfids(_all_items)) + assert _expected_osfids == _actual_osfids() ### From a57467fc2bf328311107546d3ac10f074e11229e Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 14 Nov 2024 13:57:14 -0500 Subject: [PATCH 14/35] remove 'temporary arg' --- osf/management/commands/recatalog_metadata.py | 30 ------------------- .../test_recatalog_metadata.py | 28 ----------------- 2 files changed, 58 deletions(-) diff --git a/osf/management/commands/recatalog_metadata.py b/osf/management/commands/recatalog_metadata.py index d46face0ba6..be52e9b0a0e 100644 --- a/osf/management/commands/recatalog_metadata.py +++ b/osf/management/commands/recatalog_metadata.py @@ -55,21 +55,6 @@ def _recatalog_all(queryset, chunk_size): recatalog(queryset, start_id=0, chunk_count=int(9e9), chunk_size=chunk_size) -def _recatalog_datacite_custom_types(chunk_size): - logger.info('recataloguing items with datacite custom type...') - # all preprints - _recatalog_all(Preprint.objects, chunk_size) - # objects with custom resource_type_general - for _model in {Registration, Node, OsfStorageFile}: - _queryset = ( - _model.objects - .exclude(guids__metadata_record__isnull=True) - .exclude(guids__metadata_record__resource_type_general='') - ) - _recatalog_all(_queryset, chunk_size) - logger.info('done recataloguing items with datacite custom type!') - - class Command(BaseCommand): def add_arguments(self, parser): type_group = parser.add_mutually_exclusive_group(required=True) @@ -103,14 +88,6 @@ def add_arguments(self, parser): 
action='store_true', help='recatalog metadata for users', ) - type_group.add_argument( - '--datacite-custom-types', - action='store_true', - help='''recatalog metadata for items with a specific datacite type, - including all preprints and items with custom resource_type_general - (may be slow for lack of database indexes) - ''', - ) provider_group = parser.add_mutually_exclusive_group() provider_group.add_argument( @@ -161,15 +138,8 @@ def handle(self, *args, **options): start_id = options['start_id'] chunk_size = options['chunk_size'] chunk_count = options['chunk_count'] - datacite_custom_types = options['datacite_custom_types'] also_decatalog = options['also_decatalog'] - if datacite_custom_types: # temporary arg for datacite 4.5 migration - assert not start_id, 'oh no, cannot resume with `--datacite-custom-types`' - assert not provider_ids, 'oh no, cannot filter providers with `--datacite-custom-types`' - _recatalog_datacite_custom_types(chunk_size) - return # end - if pls_all_types: assert not start_id, 'choose a specific type to resume with --start-id' provided_models = [Preprint, Registration, Node, OSFUser, OsfStorageFile] diff --git a/osf_tests/management_commands/test_recatalog_metadata.py b/osf_tests/management_commands/test_recatalog_metadata.py index 4354a54048e..550f06e4d13 100644 --- a/osf_tests/management_commands/test_recatalog_metadata.py +++ b/osf_tests/management_commands/test_recatalog_metadata.py @@ -2,11 +2,9 @@ import pytest from unittest import mock from operator import attrgetter -import random from django.core.management import call_command -from osf.models.metadata import GuidMetadataRecord from osf_tests.factories import ( PreprintProviderFactory, PreprintFactory, @@ -77,21 +75,6 @@ def users(self, preprints, registrations, projects): for preprint in preprints ]))) - @pytest.fixture - def items_with_custom_datacite_type(self, preprints, registrations, projects, files): - _nonpreprint_sample = [ - random.choice(_items) - for _items in (registrations, projects, files) - ] - for _item in _nonpreprint_sample: - _guid_record = GuidMetadataRecord.objects.for_guid(_item) - _guid_record.resource_type_general = 'BookChapter' # datacite resourceTypeGeneral value - _guid_record.save() - return { - *preprints, # every preprint has datacite type "Preprint" - *_nonpreprint_sample, - } - @pytest.fixture def decatalog_items(self, registrations): _user = UserFactory(allow_indexing=False) @@ -119,7 +102,6 @@ def test_recatalog_metadata( projects, files, users, - items_with_custom_datacite_type, decatalog_items, ): def _actual_osfids() -> set[str]: @@ -206,16 +188,6 @@ def _actual_osfids() -> set[str]: mock_update_share_task.reset_mock() - # datacite custom types - call_command( - 'recatalog_metadata', - '--datacite-custom-types', - ) - _expected_osfids = set(_iter_osfids(items_with_custom_datacite_type)) - assert _expected_osfids == _actual_osfids() - - mock_update_share_task.reset_mock() - # all types _all_public_items = [*preprints, *registrations, *projects, *files, *users] call_command( From 292dca292a0c0a5558cd0c4509db145719bd8d59 Mon Sep 17 00:00:00 2001 From: John Tordoff Date: Tue, 5 Nov 2024 10:41:31 -0500 Subject: [PATCH 15/35] [ENG-6364] Migrate Preprint Affilations (#10787) * add management command to migrate preprint affiliations * make sure migrations uses primary instead of replica, improve tests * remove redundant permission --------- Co-authored-by: John Tordoff <> --- .../commands/migrate_preprint_affiliation.py | 113 +++++++++++++++++ 
.../test_migrate_preprint_affiliations.py | 115 ++++++++++++++++++ 2 files changed, 228 insertions(+) create mode 100644 osf/management/commands/migrate_preprint_affiliation.py create mode 100644 osf_tests/management_commands/test_migrate_preprint_affiliations.py diff --git a/osf/management/commands/migrate_preprint_affiliation.py b/osf/management/commands/migrate_preprint_affiliation.py new file mode 100644 index 00000000000..78e7b2786ff --- /dev/null +++ b/osf/management/commands/migrate_preprint_affiliation.py @@ -0,0 +1,113 @@ +import datetime +import logging + +from django.core.management.base import BaseCommand +from django.db import transaction +from django.db.models import F, Exists, OuterRef + +from osf.models import PreprintContributor, InstitutionAffiliation + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + """Assign affiliations from users to preprints where they have write or admin permissions, with optional exclusion by user GUIDs.""" + + help = 'Assign affiliations from users to preprints where they have write or admin permissions.' + + def add_arguments(self, parser): + parser.add_argument( + '--exclude-guids', + nargs='+', + dest='exclude_guids', + help='List of user GUIDs to exclude from affiliation assignment' + ) + parser.add_argument( + '--dry-run', + action='store_true', + dest='dry_run', + help='If true, performs a dry run without making changes' + ) + parser.add_argument( + '--batch-size', + type=int, + default=1000, + dest='batch_size', + help='Number of contributors to process in each batch' + ) + + def handle(self, *args, **options): + start_time = datetime.datetime.now() + logger.info(f'Script started at: {start_time}') + + exclude_guids = set(options.get('exclude_guids') or []) + dry_run = options.get('dry_run', False) + batch_size = options.get('batch_size', 1000) + + if dry_run: + logger.info('Dry run mode activated.') + + processed_count, updated_count = assign_affiliations_to_preprints( + exclude_guids=exclude_guids, + dry_run=dry_run, + batch_size=batch_size + ) + + finish_time = datetime.datetime.now() + logger.info(f'Script finished at: {finish_time}') + logger.info(f'Total processed: {processed_count}, Updated: {updated_count}') + logger.info(f'Total run time: {finish_time - start_time}') + + +def assign_affiliations_to_preprints(exclude_guids=None, dry_run=True, batch_size=1000): + exclude_guids = exclude_guids or set() + processed_count = updated_count = 0 + + # Subquery to check if the user has any affiliated institutions + user_has_affiliations = Exists( + InstitutionAffiliation.objects.filter( + user=OuterRef('user') + ) + ) + + contributors_qs = PreprintContributor.objects.filter( + preprint__preprintgroupobjectpermission__permission__codename__in=['write_preprint'], + preprint__preprintgroupobjectpermission__group__user=F('user'), + ).filter( + user_has_affiliations + ).select_related( + 'user', + 'preprint' + ).exclude( + user__guids___id__in=exclude_guids + ).order_by('pk') # Ensure consistent ordering for batching + + total_contributors = contributors_qs.count() + logger.info(f'Total contributors to process: {total_contributors}') + + # Process contributors in batches + with transaction.atomic(): + for offset in range(0, total_contributors, batch_size): + # Use select_for_update() to ensure query hits the primary database + batch_contributors = contributors_qs[offset:offset + batch_size].select_for_update() + + logger.info(f'Processing contributors {offset + 1} to {min(offset + batch_size, total_contributors)}') + + 
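+            # note: slicing first keeps each locked batch small, and
+            # select_for_update() both routes this read to the primary database
+            # (not a replica) and row-locks the batch until the enclosing
+            # transaction.atomic() block commits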
for contributor in batch_contributors: + user = contributor.user + preprint = contributor.preprint + + user_institutions = user.get_affiliated_institutions() + processed_count += 1 + if not dry_run: + preprint.affiliated_institutions.add(*user_institutions) + updated_count += 1 + logger.info( + f'Assigned {len(user_institutions)} affiliations from user <{user._id}> to preprint <{preprint._id}>.' + ) + else: + logger.info( + f'Dry run: Would assign {len(user_institutions)} affiliations from user <{user._id}> to preprint <{preprint._id}>.' + ) + + return processed_count, updated_count diff --git a/osf_tests/management_commands/test_migrate_preprint_affiliations.py b/osf_tests/management_commands/test_migrate_preprint_affiliations.py new file mode 100644 index 00000000000..701638251f5 --- /dev/null +++ b/osf_tests/management_commands/test_migrate_preprint_affiliations.py @@ -0,0 +1,115 @@ +import pytest +from osf.management.commands.migrate_preprint_affiliation import assign_affiliations_to_preprints +from osf_tests.factories import ( + PreprintFactory, + InstitutionFactory, + AuthUserFactory, +) + + +@pytest.mark.django_db +class TestAssignAffiliationsToPreprints: + + @pytest.fixture() + def institution(self): + return InstitutionFactory() + + @pytest.fixture() + def user_with_affiliation(self, institution): + user = AuthUserFactory() + user.add_or_update_affiliated_institution(institution) + user.save() + return user + + @pytest.fixture() + def user_without_affiliation(self): + return AuthUserFactory() + + @pytest.fixture() + def preprint_with_affiliated_contributor(self, user_with_affiliation): + preprint = PreprintFactory() + preprint.add_contributor( + user_with_affiliation, + permissions='admin', + visible=True + ) + return preprint + + @pytest.fixture() + def preprint_with_non_affiliated_contributor(self, user_without_affiliation): + preprint = PreprintFactory() + preprint.add_contributor( + user_without_affiliation, + permissions='admin', + visible=True + ) + return preprint + + @pytest.mark.parametrize('dry_run', [True, False]) + def test_assign_affiliations_with_affiliated_contributor(self, preprint_with_affiliated_contributor, institution, dry_run): + preprint = preprint_with_affiliated_contributor + preprint.affiliated_institutions.clear() + preprint.save() + + assign_affiliations_to_preprints(dry_run=dry_run) + + if dry_run: + assert not preprint.affiliated_institutions.exists() + else: + assert institution in preprint.affiliated_institutions.all() + + @pytest.mark.parametrize('dry_run', [True, False]) + def test_no_affiliations_for_non_affiliated_contributor(self, preprint_with_non_affiliated_contributor, dry_run): + preprint = preprint_with_non_affiliated_contributor + preprint.affiliated_institutions.clear() + preprint.save() + + assign_affiliations_to_preprints(dry_run=dry_run) + + assert not preprint.affiliated_institutions.exists() + + @pytest.mark.parametrize('dry_run', [True, False]) + def test_exclude_contributor_by_guid(self, preprint_with_affiliated_contributor, user_with_affiliation, institution, dry_run): + preprint = preprint_with_affiliated_contributor + preprint.affiliated_institutions.clear() + preprint.save() + + assert user_with_affiliation.get_affiliated_institutions() + assert user_with_affiliation in preprint.contributors.all() + exclude_guids = {user._id for user in preprint.contributors.all()} + + assign_affiliations_to_preprints(exclude_guids=exclude_guids, dry_run=dry_run) + + assert not preprint.affiliated_institutions.exists() + + 
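+    # note: assign_affiliations_to_preprints filters on the 'write_preprint'
+    # codename alone; admin contributors also hold that permission (hence the
+    # "remove redundant permission" note in this commit), while read-only
+    # contributors do not, as the test below verifies for all three roles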
@pytest.mark.parametrize('dry_run', [True, False]) + def test_affiliations_from_multiple_contributors(self, institution, dry_run): + institution_not_include = InstitutionFactory() + read_contrib = AuthUserFactory() + read_contrib.add_or_update_affiliated_institution(institution_not_include) + read_contrib.save() + + write_contrib = AuthUserFactory() + write_contrib.add_or_update_affiliated_institution(institution) + write_contrib.save() + + admin_contrib = AuthUserFactory() + institution2 = InstitutionFactory() + admin_contrib.add_or_update_affiliated_institution(institution2) + admin_contrib.save() + + preprint = PreprintFactory() + preprint.affiliated_institutions.clear() + preprint.add_contributor(read_contrib, permissions='read', visible=True) + preprint.add_contributor(write_contrib, permissions='write', visible=True) + preprint.add_contributor(admin_contrib, permissions='admin', visible=True) + preprint.save() + + assign_affiliations_to_preprints(dry_run=dry_run) + + if dry_run: + assert not preprint.affiliated_institutions.exists() + else: + affiliations = set(preprint.affiliated_institutions.all()) + assert affiliations == {institution, institution2} + assert institution_not_include not in affiliations From f832e5ed7ff36e2ac854789894cac3fc84e596a0 Mon Sep 17 00:00:00 2001 From: Uditi Mehta <57388785+uditijmehta@users.noreply.github.com> Date: Tue, 5 Nov 2024 14:59:44 -0500 Subject: [PATCH 16/35] [ENG-4438] Add OOPSpam and Akismet metrics to spam report (#10783) * Add OOPSpam and Akismet metrics to spam report * Add spam/ham metrics for OOPSpam and Akismet with FK join fix, test cleanup * Add private spam metrics report with preprint inclusion * Validate category; add PrivateSpamMetricsReporter to monthly reports --------- Co-authored-by: Uditi Mehta --- osf/external/askismet/client.py | 34 +++++++++++++++++ osf/external/oopspam/client.py | 34 +++++++++++++++++ osf/metrics/reporters/__init__.py | 2 + osf/metrics/reporters/private_spam_metrics.py | 28 ++++++++++++++ osf/metrics/reporters/spam_count.py | 1 - osf_tests/external/akismet/test_akismet.py | 36 ++++++++++++++++++ osf_tests/external/oopspam/test_oopspam.py | 36 ++++++++++++++++++ osf_tests/metrics/test_spam_count_reporter.py | 38 +++++++++++++++++++ 8 files changed, 208 insertions(+), 1 deletion(-) create mode 100644 osf/metrics/reporters/private_spam_metrics.py create mode 100644 osf_tests/metrics/test_spam_count_reporter.py diff --git a/osf/external/askismet/client.py b/osf/external/askismet/client.py index 877f7ec4c23..db57b1d3cfa 100644 --- a/osf/external/askismet/client.py +++ b/osf/external/askismet/client.py @@ -133,3 +133,37 @@ def submit_ham(self, user_ip, user_agent, **kwargs): ) if res.status_code != requests.codes.ok: raise AkismetClientError(reason=res.text) + + def get_flagged_count(self, start_date, end_date, category='node'): + from osf.models import NodeLog, PreprintLog + + if category not in ['node', 'preprint']: + raise ValueError(f"Invalid category '{category}'. Expected 'node' or 'preprint'.") + + log_model = NodeLog if category == 'node' else PreprintLog + + flagged_count = log_model.objects.filter( + action=log_model.FLAG_SPAM, + created__gt=start_date, + created__lt=end_date, + **{f'{category}__spam_data__who_flagged__in': ['akismet', 'both']} + ).count() + + return flagged_count + + def get_hammed_count(self, start_date, end_date, category='node'): + from osf.models import NodeLog, PreprintLog + + if category not in ['node', 'preprint']: + raise ValueError(f"Invalid category '{category}'. 
Expected 'node' or 'preprint'.") + + log_model = NodeLog if category == 'node' else PreprintLog + + hammed_count = log_model.objects.filter( + action=log_model.CONFIRM_HAM, + created__gt=start_date, + created__lt=end_date, + **{f'{category}__spam_data__who_flagged__in': ['akismet', 'both']} + ).count() + + return hammed_count diff --git a/osf/external/oopspam/client.py b/osf/external/oopspam/client.py index ef22864a43d..0abdfdd021f 100644 --- a/osf/external/oopspam/client.py +++ b/osf/external/oopspam/client.py @@ -45,3 +45,37 @@ def check_content(self, user_ip, content, **kwargs): # OOPSpam returns a spam score out of 6. 3 or higher indicates spam return spam_score >= settings.OOPSPAM_SPAM_LEVEL, resp_json + + def get_flagged_count(self, start_date, end_date, category='node'): + from osf.models import NodeLog, PreprintLog + + if category not in ['node', 'preprint']: + raise ValueError(f"Invalid category '{category}'. Expected 'node' or 'preprint'.") + + log_model = NodeLog if category == 'node' else PreprintLog + + flagged_count = log_model.objects.filter( + action=log_model.FLAG_SPAM, + created__gt=start_date, + created__lt=end_date, + **{f'{category}__spam_data__who_flagged__in': ['oopspam', 'both']} + ).count() + + return flagged_count + + def get_hammed_count(self, start_date, end_date, category='node'): + from osf.models import NodeLog, PreprintLog + + if category not in ['node', 'preprint']: + raise ValueError(f"Invalid category '{category}'. Expected 'node' or 'preprint'.") + + log_model = NodeLog if category == 'node' else PreprintLog + + hammed_count = log_model.objects.filter( + action=log_model.CONFIRM_HAM, + created__gt=start_date, + created__lt=end_date, + **{f'{category}__spam_data__who_flagged__in': ['oopspam', 'both']} + ).count() + + return hammed_count diff --git a/osf/metrics/reporters/__init__.py b/osf/metrics/reporters/__init__.py index 412b1c2bf90..e6966414c3c 100644 --- a/osf/metrics/reporters/__init__.py +++ b/osf/metrics/reporters/__init__.py @@ -13,6 +13,7 @@ from .public_item_usage import PublicItemUsageReporter from .user_count import UserCountReporter from .spam_count import SpamCountReporter +from .private_spam_metrics import PrivateSpamMetricsReporter class AllDailyReporters(enum.Enum): @@ -32,3 +33,4 @@ class AllMonthlyReporters(enum.Enum): INSTITUTIONAL_USERS = InstitutionalUsersReporter INSTITUTIONAL_SUMMARY = InstitutionalSummaryMonthlyReporter ITEM_USAGE = PublicItemUsageReporter + PRIVATE_SPAM_METRICS = PrivateSpamMetricsReporter diff --git a/osf/metrics/reporters/private_spam_metrics.py b/osf/metrics/reporters/private_spam_metrics.py new file mode 100644 index 00000000000..d6beba3a804 --- /dev/null +++ b/osf/metrics/reporters/private_spam_metrics.py @@ -0,0 +1,28 @@ +from osf.metrics.reports import SpamSummaryReport +from osf.external.oopspam.client import OOPSpamClient +from osf.external.askismet.client import AkismetClient +from ._base import MonthlyReporter + +class PrivateSpamMetricsReporter(MonthlyReporter): + report_name = 'Private Spam Metrics' + + def report(self, report_yearmonth): + target_month = report_yearmonth.target_month() + next_month = report_yearmonth.next_month() + + oopspam_client = OOPSpamClient() + akismet_client = AkismetClient() + + report = SpamSummaryReport( + report_yearmonth=str(report_yearmonth), + node_oopspam_flagged=oopspam_client.get_flagged_count(target_month, next_month, category='node'), + node_oopspam_hammed=oopspam_client.get_hammed_count(target_month, next_month, category='node'), + 
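+            # (each count below is a NodeLog/PreprintLog query keyed on
+            # spam_data['who_flagged'], via the client methods added above)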
node_akismet_flagged=akismet_client.get_flagged_count(target_month, next_month, category='node'), + node_akismet_hammed=akismet_client.get_hammed_count(target_month, next_month, category='node'), + preprint_oopspam_flagged=oopspam_client.get_flagged_count(target_month, next_month, category='preprint'), + preprint_oopspam_hammed=oopspam_client.get_hammed_count(target_month, next_month, category='preprint'), + preprint_akismet_flagged=akismet_client.get_flagged_count(target_month, next_month, category='preprint'), + preprint_akismet_hammed=akismet_client.get_hammed_count(target_month, next_month, category='preprint') + ) + + return [report] diff --git a/osf/metrics/reporters/spam_count.py b/osf/metrics/reporters/spam_count.py index 94290f96203..56fc03f8d32 100644 --- a/osf/metrics/reporters/spam_count.py +++ b/osf/metrics/reporters/spam_count.py @@ -5,7 +5,6 @@ from osf.models import PreprintLog, NodeLog from osf.models.spam import SpamStatus - class SpamCountReporter(MonthlyReporter): def report(self): diff --git a/osf_tests/external/akismet/test_akismet.py b/osf_tests/external/akismet/test_akismet.py index db3c5d0d584..46729e485e8 100644 --- a/osf_tests/external/akismet/test_akismet.py +++ b/osf_tests/external/akismet/test_akismet.py @@ -237,3 +237,39 @@ def test_meetings_skip_spam_check(self, mock_akismet, user, node_in_conference, node.check_spam(user, {'title'}, request_headers) node.refresh_from_db() assert node.spam_status == SpamStatus.FLAGGED + + @mock.patch('osf.models.NodeLog.objects.filter') + def test_get_flagged_count(self, mock_filter, user): + from osf.external.askismet.client import AkismetClient + from datetime import datetime + + client = AkismetClient() + start_date = datetime(2024, 10, 1) + end_date = datetime(2024, 10, 31) + + client.get_flagged_count(start_date, end_date) + + mock_filter.assert_called_with( + action='flag_spam', + created__gt=start_date, + created__lt=end_date, + node__spam_data__who_flagged__in=['akismet', 'both'] + ) + + @mock.patch('osf.models.NodeLog.objects.filter') + def test_get_hammed_count(self, mock_filter, user): + from osf.external.askismet.client import AkismetClient + from datetime import datetime + + client = AkismetClient() + start_date = datetime(2024, 10, 1) + end_date = datetime(2024, 10, 31) + + client.get_hammed_count(start_date, end_date) + + mock_filter.assert_called_with( + action='confirm_ham', + created__gt=start_date, + created__lt=end_date, + node__spam_data__who_flagged__in=['akismet', 'both'] + ) diff --git a/osf_tests/external/oopspam/test_oopspam.py b/osf_tests/external/oopspam/test_oopspam.py index 36740148116..96656ecc6da 100644 --- a/osf_tests/external/oopspam/test_oopspam.py +++ b/osf_tests/external/oopspam/test_oopspam.py @@ -125,3 +125,39 @@ def test_do_spam_check_false(self, mock_oopspam, user, request_headers): ) assert user.spam_status == SpamStatus.UNKNOWN + + @mock.patch('osf.models.NodeLog.objects.filter') + def test_get_flagged_count(self, mock_filter, user): + from osf.external.oopspam.client import OOPSpamClient + from datetime import datetime + + client = OOPSpamClient() + start_date = datetime(2024, 10, 1) + end_date = datetime(2024, 10, 31) + + client.get_flagged_count(start_date, end_date) + + mock_filter.assert_called_with( + action='flag_spam', + created__gt=start_date, + created__lt=end_date, + node__spam_data__who_flagged__in=['oopspam', 'both'] + ) + + @mock.patch('osf.models.NodeLog.objects.filter') + def test_get_hammed_count(self, mock_filter, user): + from osf.external.oopspam.client import 
OOPSpamClient + from datetime import datetime + + client = OOPSpamClient() + start_date = datetime(2024, 10, 1) + end_date = datetime(2024, 10, 31) + + client.get_hammed_count(start_date, end_date) + + mock_filter.assert_called_with( + action='confirm_ham', + created__gt=start_date, + created__lt=end_date, + node__spam_data__who_flagged__in=['oopspam', 'both'] + ) diff --git a/osf_tests/metrics/test_spam_count_reporter.py b/osf_tests/metrics/test_spam_count_reporter.py new file mode 100644 index 00000000000..db44dc848ff --- /dev/null +++ b/osf_tests/metrics/test_spam_count_reporter.py @@ -0,0 +1,38 @@ +import pytest +from datetime import datetime +from osf.metrics.reporters.private_spam_metrics import PrivateSpamMetricsReporter +from osf.metrics.utils import YearMonth +from osf_tests.factories import NodeLogFactory, NodeFactory +from unittest.mock import patch + +@pytest.mark.django_db +def test_private_spam_metrics_reporter(): + start_date = datetime(2024, 10, 1) + + oopspam_node = NodeFactory(spam_data={'who_flagged': 'oopspam'}) + akismet_node = NodeFactory(spam_data={'who_flagged': 'akismet'}) + + NodeLogFactory.create_batch(10, action='flag_spam', created=start_date, node=oopspam_node) + NodeLogFactory.create_batch(5, action='confirm_ham', created=start_date, node=oopspam_node) + NodeLogFactory.create_batch(20, action='flag_spam', created=start_date, node=akismet_node) + NodeLogFactory.create_batch(10, action='confirm_ham', created=start_date, node=akismet_node) + + report_yearmonth = YearMonth(2024, 10) + + with patch('osf.external.oopspam.client.OOPSpamClient.get_flagged_count') as mock_oopspam_get_flagged_count, \ + patch('osf.external.oopspam.client.OOPSpamClient.get_hammed_count') as mock_oopspam_get_hammed_count, \ + patch('osf.external.askismet.client.AkismetClient.get_flagged_count') as mock_akismet_get_flagged_count, \ + patch('osf.external.askismet.client.AkismetClient.get_hammed_count') as mock_akismet_get_hammed_count: + + mock_oopspam_get_flagged_count.return_value = 10 + mock_oopspam_get_hammed_count.return_value = 5 + mock_akismet_get_flagged_count.return_value = 20 + mock_akismet_get_hammed_count.return_value = 10 + + reporter = PrivateSpamMetricsReporter() + report = reporter.report(report_yearmonth)[0] + + assert report.node_oopspam_flagged == 10, f"Expected 10, got {report.node_oopspam_flagged}" + assert report.node_oopspam_hammed == 5, f"Expected 5, got {report.node_oopspam_hammed}" + assert report.node_akismet_flagged == 20, f"Expected 20, got {report.node_akismet_flagged}" + assert report.node_akismet_hammed == 10, f"Expected 10, got {report.node_akismet_hammed}" From f67b86facec647cef3a930aad14117e33eed6fdf Mon Sep 17 00:00:00 2001 From: mfraezz Date: Thu, 7 Nov 2024 15:13:07 -0500 Subject: [PATCH 17/35] Add PrivateSpamMetricsReport (#10791) --- osf/metrics/reporters/private_spam_metrics.py | 4 ++-- osf/metrics/reports.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/osf/metrics/reporters/private_spam_metrics.py b/osf/metrics/reporters/private_spam_metrics.py index d6beba3a804..39b5fb16cb7 100644 --- a/osf/metrics/reporters/private_spam_metrics.py +++ b/osf/metrics/reporters/private_spam_metrics.py @@ -1,4 +1,4 @@ -from osf.metrics.reports import SpamSummaryReport +from osf.metrics.reports import PrivateSpamMetricsReport from osf.external.oopspam.client import OOPSpamClient from osf.external.askismet.client import AkismetClient from ._base import MonthlyReporter @@ -13,7 +13,7 @@ def report(self, report_yearmonth): 
oopspam_client = OOPSpamClient() akismet_client = AkismetClient() - report = SpamSummaryReport( + report = PrivateSpamMetricsReport( report_yearmonth=str(report_yearmonth), node_oopspam_flagged=oopspam_client.get_flagged_count(target_month, next_month, category='node'), node_oopspam_hammed=oopspam_client.get_hammed_count(target_month, next_month, category='node'), diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py index 43bdd0fabd1..d1e21db9c45 100644 --- a/osf/metrics/reports.py +++ b/osf/metrics/reports.py @@ -323,3 +323,14 @@ def for_last_month(cls, item_osfid: str) -> PublicItemUsageReport | None: ) _response = _search.execute() return _response[0] if _response else None + + +class PrivateSpamMetricsReport(MonthlyReport): + node_oopspam_flagged = metrics.Integer() + node_oopspam_hammed = metrics.Integer() + node_akismet_flagged = metrics.Integer() + node_akismet_hammed = metrics.Integer() + preprint_oopspam_flagged = metrics.Integer() + preprint_oopspam_hammed = metrics.Integer() + preprint_akismet_flagged = metrics.Integer() + preprint_akismet_hammed = metrics.Integer() From eadb41fd8bc5b2d7b06b2721131002a435b2b14c Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 15 Nov 2024 13:54:45 -0500 Subject: [PATCH 18/35] [ENG-6435] Fix: duplicate reports when run for past years (#10800) --- osf/metrics/reports.py | 27 +++++++++++++++++------- osf/metrics/utils.py | 10 +++++++++ osf_tests/metrics/test_daily_report.py | 21 +++++++++++++++--- osf_tests/metrics/test_monthly_report.py | 5 +++++ osf_tests/metrics/test_yearmonth.txt | 18 ++++++++++++++++ 5 files changed, 70 insertions(+), 11 deletions(-) diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py index d1e21db9c45..28ca6cdb964 100644 --- a/osf/metrics/reports.py +++ b/osf/metrics/reports.py @@ -30,6 +30,16 @@ def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) assert 'report_date' in cls.UNIQUE_TOGETHER_FIELDS, f'DailyReport subclasses must have "report_date" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' + def save(self, *args, **kwargs): + if self.timestamp is None: + self.timestamp = datetime.datetime( + self.report_date.year, + self.report_date.month, + self.report_date.day, + tzinfo=datetime.UTC, + ) + super().save(*args, **kwargs) + class Meta: abstract = True dynamic = metrics.MetaField('strict') @@ -41,19 +51,15 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs, format='strict_year_month') def deserialize(self, data): - if isinstance(data, YearMonth): - return data - elif isinstance(data, str): - return YearMonth.from_str(data) - elif isinstance(data, (datetime.datetime, datetime.date)): - return YearMonth.from_date(data) - elif isinstance(data, int): + if isinstance(data, int): # elasticsearch stores dates in milliseconds since the unix epoch _as_datetime = datetime.datetime.fromtimestamp(data // 1000) return YearMonth.from_date(_as_datetime) elif data is None: return None - else: + try: + return YearMonth.from_any(data) + except ValueError: raise ValueError(f'unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth') def serialize(self, data): @@ -102,6 +108,11 @@ def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) assert 'report_yearmonth' in cls.UNIQUE_TOGETHER_FIELDS, f'MonthlyReport subclasses must have "report_yearmonth" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' + def save(self, *args, **kwargs): + if self.timestamp is None: + 
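+            # note: anchoring the default timestamp to the reported month,
+            # rather than "now", is part of the duplicate-reports fix this
+            # commit describes for re-runs over past months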
self.timestamp = YearMonth.from_any(self.report_yearmonth).month_start() + super().save(*args, **kwargs) + @receiver(metrics_pre_save) def set_report_id(sender, instance, **kwargs): diff --git a/osf/metrics/utils.py b/osf/metrics/utils.py index 910b1f3104c..7c9fed2c6fb 100644 --- a/osf/metrics/utils.py +++ b/osf/metrics/utils.py @@ -46,6 +46,16 @@ def from_str(cls, input_str: str) -> YearMonth: else: raise ValueError(f'expected YYYY-MM format, got "{input_str}"') + @classmethod + def from_any(cls, data) -> YearMonth: + if isinstance(data, YearMonth): + return data + elif isinstance(data, str): + return YearMonth.from_str(data) + elif isinstance(data, (datetime.datetime, datetime.date)): + return YearMonth.from_date(data) + raise ValueError(f'cannot coerce {data} into YearMonth') + def __str__(self): """convert to string of "YYYY-MM" format""" return f'{self.year}-{self.month:0>2}' diff --git a/osf_tests/metrics/test_daily_report.py b/osf_tests/metrics/test_daily_report.py index 3840f5dba21..46375184f95 100644 --- a/osf_tests/metrics/test_daily_report.py +++ b/osf_tests/metrics/test_daily_report.py @@ -1,4 +1,4 @@ -from datetime import date +import datetime from unittest import mock import pytest @@ -21,7 +21,13 @@ class UniqueByDate(DailyReport): class Meta: app_label = 'osf' - today = date(2022, 5, 18) + today = datetime.date(2022, 5, 18) + expected_timestamp = datetime.datetime( + today.year, + today.month, + today.day, + tzinfo=datetime.UTC, + ) reports = [ UniqueByDate(report_date=today), @@ -35,6 +41,7 @@ class Meta: assert mock_save.call_count == 1 assert mock_save.call_args[0][0] is report assert report.meta.id == expected_key + assert report.timestamp == expected_timestamp mock_save.reset_mock() def test_with_unique_together(self, mock_save): @@ -46,7 +53,13 @@ class UniqueByDateAndField(DailyReport): class Meta: app_label = 'osf' - today = date(2022, 5, 18) + today = datetime.date(2022, 5, 18) + expected_timestamp = datetime.datetime( + today.year, + today.month, + today.day, + tzinfo=datetime.UTC, + ) expected_blah = 'dca57e6cde89b19274ea24bc713971dab137a896b8e06d43a11a3f437cd1d151' blah_report = UniqueByDateAndField(report_date=today, uniquefield='blah') @@ -54,6 +67,7 @@ class Meta: assert mock_save.call_count == 1 assert mock_save.call_args[0][0] is blah_report assert blah_report.meta.id == expected_blah + assert blah_report.timestamp == expected_timestamp mock_save.reset_mock() expected_fleh = 'e7dd5ff6b087807efcfa958077dc713878f21c65af79b3ccdb5dc2409bf5ad99' @@ -62,6 +76,7 @@ class Meta: assert mock_save.call_count == 1 assert mock_save.call_args[0][0] is fleh_report assert fleh_report.meta.id == expected_fleh + assert fleh_report.timestamp == expected_timestamp mock_save.reset_mock() for _bad_report in ( diff --git a/osf_tests/metrics/test_monthly_report.py b/osf_tests/metrics/test_monthly_report.py index 23546eb1fb3..0c0302a7f08 100644 --- a/osf_tests/metrics/test_monthly_report.py +++ b/osf_tests/metrics/test_monthly_report.py @@ -23,6 +23,7 @@ class Meta: app_label = 'osf' yearmonth = YearMonth(2022, 5) + expected_timestamp = datetime.datetime(yearmonth.year, yearmonth.month, 1, tzinfo=datetime.UTC) reports = [ UniqueByMonth(report_yearmonth=yearmonth), @@ -36,6 +37,7 @@ class Meta: assert mock_save.call_count == 1 assert mock_save.call_args[0][0] is report assert report.meta.id == expected_key + assert report.timestamp == expected_timestamp mock_save.reset_mock() def test_with_unique_together(self, mock_save): @@ -48,6 +50,7 @@ class Meta: app_label = 'osf' yearmonth = 
YearMonth(2022, 5) + expected_timestamp = datetime.datetime(yearmonth.year, yearmonth.month, 1, tzinfo=datetime.UTC) expected_blah = '62ebf38317cd8402e27a50ce99f836d1734b3f545adf7d144d0e1cf37a0d9d08' blah_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='blah') @@ -55,6 +58,7 @@ class Meta: assert mock_save.call_count == 1 assert mock_save.call_args[0][0] is blah_report assert blah_report.meta.id == expected_blah + assert blah_report.timestamp == expected_timestamp mock_save.reset_mock() expected_fleh = '385700db282f6d6089a0d21836db5ee8423f548615e515b6e034bcc90a14500f' @@ -63,6 +67,7 @@ class Meta: assert mock_save.call_count == 1 assert mock_save.call_args[0][0] is fleh_report assert fleh_report.meta.id == expected_fleh + assert fleh_report.timestamp == expected_timestamp mock_save.reset_mock() for _bad_report in ( diff --git a/osf_tests/metrics/test_yearmonth.txt b/osf_tests/metrics/test_yearmonth.txt index 646c73c42f9..17d847f689b 100644 --- a/osf_tests/metrics/test_yearmonth.txt +++ b/osf_tests/metrics/test_yearmonth.txt @@ -24,6 +24,24 @@ YearMonth(year=1974, month=3) >>> YearMonth.from_str('2000-12') YearMonth(year=2000, month=12) +`from_any` constructor, accepts YearMonth, "YYYY-MM", or date/datetime +>>> YearMonth.from_any('2000-12') +YearMonth(year=2000, month=12) +>>> YearMonth.from_any(_) is _ +True +>>> YearMonth.from_any(datetime.date(1973, 1, 1)) +YearMonth(year=1973, month=1) +>>> YearMonth.from_any(datetime.datetime(1974, 3, 2)) +YearMonth(year=1974, month=3) +>>> YearMonth.from_any(None) +Traceback (most recent call last): + ... +ValueError: cannot coerce None into YearMonth +>>> YearMonth.from_any(7) +Traceback (most recent call last): + ... +ValueError: cannot coerce 7 into YearMonth + `__str__` method gives "YYYY-MM" format: >>> str(YearMonth(1491, 7)) '1491-07' From 913889d200a258b15dd06a0c0eba5838d6ac3e3b Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 15 Nov 2024 13:55:16 -0500 Subject: [PATCH 19/35] [ENG-6506] Fix: counted-usage clobbers (#10799) prevent counted-usages with different `action_labels` from overwriting each other -- deduplicate only when `action_labels` match exactly --- api_tests/metrics/test_counted_usage.py | 16 ++++++++-------- osf/metrics/counted_usage.py | 1 + 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/api_tests/metrics/test_counted_usage.py b/api_tests/metrics/test_counted_usage.py index 9e20f2c0238..568d663be9e 100644 --- a/api_tests/metrics/test_counted_usage.py +++ b/api_tests/metrics/test_counted_usage.py @@ -99,8 +99,8 @@ def test_by_client_session_id(self, app, mock_save, user): assert resp.status_code == 201 assert_saved_with( mock_save, - # doc_id: sha256(b'http://example.foo/|http://example.foo/blahblah/blee|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3').hexdigest() - expected_doc_id='55fffffdc0d674d15a5e8763d14e4ae90f658fbfb6fbf94f88a5d24978f02e72', + # doc_id: sha256(b'http://example.foo/|http://example.foo/blahblah/blee|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3|api,view').hexdigest() + expected_doc_id='3239044c7462dd318edd0522a0ed7d84b9c6502ef16cb40dfcae6c1f456d57a2', expected_attrs={ 'platform_iri': 'http://example.foo/', 'item_guid': 'zyxwv', @@ -132,8 +132,8 @@ def test_by_client_session_id_anon(self, app, mock_save): assert resp.status_code == 201 assert_saved_with( mock_save, - # doc_id: 
sha256(b'http://example.foo/|http://example.foo/bliz/|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3').hexdigest() - expected_doc_id='e559ffbc4bd3e3e69252d34c273f0e771ec89ee455ec9b60fbbadf3944e4af4e', + # doc_id: sha256(b'http://example.foo/|http://example.foo/bliz/|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3|view,web').hexdigest() + expected_doc_id='d01759e963893f9dc9b2ccf016a5ef29135673779802b5578f31449543677e82', expected_attrs={ 'platform_iri': 'http://example.foo/', 'item_guid': 'zyxwv', @@ -166,8 +166,8 @@ def test_by_user_auth(self, app, mock_save, user): assert resp.status_code == 201 assert_saved_with( mock_save, - # doc_id: sha256(b'http://example.foo/|http://osf.io/mst3k|ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a|1981-01-01|3').hexdigest() - expected_doc_id='743494d8a55079b91e202da1dbdfce5aea72e310c57a34b36df2c2af5ed4d362', + # doc_id: sha256(b'http://example.foo/|http://osf.io/mst3k|ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a|1981-01-01|3|view,web').hexdigest() + expected_doc_id='7b8bc27c6d90fb45aa5bbd02deceba9f7384ed61b9a6e7253317c262020b94c2', expected_attrs={ 'platform_iri': 'http://example.foo/', 'item_guid': 'yxwvu', @@ -196,8 +196,8 @@ def test_by_useragent_header(self, app, mock_save): assert resp.status_code == 201 assert_saved_with( mock_save, - # doc_id: sha256(b'http://example.foo/|yxwvu|97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a|1981-01-01|3').hexdigest() - expected_doc_id='a50ac1b2dc1c918cdea7be50b005117fdb6ee00ea069ca3aa4aaf03c0f905fa0', + # doc_id: sha256(b'http://example.foo/|yxwvu|97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a|1981-01-01|3|api,view').hexdigest() + expected_doc_id='d669528b30f443ffe506e183537af9624ef290090e90a200ecce7b7ca19c77f7', expected_attrs={ 'platform_iri': 'http://example.foo/', 'item_guid': 'yxwvu', diff --git a/osf/metrics/counted_usage.py b/osf/metrics/counted_usage.py index c3c6d4cc1aa..39b3b74129b 100644 --- a/osf/metrics/counted_usage.py +++ b/osf/metrics/counted_usage.py @@ -142,6 +142,7 @@ def _fill_document_id(counted_usage): counted_usage.session_id, counted_usage.timestamp.date(), time_window, + ','.join(sorted(counted_usage.action_labels)), ) From 674231ed2225d740edd25409bd2726fbde129989 Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Wed, 13 Nov 2024 11:38:24 -0500 Subject: [PATCH 20/35] Add undated AGU conference campaign for annual use --- framework/auth/campaigns.py | 9 +++++++ framework/auth/views.py | 2 +- tests/test_campaigns.py | 1 + tests/test_views.py | 6 ++--- website/mails/mails.py | 4 +++ .../emails/confirm_agu_conference.html.mako | 26 +++++++++++++++++++ website/util/metrics.py | 1 + 7 files changed, 45 insertions(+), 4 deletions(-) create mode 100644 website/templates/emails/confirm_agu_conference.html.mako diff --git a/framework/auth/campaigns.py b/framework/auth/campaigns.py index 8a902245817..a47b3cf637b 100644 --- a/framework/auth/campaigns.py +++ b/framework/auth/campaigns.py @@ -100,6 +100,15 @@ def get_campaigns(): } }) + newest_campaigns.update({ + 'agu_conference': { + 'system_tag': CampaignSourceTags.AguConference.value, + 'redirect_url': furl(DOMAIN).add(path='dashboard/').url, + 'confirmation_email_template': mails.CONFIRM_EMAIL_AGU_CONFERENCE, + 'login_type': 'native', + } + }) + CAMPAIGNS = newest_campaigns CAMPAIGNS_LAST_REFRESHED = timezone.now() diff --git a/framework/auth/views.py b/framework/auth/views.py index e398a6db0a5..5f999aaaca6 100644 
--- a/framework/auth/views.py +++ b/framework/auth/views.py @@ -944,7 +944,7 @@ def register_user(**kwargs): ) if settings.CONFIRM_REGISTRATIONS_BY_EMAIL: - send_confirm_email_async(user, email=user.username) + send_confirm_email(user, email=user.username) message = language.REGISTRATION_SUCCESS.format(email=user.username) return {'message': message} else: diff --git a/tests/test_campaigns.py b/tests/test_campaigns.py index 587aaaa82d8..1df6a32169a 100644 --- a/tests/test_campaigns.py +++ b/tests/test_campaigns.py @@ -46,6 +46,7 @@ def setUp(self): 'osf-registries', 'osf-registered-reports', 'agu_conference_2023', + 'agu_conference', ] self.refresh = timezone.now() campaigns.CAMPAIGNS = None # force campaign refresh now that preprint providers are populated diff --git a/tests/test_views.py b/tests/test_views.py index f1dbaa3285d..d78e7760c17 100644 --- a/tests/test_views.py +++ b/tests/test_views.py @@ -3438,8 +3438,8 @@ def test_register_after_being_invited_as_unreg_contributor(self, mock_update_sea assert new_user.check_password(password) assert new_user.fullname == real_name - @mock.patch('framework.auth.views.send_confirm_email_async') - def test_register_sends_user_registered_signal(self, mock_send_confirm_email_async): + @mock.patch('framework.auth.views.send_confirm_email') + def test_register_sends_user_registered_signal(self, mock_send_confirm_email): url = api_url_for('register_user') name, email, password = fake.name(), fake_email(), 'underpressure' with capture_signals() as mock_signals: @@ -3453,7 +3453,7 @@ def test_register_sends_user_registered_signal(self, mock_send_confirm_email_asy } ) assert mock_signals.signals_sent() == {auth.signals.user_registered, auth.signals.unconfirmed_user_created} - assert mock_send_confirm_email_async.called + assert mock_send_confirm_email.called @mock.patch('framework.auth.views.mails.send_mail') def test_resend_confirmation(self, send_mail: MagicMock): diff --git a/website/mails/mails.py b/website/mails/mails.py index da66ad8d083..afca9e78f03 100644 --- a/website/mails/mails.py +++ b/website/mails/mails.py @@ -191,6 +191,10 @@ def get_english_article(word): 'confirm_agu_conference_2023', subject='OSF Account Verification, from the American Geophysical Union Conference' ) +CONFIRM_EMAIL_AGU_CONFERENCE = Mail( + 'confirm_agu_conference', + subject='OSF Account Verification, from the American Geophysical Union Conference' +) CONFIRM_EMAIL_PREPRINTS = lambda name, provider: Mail( f'confirm_preprints_{name}', subject=f'OSF Account Verification, {provider}' diff --git a/website/templates/emails/confirm_agu_conference.html.mako b/website/templates/emails/confirm_agu_conference.html.mako new file mode 100644 index 00000000000..603e2c39e8d --- /dev/null +++ b/website/templates/emails/confirm_agu_conference.html.mako @@ -0,0 +1,26 @@ +<%inherit file="notify_base.mako" /> + +<%def name="content()"> + + + Hello ${user.fullname},
+
+
+ Thank you for joining us at the AGU Open Science Pavilion, and welcome to the Open Science Framework (OSF).
+
+ We are pleased to offer AGU attendees a special, exclusive 1:1 consultation to continue our conversation and to help
+ you get oriented on the OSF. This is an opportunity for us to show you useful OSF features and talk about
+ open science in Earth and space sciences, and for you to ask any questions you may have.
+ You can sign up to participate by completing this form, and a member of our team will be in touch to
+ determine your availability:
+
+ https://docs.google.com/forms/d/e/1FAIpQLSeJ23YPaEMdbLY1OqbcP85Tt6rhLpFoOtH0Yg4vY_wSKULRcw/viewform?usp=sf_link +

+ To confirm your OSF account, please verify your email address by visiting this link:
+
+ ${confirmation_url}
+
+ From the team at the Center for Open Science
+ + + diff --git a/website/util/metrics.py b/website/util/metrics.py index 7324a410138..c76adb89f5a 100644 --- a/website/util/metrics.py +++ b/website/util/metrics.py @@ -57,6 +57,7 @@ class CampaignSourceTags(Enum): OsfRegisteredReports = campaign_source_tag('osf_registered_reports') Osf4m = campaign_source_tag('osf4m') AguConference2023 = campaign_source_tag('agu_conference_2023') + AguConference = campaign_source_tag('agu_conference') class OsfClaimedTags(Enum): From cb0c07844eb87a9b7344777f1986c23861a6adb1 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 22 Nov 2024 06:54:43 -0500 Subject: [PATCH 21/35] [ENG-6590] Fix: Monthly Usage Data - update monthly reporters with `iter_report_kwargs` (mostly affects `PublicItemUsageReporter`, which was badly optimized to generate many reports at once) - add `schedule_monthly_reporter` task that schedules tasks from `iter_report_kwargs` results - change `MonthlyReporter.followup_task()` to run per-report --- admin/management/views.py | 8 +- osf/features.yaml | 5 + .../commands/monthly_reporters_go.py | 127 ++++-- osf/metrics/preprint_metrics.py | 4 +- osf/metrics/reporters/_base.py | 19 +- .../reporters/institution_summary_monthly.py | 13 +- osf/metrics/reporters/institutional_users.py | 33 +- osf/metrics/reporters/public_item_usage.py | 378 ++++++++++-------- osf/metrics/reporters/spam_count.py | 7 +- osf/metrics/utils.py | 8 + osf_tests/metrics/reporters/__init__.py | 0 osf_tests/metrics/reporters/_testutils.py | 10 + .../test_institutional_summary_reporter.py | 7 +- .../test_institutional_users_reporter.py | 15 +- .../test_public_item_usage_reporter.py | 148 ++++--- osf_tests/metrics/test_yearmonth.txt | 7 + 16 files changed, 485 insertions(+), 304 deletions(-) create mode 100644 osf_tests/metrics/reporters/__init__.py create mode 100644 osf_tests/metrics/reporters/_testutils.py diff --git a/admin/management/views.py b/admin/management/views.py index 88548a518d1..bb7065c1062 100644 --- a/admin/management/views.py +++ b/admin/management/views.py @@ -12,6 +12,7 @@ from scripts.find_spammy_content import manage_spammy_content from django.urls import reverse from django.shortcuts import redirect +from osf.metrics.utils import YearMonth from osf.models import Preprint, Node, Registration @@ -122,8 +123,11 @@ def post(self, request, *args, **kwargs): report_date = None errors = monthly_reporters_go( - report_month=getattr(report_date, 'month', None), - report_year=getattr(report_date, 'year', None) + yearmonth=( + str(YearMonth.from_date(report_date)) + if report_date is not None + else '' + ), ) if errors: diff --git a/osf/features.yaml b/osf/features.yaml index a3f0fcc1f14..1b41e4b2cdc 100644 --- a/osf/features.yaml +++ b/osf/features.yaml @@ -221,3 +221,8 @@ switches: - flag_name: ENABLE_INACTIVE_SCHEMAS name: enable_inactive_schemas note: This is no longer used + + - flag_name: COUNTEDUSAGE_UNIFIED_METRICS_2024 + name: countedusage_unified_metrics_2024 + note: use only `osf.metrics.counted_usage`-based metrics where possible; un-use PageCounter, PreprintView, PreprintDownload, etc + active: false diff --git a/osf/management/commands/monthly_reporters_go.py b/osf/management/commands/monthly_reporters_go.py index c467640cd15..7ab7b843434 100644 --- a/osf/management/commands/monthly_reporters_go.py +++ b/osf/management/commands/monthly_reporters_go.py @@ -1,68 +1,125 @@ +import datetime import logging from django.core.management.base import BaseCommand -from django.db.utils import OperationalError -from django.utils import timezone +from 
django.db import OperationalError as DjangoOperationalError +from elasticsearch.exceptions import ConnectionError as ElasticConnectionError +from psycopg2 import OperationalError as PostgresOperationalError from framework.celery_tasks import app as celery_app +import framework.sentry from osf.metrics.reporters import AllMonthlyReporters from osf.metrics.utils import YearMonth -from website.app import init_app logger = logging.getLogger(__name__) -MAXMONTH = 12 - +_CONTINUE_AFTER_ERRORS = ( + DjangoOperationalError, + ElasticConnectionError, + PostgresOperationalError, +) @celery_app.task(name='management.commands.monthly_reporters_go') -def monthly_reporters_go(report_year=None, report_month=None): - init_app() # OSF-specific setup - - if report_year and report_month: - report_yearmonth = YearMonth(report_year, report_month) - else: # default to last month if year and month not provided - today = timezone.now().date() - report_yearmonth = YearMonth( - year=today.year if today.month > 1 else today.year - 1, - month=today.month - 1 or MAXMONTH, - ) - for _reporter_key in AllMonthlyReporters.__members__.keys(): - monthly_reporter_go.apply_async(kwargs={ +def monthly_reporters_go(yearmonth: str = '', reporter_key: str = ''): + _yearmonth = ( + YearMonth.from_str(yearmonth) + if yearmonth + else YearMonth.from_date(datetime.date.today()).prior() # default last month + ) + _reporter_keys = ( + [reporter_key] + if reporter_key + else _enum_names(AllMonthlyReporters) + ) + for _reporter_key in _reporter_keys: + schedule_monthly_reporter.apply_async(kwargs={ + 'yearmonth': str(_yearmonth), 'reporter_key': _reporter_key, - 'yearmonth': str(report_yearmonth), }) +@celery_app.task(name='management.commands.schedule_monthly_reporter') +def schedule_monthly_reporter( + yearmonth: str, + reporter_key: str, + continue_after: dict | None = None, +): + _reporter = _get_reporter(reporter_key, yearmonth) + _last_kwargs = None + try: + for _kwargs in _reporter.iter_report_kwargs(continue_after=continue_after): + monthly_reporter_do.apply_async(kwargs={ + 'yearmonth': yearmonth, + 'reporter_key': reporter_key, + 'report_kwargs': _kwargs, + }) + _last_kwargs = _kwargs + except _CONTINUE_AFTER_ERRORS as _error: + # let the celery task succeed but log the error + framework.sentry.log_exception(_error) + # schedule another task to continue scheduling + if _last_kwargs is not None: + schedule_monthly_reporter.apply_async(kwargs={ + 'yearmonth': yearmonth, + 'reporter_key': reporter_key, + 'continue_after': _last_kwargs, + }) + + @celery_app.task( - name='management.commands.monthly_reporter_go', - autoretry_for=(OperationalError,), + name='management.commands.monthly_reporter_do', + autoretry_for=( + DjangoOperationalError, + ElasticConnectionError, + PostgresOperationalError, + ), max_retries=5, retry_backoff=True, - bind=True, ) -def monthly_reporter_go(task, reporter_key: str, yearmonth: str): - _reporter_class = AllMonthlyReporters[reporter_key].value - _reporter = _reporter_class(YearMonth.from_str(yearmonth)) - _reporter.run_and_record_for_month() - _followup = _reporter.followup_task() - if _followup is not None: - _followup.apply_async() +def monthly_reporter_do(reporter_key: str, yearmonth: str, report_kwargs: dict): + _reporter = _get_reporter(reporter_key, yearmonth) + _report = _reporter.report(**report_kwargs) + if _report is not None: + _report.report_yearmonth = _reporter.yearmonth + _report.save() + _followup_task = _reporter.followup_task(_report) + if _followup_task is not None: + 
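+        # per this patch, followup_task() now runs per saved report; the
+        # celery Signature it returns is queued right after the report saves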
_followup_task.apply_async() class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument( 'yearmonth', - type=YearMonth.from_str, - default={'year': None, 'month': None}, + type=str, help='year and month (YYYY-MM)', ) + parser.add_argument( + '-r', '--reporter', + type=str, + choices={_name.lower() for _name in _enum_names(AllMonthlyReporters)}, + default='', + help='name of the reporter to run (default all)', + ) - def handle(self, *args, **options): + def handle(self, *args, **kwargs): monthly_reporters_go( - report_year=getattr(options.get('yearmonth'), 'year', None), - report_month=getattr(options.get('yearmonth'), 'month', None), + yearmonth=kwargs['yearmonth'], + reporter_key=kwargs['reporter'].upper(), ) - self.stdout.write(self.style.SUCCESS('reporter tasks scheduled.')) + self.stdout.write(self.style.SUCCESS( + f'scheduling tasks for monthly reporter "{kwargs['reporter']}"...' + if kwargs['reporter'] + else 'scheduling tasks for all monthly reporters...' + )) + + +def _get_reporter(reporter_key: str, yearmonth: str): + _reporter_class = AllMonthlyReporters[reporter_key].value + return _reporter_class(YearMonth.from_str(yearmonth)) + + +def _enum_names(enum_cls) -> list[str]: + return list(enum_cls.__members__.keys()) diff --git a/osf/metrics/preprint_metrics.py b/osf/metrics/preprint_metrics.py index 472cd01f698..4b64398a5c6 100644 --- a/osf/metrics/preprint_metrics.py +++ b/osf/metrics/preprint_metrics.py @@ -37,8 +37,8 @@ def record_for_preprint(cls, preprint, user=None, **kwargs): ) @classmethod - def get_count_for_preprint(cls, preprint, after=None, before=None, index=None): - search = cls.search(after=after, before=before, index=index).filter('match', preprint_id=preprint._id) + def get_count_for_preprint(cls, preprint, after=None, before=None, index=None) -> int: + search = cls.search(index=index).filter('term', preprint_id=preprint._id) timestamp = {} if after: timestamp['gte'] = after diff --git a/osf/metrics/reporters/_base.py b/osf/metrics/reporters/_base.py index 931afe23fd0..707e869522b 100644 --- a/osf/metrics/reporters/_base.py +++ b/osf/metrics/reporters/_base.py @@ -15,18 +15,17 @@ class MonthlyReporter: yearmonth: YearMonth - def report(self) -> abc.Iterable[MonthlyReport] | abc.Iterator[MonthlyReport]: + def iter_report_kwargs(self, continue_after: dict | None = None) -> abc.Iterator[dict]: + # override for multiple reports per month + if continue_after is None: + yield {} # by default, calls `.report()` once with no kwargs + + def report(self, **report_kwargs) -> MonthlyReport | None: """build a report for the given month """ - raise NotImplementedError(f'{self.__name__} must implement `report`') - - def run_and_record_for_month(self) -> None: - reports = self.report() - for report in reports: - report.report_yearmonth = self.yearmonth - report.save() + raise NotImplementedError(f'{self.__class__.__name__} must implement `report`') - def followup_task(self) -> celery.Signature | None: + def followup_task(self, report) -> celery.Signature | None: return None @@ -36,7 +35,7 @@ def report(self, report_date): return an iterable of DailyReport (unsaved) """ - raise NotImplementedError(f'{self.__name__} must implement `report`') + raise NotImplementedError(f'{self.__class__.__name__} must implement `report`') def run_and_record_for_date(self, report_date): reports = self.report(report_date) diff --git a/osf/metrics/reporters/institution_summary_monthly.py b/osf/metrics/reporters/institution_summary_monthly.py index 998cc056298..4748860db32 100644 
--- a/osf/metrics/reporters/institution_summary_monthly.py +++ b/osf/metrics/reporters/institution_summary_monthly.py @@ -11,9 +11,16 @@ class InstitutionalSummaryMonthlyReporter(MonthlyReporter): """Generate an InstitutionMonthlySummaryReport for each institution.""" - def report(self): - for institution in Institution.objects.all(): - yield self.generate_report(institution) + def iter_report_kwargs(self, continue_after: dict | None = None): + _inst_qs = Institution.objects.order_by('pk') + if continue_after: + _inst_qs = _inst_qs.filter(pk__gt=continue_after['institution_pk']) + for _pk in _inst_qs.values_list('pk', flat=True): + yield {'institution_pk': _pk} + + def report(self, **report_kwargs): + _institution = Institution.objects.get(pk=report_kwargs['institution_pk']) + return self.generate_report(_institution) def generate_report(self, institution): node_queryset = institution.nodes.filter( diff --git a/osf/metrics/reporters/institutional_users.py b/osf/metrics/reporters/institutional_users.py index e0f7f42a156..e34875d4b28 100644 --- a/osf/metrics/reporters/institutional_users.py +++ b/osf/metrics/reporters/institutional_users.py @@ -1,5 +1,4 @@ import dataclasses -import datetime from django.contrib.contenttypes.models import ContentType from django.db.models import Q, F, Sum @@ -12,9 +11,6 @@ from ._base import MonthlyReporter -_CHUNK_SIZE = 500 - - class InstitutionalUsersReporter(MonthlyReporter): '''build an InstitutionalUserReport for each institution-user affiliation @@ -22,13 +18,27 @@ class InstitutionalUsersReporter(MonthlyReporter): which offers institutional admins insight into how people at their institution are using osf, based on their explicitly-affiliated osf objects ''' - def report(self): + def iter_report_kwargs(self, continue_after: dict | None = None): _before_datetime = self.yearmonth.month_end() - for _institution in osfdb.Institution.objects.filter(created__lt=_before_datetime): + _inst_qs = ( + osfdb.Institution.objects + .filter(created__lt=_before_datetime) + .order_by('pk') + ) + if continue_after: + _inst_qs = _inst_qs.filter(pk__gte=continue_after['institution_pk']) + for _institution in _inst_qs: _user_qs = _institution.get_institution_users().filter(created__lt=_before_datetime) - for _user in _user_qs.iterator(chunk_size=_CHUNK_SIZE): - _helper = _InstiUserReportHelper(_institution, _user, self.yearmonth, _before_datetime) - yield _helper.report + if continue_after and (_institution.pk == continue_after['institution_pk']): + _user_qs = _user_qs.filter(pk__gt=continue_after['user_pk']) + for _user_pk in _user_qs.values_list('pk', flat=True): + yield {'institution_pk': _institution.pk, 'user_pk': _user_pk} + + def report(self, **report_kwargs): + _institution = osfdb.Institution.objects.get(pk=report_kwargs['institution_pk']) + _user = osfdb.OSFUser.objects.get(pk=report_kwargs['user_pk']) + _helper = _InstiUserReportHelper(_institution, _user, self.yearmonth) + return _helper.report # helper @@ -37,7 +47,6 @@ class _InstiUserReportHelper: institution: osfdb.Institution user: osfdb.OSFUser yearmonth: YearMonth - before_datetime: datetime.datetime report: InstitutionalUserReport = dataclasses.field(init=False) def __post_init__(self): @@ -64,6 +73,10 @@ def __post_init__(self): storage_byte_count=self._storage_byte_count(), ) + @property + def before_datetime(self): + return self.yearmonth.month_end() + def _node_queryset(self): _institution_node_qs = self.institution.nodes.filter( created__lt=self.before_datetime, diff --git 
a/osf/metrics/reporters/public_item_usage.py b/osf/metrics/reporters/public_item_usage.py index ecc34a5d9c7..cc401d50bd7 100644 --- a/osf/metrics/reporters/public_item_usage.py +++ b/osf/metrics/reporters/public_item_usage.py @@ -1,17 +1,24 @@ from __future__ import annotations +import datetime import typing -import celery +import waffle if typing.TYPE_CHECKING: import elasticsearch_dsl as edsl +import osf.features from osf.metadata.osf_gathering import OsfmapPartition from osf.metrics.counted_usage import ( CountedAuthUsage, get_item_type, get_provider_id, ) +from osf.metrics.preprint_metrics import ( + PreprintDownload, + PreprintView, +) from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.utils import YearMonth from osf import models as osfdb from website import settings as website_settings from ._base import MonthlyReporter @@ -31,80 +38,128 @@ class PublicItemUsageReporter(MonthlyReporter): includes projects, project components, registrations, registration components, and preprints ''' - - def report(self): - # use two composite aggregations in parallel to page thru every - # public item viewed or downloaded this month, counting: - # - views and downloads for each item (using `CountedAuthUsage.item_guid`) - # - views for each item's components and files (using `CountedAuthUsage.surrounding_guids`) - for _exact_bucket, _contained_views_bucket in _zip_composite_aggs( - self._exact_item_search(), 'agg_osfid', - self._contained_item_views_search(), 'agg_surrounding_osfid', + def iter_report_kwargs(self, continue_after: dict | None = None): + _after_osfid = continue_after['osfid'] if continue_after else None + for _osfid in _zip_sorted( + self._countedusage_osfids(_after_osfid), + self._preprintview_osfids(_after_osfid), + self._preprintdownload_osfids(_after_osfid), ): - try: - _report = self._report_from_buckets(_exact_bucket, _contained_views_bucket) - yield _report - except _SkipItem: - pass + yield {'osfid': _osfid} + + def report(self, **report_kwargs): + _osfid = report_kwargs['osfid'] + # get usage metrics from several sources: + # - osf.metrics.counted_usage: + # - views and downloads for each item (using `CountedAuthUsage.item_guid`) + # - views for each item's components and files (using `CountedAuthUsage.surrounding_guids`) + # - osf.metrics.preprint_metrics: + # - preprint views and downloads + # - PageCounter? 
(no) + try: + _guid = osfdb.Guid.load(_osfid) + if _guid is None or _guid.referent is None: + raise _SkipItem + _obj = _guid.referent + _report = self._init_report(_obj) + self._fill_report_counts(_report, _obj) + if not any(( + _report.view_count, + _report.view_session_count, + _report.download_count, + _report.download_session_count, + )): + raise _SkipItem + return _report + except _SkipItem: + return None + + def followup_task(self, report): + _is_last_month = report.report_yearmonth.next() == YearMonth.from_date(datetime.date.today()) + if _is_last_month: + from api.share.utils import task__update_share + return task__update_share.signature( + args=(report.item_osfid,), + kwargs={ + 'is_backfill': True, + 'osfmap_partition_name': OsfmapPartition.MONTHLY_SUPPLEMENT.name, + }, + countdown=30, # give index time to settle + ) - def followup_task(self): - return task__update_monthly_metadatas.signature( - args=[str(self.yearmonth)], - countdown=30, # give index time to settle + def _countedusage_osfids(self, after_osfid: str | None) -> typing.Iterator[str]: + _search = self._base_usage_search() + _search.aggs.bucket( + 'agg_osfid', + 'composite', + sources=[{'osfid': {'terms': {'field': 'item_guid'}}}], + size=_CHUNK_SIZE, ) + return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) - def _report_from_buckets(self, exact_bucket, contained_views_bucket): - # either exact_bucket or contained_views_bucket may be None, but not both - assert (exact_bucket is not None) or (contained_views_bucket is not None) - _report = ( - self._init_report_from_exact_bucket(exact_bucket) - if exact_bucket is not None - else self._init_report_from_osfid(contained_views_bucket.key.osfid) + def _preprintview_osfids(self, after_osfid: str | None) -> typing.Iterator[str]: + _search = ( + PreprintView.search() + .filter('range', timestamp={ + 'gte': self.yearmonth.month_start(), + 'lt': self.yearmonth.month_end(), + }) + .extra(size=0) # only aggregations, no hits + ) + _search.aggs.bucket( + 'agg_osfid', + 'composite', + sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}], + size=_CHUNK_SIZE, ) - # view counts include views on contained items (components, files) - _report.view_count, _report.view_session_count = self._get_view_counts(_report.item_osfid) - return _report + return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) - def _init_report_from_exact_bucket(self, exact_bucket) -> PublicItemUsageReport: - # in the (should-be common) case of an item that has been directly viewed in - # this month, the stored metrics already have the data required - _report = PublicItemUsageReport( - item_osfid=exact_bucket.key.osfid, - item_type=_agg_keys(exact_bucket.agg_item_type), - provider_id=_agg_keys(exact_bucket.agg_provider_id), - platform_iri=_agg_keys(exact_bucket.agg_platform_iri), - # default counts to zero, will be updated if non-zero - view_count=0, - view_session_count=0, - download_count=0, - download_session_count=0, + def _preprintdownload_osfids(self, after_osfid: str | None) -> typing.Iterator[str]: + _search = ( + PreprintDownload.search() + .filter('range', timestamp={ + 'gte': self.yearmonth.month_start(), + 'lt': self.yearmonth.month_end(), + }) + .extra(size=0) # only aggregations, no hits ) - for _actionbucket in exact_bucket.agg_action: - # note: view counts computed separately to avoid double-counting - if _actionbucket.key == CountedAuthUsage.ActionLabel.DOWNLOAD.value: - _report.download_count = _actionbucket.doc_count - 
_report.download_session_count = _actionbucket.agg_session_count.value - return _report + _search.aggs.bucket( + 'agg_osfid', + 'composite', + sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}], + size=_CHUNK_SIZE, + ) + return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) - def _init_report_from_osfid(self, osfid: str) -> PublicItemUsageReport: - # for the (should-be unusual) case where the components/files contained by - # an item have views in this month, but the item itself does not -- - # load necessary info via django models, instead - _osfguid = osfdb.Guid.load(osfid) - if _osfguid is None or not getattr(_osfguid.referent, 'is_public', False): + def _init_report(self, osf_obj) -> PublicItemUsageReport: + if not _is_item_public(osf_obj): raise _SkipItem return PublicItemUsageReport( - item_osfid=osfid, - item_type=[get_item_type(_osfguid.referent)], - provider_id=[get_provider_id(_osfguid.referent)], + item_osfid=osf_obj._id, + item_type=[get_item_type(osf_obj)], + provider_id=[get_provider_id(osf_obj)], platform_iri=[website_settings.DOMAIN], - # default counts to zero, will be updated if non-zero - view_count=0, - view_session_count=0, - download_count=0, - download_session_count=0, + # leave counts null; will be set if there's data ) + def _fill_report_counts(self, report, osf_obj): + if ( + isinstance(osf_obj, osfdb.Preprint) + and not waffle.switch_is_active(osf.features.COUNTEDUSAGE_UNIFIED_METRICS_2024) # type: ignore[attr-defined] + ): + # note: no session-count info in preprint metrics + report.view_count = self._preprint_views(osf_obj) + report.download_count = self._preprint_downloads(osf_obj) + else: + ( + report.view_count, + report.view_session_count, + ) = self._countedusage_view_counts(osf_obj) + ( + report.download_count, + report.download_session_count, + ) = self._countedusage_download_counts(osf_obj) + def _base_usage_search(self): return ( CountedAuthUsage.search() @@ -113,59 +168,10 @@ def _base_usage_search(self): 'gte': self.yearmonth.month_start(), 'lt': self.yearmonth.month_end(), }) - .update_from_dict({'size': 0}) # only aggregations, no hits + .extra(size=0) # only aggregations, no hits ) - def _exact_item_search(self) -> edsl.Search: - '''aggregate views and downloads on each osfid (not including components/files)''' - _search = self._base_usage_search() - # the main agg: use a composite aggregation to page thru *every* item - _agg_osfid = _search.aggs.bucket( - 'agg_osfid', - 'composite', - sources=[{'osfid': {'terms': {'field': 'item_guid'}}}], - size=_CHUNK_SIZE, - ) - # nested agg: for each item, get platform_iri values - _agg_osfid.bucket('agg_platform_iri', 'terms', field='platform_iri') - # nested agg: for each item, get provider_id values - _agg_osfid.bucket('agg_provider_id', 'terms', field='provider_id') - # nested agg: for each item, get item_type values - _agg_osfid.bucket('agg_item_type', 'terms', field='item_type') - # nested agg: for each item, get download count - _agg_action = _agg_osfid.bucket( - 'agg_action', - 'terms', - field='action_labels', - include=[ - CountedAuthUsage.ActionLabel.DOWNLOAD.value, - ], - ) - # nested nested agg: get download session count - _agg_action.metric( - 'agg_session_count', - 'cardinality', - field='session_id', - precision_threshold=_MAX_CARDINALITY_PRECISION, - ) - return _search - - def _contained_item_views_search(self) -> edsl.Search: - '''iterate osfids with views on contained components and files''' - _search = ( - self._base_usage_search() - .filter('term', 
action_labels=CountedAuthUsage.ActionLabel.VIEW.value) - ) - # the main agg: use a composite aggregation to page thru *every* item - _search.aggs.bucket( - 'agg_surrounding_osfid', - 'composite', - sources=[{'osfid': {'terms': {'field': 'surrounding_guids'}}}], - size=_CHUNK_SIZE, - ) - return _search - - def _get_view_counts(self, osfid: str) -> tuple[int, int]: + def _countedusage_view_counts(self, osf_obj) -> tuple[int, int]: '''compute view_session_count separately to avoid double-counting (the same session may be represented in both the composite agg on `item_guid` @@ -179,8 +185,8 @@ def _get_view_counts(self, osfid: str) -> tuple[int, int]: {'term': {'action_labels': CountedAuthUsage.ActionLabel.VIEW.value}}, ], should=[ - {'term': {'item_guid': osfid}}, - {'term': {'surrounding_guids': osfid}}, + {'term': {'item_guid': osf_obj._id}}, + {'term': {'surrounding_guids': osf_obj._id}}, ], minimum_should_match=1, ) @@ -193,86 +199,108 @@ def _get_view_counts(self, osfid: str) -> tuple[int, int]: ) _response = _search.execute() _view_count = _response.hits.total - _view_session_count = _response.aggregations.agg_session_count.value + _view_session_count = ( + _response.aggregations.agg_session_count.value + if 'agg_session_count' in _response.aggregations + else 0 + ) return (_view_count, _view_session_count) - -### -# followup celery task -@celery.shared_task -def task__update_monthly_metadatas(yearmonth: str): - from api.share.utils import task__update_share - _report_search = ( - PublicItemUsageReport.search() - .filter('term', report_yearmonth=yearmonth) - .source(['item_osfid']) # return only the 'item_osfid' field - ) - for _hit in _report_search.scan(): - task__update_share.delay( - _hit.item_osfid, - is_backfill=True, - osfmap_partition_name=OsfmapPartition.MONTHLY_SUPPLEMENT.name, + def _countedusage_download_counts(self, osf_obj) -> tuple[int, int]: + '''aggregate downloads on each osfid (not including components/files)''' + _search = ( + self._base_usage_search() + .filter('term', item_guid=osf_obj._id) + .filter('term', action_labels=CountedAuthUsage.ActionLabel.DOWNLOAD.value) + ) + # agg: get download session count + _search.aggs.metric( + 'agg_session_count', + 'cardinality', + field='session_id', + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _download_count = _response.hits.total + _download_session_count = ( + _response.aggregations.agg_session_count.value + if 'agg_session_count' in _response.aggregations + else 0 + ) + return (_download_count, _download_session_count) + + def _preprint_views(self, preprint: osfdb.Preprint) -> int: + '''aggregate views on each preprint''' + return PreprintView.get_count_for_preprint( + preprint=preprint, + after=self.yearmonth.month_start(), + before=self.yearmonth.month_end(), ) + def _preprint_downloads(self, preprint: osfdb.Preprint) -> int: + '''aggregate downloads on each preprint''' + return PreprintDownload.get_count_for_preprint( + preprint=preprint, + after=self.yearmonth.month_start(), + before=self.yearmonth.month_end(), + ) -### -# local helpers - -def _agg_keys(bucket_agg_result) -> list: - return [_bucket.key for _bucket in bucket_agg_result] +def _is_item_public(osfid_referent) -> bool: + if isinstance(osfid_referent, osfdb.Preprint): + return bool(osfid_referent.verified_publishable) # quacks like Preprint + return getattr(osfid_referent, 'is_public', False) # quacks like AbstractNode -def _zip_composite_aggs( - search_a: edsl.Search, - composite_agg_name_a: str, - search_b: 
edsl.Search, - composite_agg_name_b: str, -): - '''iterate thru two composite aggregations, yielding pairs of buckets matched by key - the composite aggregations must have matching names in `sources` so their keys can be compared +def _zip_sorted( + *iterators: typing.Iterator[str], +) -> typing.Iterator[str]: + '''loop thru multiple iterators on sorted (ascending) sequences of strings ''' - _iter_a = _iter_composite_buckets(search_a, composite_agg_name_a) - _iter_b = _iter_composite_buckets(search_b, composite_agg_name_b) - _next_a = next(_iter_a, None) - _next_b = next(_iter_b, None) + _nexts = { # holds the next value from each iterator, or None + _i: next(_iter, None) + for _i, _iter in enumerate(iterators) + } while True: - if _next_a is None and _next_b is None: - return # both done - elif _next_a is None or _next_b is None: - # one is done but not the other -- no matching needed - yield (_next_a, _next_b) - _next_a = next(_iter_a, None) - _next_b = next(_iter_b, None) - elif _next_a.key == _next_b.key: - # match -- yield and increment both - yield (_next_a, _next_b) - _next_a = next(_iter_a, None) - _next_b = next(_iter_b, None) - elif _orderable_key(_next_a) < _orderable_key(_next_b): - # mismatch -- yield and increment a (but not b) - yield (_next_a, None) - _next_a = next(_iter_a, None) - else: - # mismatch -- yield and increment b (but not a) - yield (None, _next_b) - _next_b = next(_iter_b, None) - - -def _iter_composite_buckets(search: edsl.Search, composite_agg_name: str): + _nonnull_nexts = [ + _next + for _next in _nexts.values() + if _next is not None + ] + if not _nonnull_nexts: + return # all done + _value = min(_nonnull_nexts) + yield _value + for _i, _iter in enumerate(iterators): + if _nexts[_i] == _value: + _nexts[_i] = next(_iter, None) + + +def _iter_composite_bucket_keys( + search: edsl.Search, + composite_agg_name: str, + composite_source_name: str, + after: str | None = None, +) -> typing.Iterator[str]: '''iterate thru *all* buckets of a composite aggregation, requesting new pages as needed assumes the given search has a composite aggregation of the given name + with a single value source of the given name updates the search in-place for subsequent pages ''' + if after is not None: + search.aggs[composite_agg_name].after = {composite_source_name: after} while True: _page_response = search.execute(ignore_cache=True) # reused search object has the previous page cached try: _agg_result = _page_response.aggregations[composite_agg_name] except KeyError: return # no data; all done - yield from _agg_result.buckets + for _bucket in _agg_result.buckets: + _key = _bucket.key.to_dict() + assert set(_key.keys()) == {composite_source_name}, f'expected only one key ("{composite_source_name}") in {_bucket.key}' + yield _key[composite_source_name] # update the search for the next page try: _next_after = _agg_result.after_key @@ -280,7 +308,3 @@ def _iter_composite_buckets(search: edsl.Search, composite_agg_name: str): return # all done else: search.aggs[composite_agg_name].after = _next_after - - -def _orderable_key(composite_bucket) -> list: - return sorted(composite_bucket.key.to_dict().items()) diff --git a/osf/metrics/reporters/spam_count.py b/osf/metrics/reporters/spam_count.py index 94290f96203..cb1c3eeb641 100644 --- a/osf/metrics/reporters/spam_count.py +++ b/osf/metrics/reporters/spam_count.py @@ -8,11 +8,12 @@ class SpamCountReporter(MonthlyReporter): - def report(self): + def report(self, **report_kwargs): + assert not report_kwargs target_month = 
self.yearmonth.month_start() next_month = self.yearmonth.month_end() - report = SpamSummaryReport( + return SpamSummaryReport( # Node Log entries node_confirmed_spam=NodeLog.objects.filter( action=NodeLog.CONFIRM_SPAM, @@ -79,5 +80,3 @@ def report(self): created__lt=next_month, ).count() ) - - return [report] diff --git a/osf/metrics/utils.py b/osf/metrics/utils.py index 910b1f3104c..febfd24d6d2 100644 --- a/osf/metrics/utils.py +++ b/osf/metrics/utils.py @@ -58,6 +58,14 @@ def next(self) -> YearMonth: else YearMonth(self.year, self.month + 1) ) + def prior(self) -> YearMonth: + """get a new YearMonth for the month before this one""" + return ( + YearMonth(self.year - 1, int(calendar.DECEMBER)) + if self.month == calendar.JANUARY + else YearMonth(self.year, self.month - 1) + ) + def month_start(self) -> datetime.datetime: """get a datetime (in UTC timezone) when this YearMonth starts""" return datetime.datetime(self.year, self.month, 1, tzinfo=datetime.UTC) diff --git a/osf_tests/metrics/reporters/__init__.py b/osf_tests/metrics/reporters/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/osf_tests/metrics/reporters/_testutils.py b/osf_tests/metrics/reporters/_testutils.py new file mode 100644 index 00000000000..0d18f3bcac9 --- /dev/null +++ b/osf_tests/metrics/reporters/_testutils.py @@ -0,0 +1,10 @@ +from osf.metrics.reporters._base import MonthlyReporter +from osf.metrics.reports import MonthlyReport + + +def list_monthly_reports(reporter: MonthlyReporter) -> list[MonthlyReport]: + _reports = ( + reporter.report(**_kwargs) + for _kwargs in reporter.iter_report_kwargs() + ) + return [_report for _report in _reports if (_report is not None)] diff --git a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py index 715a2cd1553..05baa4d38e7 100644 --- a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py +++ b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py @@ -11,6 +11,7 @@ PreprintFactory, AuthUserFactory, ) +from ._testutils import list_monthly_reports class TestInstiSummaryMonthlyReporter(TestCase): @@ -78,7 +79,7 @@ def _create_active_user(cls, institution, date_confirmed): def test_report_generation(self): reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) - reports = list(reporter.report()) + reports = list_monthly_reports(reporter) self.assertEqual(len(reports), 1) report = reports[0] @@ -114,7 +115,7 @@ def test_report_generation_multiple_institutions(self): # Run the reporter for the current month (February 2018) reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) - reports = list(reporter.report()) + reports = list_monthly_reports(reporter) self.assertEqual(len(reports), 3) # Reports for self._institution, institution2, institution3 # Extract reports by institution @@ -263,7 +264,7 @@ def test_high_counts_multiple_institutions(self): if enable_benchmarking: reporter_start_time = time.time() reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) - reports = list(reporter.report()) + reports = list_monthly_reports(reporter) assert len(reports) == additional_institution_count + 1 if enable_benchmarking: diff --git a/osf_tests/metrics/reporters/test_institutional_users_reporter.py b/osf_tests/metrics/reporters/test_institutional_users_reporter.py index 876fd08cf9b..275fcb1e8a1 100644 --- a/osf_tests/metrics/reporters/test_institutional_users_reporter.py +++ 
b/osf_tests/metrics/reporters/test_institutional_users_reporter.py @@ -18,6 +18,7 @@ UserFactory, EmbargoFactory, ) +from ._testutils import list_monthly_reports def _patch_now(fakenow: datetime.datetime): @@ -67,24 +68,24 @@ def _assert_report_matches_setup(self, report: InstitutionalUserReport, setup: _ self.assertEqual(report.published_preprint_count, setup.published_preprint_count) def test_no_users(self): - _actual_reports = list(InstitutionalUsersReporter(self._yearmonth).report()) + _actual_reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) self.assertEqual(_actual_reports, []) def test_one_user_with_nothing(self): self._user_setup_with_nothing.affiliate_user() - _reports = list(InstitutionalUsersReporter(self._yearmonth).report()) + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) self.assertEqual(len(_reports), 1) self._assert_report_matches_setup(_reports[0], self._user_setup_with_nothing) def test_one_user_with_ones(self): self._user_setup_with_ones.affiliate_user() - _reports = list(InstitutionalUsersReporter(self._yearmonth).report()) + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) self.assertEqual(len(_reports), 1) self._assert_report_matches_setup(_reports[0], self._user_setup_with_ones) def test_one_user_with_stuff_and_no_files(self): self._user_setup_with_stuff.affiliate_user() - _reports = list(InstitutionalUsersReporter(self._yearmonth).report()) + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) self.assertEqual(len(_reports), 1) self._assert_report_matches_setup(_reports[0], self._user_setup_with_stuff) self.assertEqual(_reports[0].public_file_count, 2) # preprint 2 files @@ -96,7 +97,7 @@ def test_one_user_with_stuff_and_a_file(self): _project = _user.nodes.first() with _patch_now(self._now): create_test_file(target=_project, user=_user, size=37) - (_report,) = InstitutionalUsersReporter(self._yearmonth).report() + (_report,) = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) self._assert_report_matches_setup(_report, self._user_setup_with_stuff) self.assertEqual(_report.public_file_count, 3) # 2 preprint files self.assertEqual(_report.storage_byte_count, 2711) # 2 preprint files @@ -113,7 +114,7 @@ def test_one_user_with_stuff_and_multiple_files(self): create_test_file(target=_component, user=_user, size=53, filename='bla') create_test_file(target=_component, user=_user, size=51, filename='blar') create_test_file(target=_component, user=_user, size=47, filename='blarg') - (_report,) = InstitutionalUsersReporter(self._yearmonth).report() + (_report,) = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) self._assert_report_matches_setup(_report, self._user_setup_with_stuff) self.assertEqual(_report.public_file_count, 7) # 2 preprint files self.assertEqual(_report.storage_byte_count, 2935) # 2 preprint files + 37 + 73 + 53 + 51 + 47 @@ -130,7 +131,7 @@ def test_several_users(self): _setup.user._id: _setup for _setup in _setups } - _reports = list(InstitutionalUsersReporter(self._yearmonth).report()) + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) self.assertEqual(len(_reports), len(_setup_by_userid)) for _actual_report in _reports: _setup = _setup_by_userid[_actual_report.user_id] diff --git a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py index 454b8d6700d..b75c420b1a2 100644 --- 
a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py +++ b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py @@ -1,27 +1,48 @@ -from datetime import timedelta +from datetime import datetime, timedelta from operator import attrgetter from unittest import mock import pytest from osf.metrics.counted_usage import CountedAuthUsage +from osf.metrics.preprint_metrics import ( + PreprintDownload, + PreprintView, +) from osf.metrics.reporters.public_item_usage import PublicItemUsageReporter from osf.metrics.reports import PublicItemUsageReport from osf.metrics.utils import YearMonth +from osf import models as osfdb +from osf_tests import factories +from ._testutils import list_monthly_reports @pytest.mark.es_metrics +@pytest.mark.django_db class TestPublicItemUsageReporter: @pytest.fixture(autouse=True) - def _mocks(self): - with ( - # set a tiny page size to force aggregation pagination: - mock.patch('osf.metrics.reporters.public_item_usage._CHUNK_SIZE', 1), - # HACK: skip auto-filling fields from the database: - mock.patch('osf.models.base.Guid.load', return_value=None), - ): + def _patch_settings(self): + with mock.patch('website.settings.DOMAIN', 'http://osf.example'): yield + @pytest.fixture + def item0(self): + _item0 = factories.PreprintFactory(is_public=True) + _item0._id = 'item0' + return _item0 + + @pytest.fixture + def item1(self): + _item1 = factories.ProjectFactory(is_public=True) + _item1._id = 'item1' + return _item1 + + @pytest.fixture + def item2(self, item1): + _item2 = factories.ProjectFactory(is_public=True, parent=item1) + _item2._id = 'item2' + return _item2 + @pytest.fixture def ym_empty(self) -> YearMonth: return YearMonth(2012, 7) @@ -35,89 +56,87 @@ def ym_busy(self) -> YearMonth: return YearMonth(2023, 7) @pytest.fixture - def sparse_month_usage(self, ym_sparse): + def sparse_month_usage(self, ym_sparse, item0, item1, item2): # "sparse" month: # item0: 3 views, 0 downloads, 2 sessions # item1: 1 views, 1 download, 1 session (plus 1 view from child item2) # item2: 1 views, 0 downloads, 1 session _month_start = ym_sparse.month_start() _save_usage( + item0, timestamp=_month_start, - item_guid='item0', session_id='sesh0', action_labels=['view'], ) _save_usage( + item0, timestamp=_month_start + timedelta(minutes=2), - item_guid='item0', session_id='sesh0', action_labels=['view'], ) _save_usage( + item1, timestamp=_month_start + timedelta(minutes=3), - item_guid='item1', session_id='sesh0', action_labels=['download'], ) _save_usage( + item0, timestamp=_month_start + timedelta(days=17), - item_guid='item0', session_id='sesh1', action_labels=['view'], ) _save_usage( + item1, timestamp=_month_start + timedelta(days=17, minutes=3), - item_guid='item1', session_id='sesh1', action_labels=['view'], ) _save_usage( + item2, timestamp=_month_start + timedelta(days=17, minutes=5), - item_guid='item2', - surrounding_guids=['item1'], session_id='sesh1', action_labels=['view'], ) _save_usage( + item2, timestamp=_month_start + timedelta(days=17, minutes=11), - item_guid='item2', - surrounding_guids=['item1'], session_id='sesh1', action_labels=['download'], ) @pytest.fixture - def busy_month_item0(self, ym_busy): + def busy_month_item0(self, ym_busy, item0): # item0: 4 sessions, 4*7 views, 4*5 downloads _month_start = ym_busy.month_start() for _sesh in range(0, 4): _sesh_start = _month_start + timedelta(days=_sesh) for _minute in range(0, 7): _save_usage( + item0, timestamp=_sesh_start + timedelta(minutes=_minute), - item_guid='item0', session_id=f'sesh0{_sesh}', 
action_labels=['view'], ) for _minute in range(10, 15): _save_usage( + item0, timestamp=_sesh_start + timedelta(minutes=_minute), - item_guid='item0', session_id=f'sesh0{_sesh}', action_labels=['download'], ) @pytest.fixture - def busy_month_item1(self, ym_busy): - # item1: 10 sessions, 6*9 views, 5*7 downloads, 2 providers + def busy_month_item1(self, ym_busy, item1): + # item1: 10 sessions, 6*9 views, 5*7 downloads # (plus 11 views in 11 sessions from child item2) _month_start = ym_busy.month_start() for _sesh in range(0, 6): _sesh_start = _month_start + timedelta(days=_sesh) for _minute in range(0, 9): _save_usage( + item1, timestamp=_sesh_start + timedelta(minutes=_minute), - item_guid='item1', session_id=f'sesh1{_sesh}', action_labels=['view'], ) @@ -125,42 +144,39 @@ def busy_month_item1(self, ym_busy): _sesh_start = _month_start + timedelta(days=_sesh) for _minute in range(10, 17): _save_usage( + item1, timestamp=_sesh_start + timedelta(minutes=_minute), - item_guid='item1', session_id=f'sesh1{_sesh}', action_labels=['download'], - provider_id='prov1', # additional provider_id ) @pytest.fixture - def busy_month_item2(self, ym_busy): + def busy_month_item2(self, ym_busy, item2): # item2: 11 sessions, 11 views, 11 downloads (child of item1) _month_start = ym_busy.month_start() for _sesh in range(1, 12): _save_usage( + item2, timestamp=_month_start + timedelta(days=_sesh), - item_guid='item2', - surrounding_guids=['item1'], session_id=f'sesh2{_sesh}', action_labels=['view'], ) _save_usage( + item2, timestamp=_month_start + timedelta(days=_sesh, hours=_sesh), - item_guid='item2', - surrounding_guids=['item1'], session_id=f'sesh2{_sesh}', action_labels=['download'], ) def test_no_data(self, ym_empty): _reporter = PublicItemUsageReporter(ym_empty) - _empty = list(_reporter.report()) + _empty = list_monthly_reports(_reporter) assert _empty == [] - def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_month_item0, busy_month_item1, busy_month_item2): - _empty = list(PublicItemUsageReporter(ym_empty).report()) - _sparse = list(PublicItemUsageReporter(ym_sparse).report()) - _busy = list(PublicItemUsageReporter(ym_busy).report()) + def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_month_item0, busy_month_item1, busy_month_item2, item0): + _empty = list_monthly_reports(PublicItemUsageReporter(ym_empty)) + _sparse = list_monthly_reports(PublicItemUsageReporter(ym_sparse)) + _busy = list_monthly_reports(PublicItemUsageReporter(ym_busy)) # empty month: assert _empty == [] @@ -171,16 +187,16 @@ def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_m # sparse-month item0 assert isinstance(_sparse_item0, PublicItemUsageReport) assert _sparse_item0.item_osfid == 'item0' - assert _sparse_item0.provider_id == ['prov0'] + assert _sparse_item0.provider_id == [item0.provider._id] assert _sparse_item0.platform_iri == ['http://osf.example'] assert _sparse_item0.view_count == 3 - assert _sparse_item0.view_session_count == 2 + assert _sparse_item0.view_session_count is None # no session count for preprints assert _sparse_item0.download_count == 0 - assert _sparse_item0.download_session_count == 0 + assert _sparse_item0.download_session_count is None # no session count for preprints # sparse-month item1 assert isinstance(_sparse_item1, PublicItemUsageReport) assert _sparse_item1.item_osfid == 'item1' - assert _sparse_item1.provider_id == ['prov0'] + assert _sparse_item1.provider_id == ['osf'] assert _sparse_item1.platform_iri == 
['http://osf.example'] assert _sparse_item1.view_count == 2 # including item2 assert _sparse_item1.view_session_count == 1 # including item2 @@ -189,7 +205,7 @@ def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_m # sparse-month item2 assert isinstance(_sparse_item1, PublicItemUsageReport) assert _sparse_item2.item_osfid == 'item2' - assert _sparse_item2.provider_id == ['prov0'] + assert _sparse_item2.provider_id == ['osf'] assert _sparse_item2.platform_iri == ['http://osf.example'] assert _sparse_item2.view_count == 1 assert _sparse_item2.view_session_count == 1 @@ -202,16 +218,16 @@ def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_m # busy-month item0 assert isinstance(_busy_item0, PublicItemUsageReport) assert _busy_item0.item_osfid == 'item0' - assert _busy_item0.provider_id == ['prov0'] + assert _busy_item0.provider_id == [item0.provider._id] assert _busy_item0.platform_iri == ['http://osf.example'] assert _busy_item0.view_count == 4 * 7 - assert _busy_item0.view_session_count == 4 + assert _busy_item0.view_session_count is None # no session count for preprints assert _busy_item0.download_count == 4 * 5 - assert _busy_item0.download_session_count == 4 + assert _busy_item0.download_session_count is None # no session count for preprints # busy-month item1 assert isinstance(_busy_item1, PublicItemUsageReport) assert _busy_item1.item_osfid == 'item1' - assert _busy_item1.provider_id == ['prov0', 'prov1'] + assert _busy_item1.provider_id == ['osf'] assert _busy_item1.platform_iri == ['http://osf.example'] assert _busy_item1.view_count == 6 * 9 + 11 assert _busy_item1.view_session_count == 6 + 11 @@ -220,7 +236,7 @@ def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_m # busy-month item2 assert isinstance(_busy_item2, PublicItemUsageReport) assert _busy_item2.item_osfid == 'item2' - assert _busy_item2.provider_id == ['prov0'] + assert _busy_item2.provider_id == ['osf'] assert _busy_item2.platform_iri == ['http://osf.example'] assert _busy_item2.view_count == 11 assert _busy_item2.view_session_count == 11 @@ -228,11 +244,41 @@ def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_m assert _busy_item2.download_session_count == 11 -def _save_usage(**kwargs): - _kwargs = { # overridable defaults: +def _save_usage( + item, + *, + timestamp: datetime, + action_labels: list[str], + **kwargs, +): + _countedusage_kwargs = { + 'timestamp': timestamp, + 'item_guid': item._id, + 'action_labels': action_labels, 'platform_iri': 'http://osf.example', - 'item_public': True, - 'provider_id': 'prov0', **kwargs, } - CountedAuthUsage(**_kwargs).save(refresh=True) + CountedAuthUsage(**_countedusage_kwargs).save(refresh=True) + if isinstance(item, osfdb.Preprint): + if 'view' in action_labels: + _save_preprint_view(item, timestamp) + if 'download' in action_labels: + _save_preprint_download(item, timestamp) + + +def _save_preprint_view(preprint, timestamp): + PreprintView( + timestamp=timestamp, + count=1, + preprint_id=preprint._id, + provider_id=preprint.provider._id, + ).save(refresh=True) + + +def _save_preprint_download(preprint, timestamp): + PreprintDownload( + timestamp=timestamp, + count=1, + preprint_id=preprint._id, + provider_id=preprint.provider._id, + ).save(refresh=True) diff --git a/osf_tests/metrics/test_yearmonth.txt b/osf_tests/metrics/test_yearmonth.txt index 646c73c42f9..fae6b990c36 100644 --- a/osf_tests/metrics/test_yearmonth.txt +++ b/osf_tests/metrics/test_yearmonth.txt @@ -35,6 
+35,13 @@ YearMonth(year=1491, month=12) >>> ym.next().next() YearMonth(year=1492, month=1) +`prior` method gives the prior year-month: +>>> ym = YearMonth(1492, 2) +>>> ym.prior() +YearMonth(year=1492, month=1) +>>> ym.prior().prior() +YearMonth(year=1491, month=12) + `month_start` method: >>> YearMonth(3333, 3).month_start() datetime.datetime(3333, 3, 1, 0, 0, tzinfo=datetime.timezone.utc) From 0ec9101d1bbcc4df8e163fc7283ee9b5b1e7da2b Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Mon, 2 Dec 2024 15:45:02 -0500 Subject: [PATCH 22/35] Avoid Sequence Scans on BFN --- osf/metrics/reporters/institutional_users.py | 37 ++++++++++++++------ 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/osf/metrics/reporters/institutional_users.py b/osf/metrics/reporters/institutional_users.py index e34875d4b28..512472a3d96 100644 --- a/osf/metrics/reporters/institutional_users.py +++ b/osf/metrics/reporters/institutional_users.py @@ -68,7 +68,7 @@ def __post_init__(self): private_project_count=self._private_project_queryset().count(), public_registration_count=self._public_registration_queryset().count(), embargoed_registration_count=self._embargoed_registration_queryset().count(), - public_file_count=self._public_osfstorage_file_queryset().count(), + public_file_count=self._public_osfstorage_file_count(), published_preprint_count=self._published_preprint_queryset().count(), storage_byte_count=self._storage_byte_count(), ) @@ -127,7 +127,7 @@ def _published_preprint_queryset(self): .exclude(spam_status=SpamStatus.SPAM) ) - def _public_osfstorage_file_queryset(self): + def _public_osfstorage_file_querysets(self): _target_node_q = Q( # any public project, registration, project component, or registration component target_object_id__in=self._node_queryset().filter(is_public=True).values('pk'), @@ -137,23 +137,40 @@ def _public_osfstorage_file_queryset(self): target_object_id__in=self._published_preprint_queryset().values('pk'), target_content_type=ContentType.objects.get_for_model(osfdb.Preprint), ) - return ( + return ( # split into two queries to avoid a parallel sequence scan on BFN + OsfStorageFile.objects + .filter( + created__lt=self.before_datetime, + deleted__isnull=True, + purged__isnull=True, + ) + .filter(_target_node_q), OsfStorageFile.objects .filter( created__lt=self.before_datetime, deleted__isnull=True, purged__isnull=True, ) - .filter(_target_node_q | _target_preprint_q) + .filter(_target_preprint_q) + ) + + def _public_osfstorage_file_count(self): + return sum( + _target_queryset.count() for _target_queryset + in self._public_osfstorage_file_querysets() ) def _storage_byte_count(self): - return osfdb.FileVersion.objects.filter( - size__gt=0, - created__lt=self.before_datetime, - purged__isnull=True, - basefilenode__in=self._public_osfstorage_file_queryset(), - ).aggregate(storage_bytes=Sum('size', default=0))['storage_bytes'] + return sum( + osfdb.FileVersion.objects.filter( + size__gt=0, + created__lt=self.before_datetime, + purged__isnull=True, + basefilenode__in=_target_queryset, + ).aggregate(storage_bytes=Sum('size', default=0))['storage_bytes'] + for _target_queryset + in self._public_osfstorage_file_querysets() + ) def _get_last_active(self): end_date = self.yearmonth.month_end() From 0a510f5cf477a8018a2c1886cdda3cdbb5a1ccf6 Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Mon, 2 Dec 2024 15:47:55 -0500 Subject: [PATCH 23/35] Use low queue for metric reporters - h/t @aaxelb --- website/settings/defaults.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/website/settings/defaults.py b/website/settings/defaults.py index 0467ef3c166..91e3c1bacc6 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -446,6 +446,7 @@ class CeleryConfig: 'osf.management.commands.daily_reporters_go', 'osf.management.commands.monthly_reporters_go', 'osf.management.commands.ingest_cedar_metadata_templates', + 'osf.metrics.reporters', } med_pri_modules = { From d34cac037c9435b4dfdd3358f53d3d1c3a5eaf68 Mon Sep 17 00:00:00 2001 From: Longze Chen Date: Wed, 4 Dec 2024 16:11:57 -0500 Subject: [PATCH 24/35] Fix failures caused by base class MonthlyReporter update --- osf/metrics/reporters/private_spam_metrics.py | 8 ++++---- osf_tests/metrics/test_spam_count_reporter.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/osf/metrics/reporters/private_spam_metrics.py b/osf/metrics/reporters/private_spam_metrics.py index 39b5fb16cb7..6e92f7f279b 100644 --- a/osf/metrics/reporters/private_spam_metrics.py +++ b/osf/metrics/reporters/private_spam_metrics.py @@ -6,15 +6,15 @@ class PrivateSpamMetricsReporter(MonthlyReporter): report_name = 'Private Spam Metrics' - def report(self, report_yearmonth): - target_month = report_yearmonth.target_month() - next_month = report_yearmonth.next_month() + def report(self): + target_month = self.yearmonth.target_month() + next_month = self.yearmonth.next_month() oopspam_client = OOPSpamClient() akismet_client = AkismetClient() report = PrivateSpamMetricsReport( - report_yearmonth=str(report_yearmonth), + report_yearmonth=str(self.yearmonth), node_oopspam_flagged=oopspam_client.get_flagged_count(target_month, next_month, category='node'), node_oopspam_hammed=oopspam_client.get_hammed_count(target_month, next_month, category='node'), node_akismet_flagged=akismet_client.get_flagged_count(target_month, next_month, category='node'), diff --git a/osf_tests/metrics/test_spam_count_reporter.py b/osf_tests/metrics/test_spam_count_reporter.py index db44dc848ff..30d53cd4c1b 100644 --- a/osf_tests/metrics/test_spam_count_reporter.py +++ b/osf_tests/metrics/test_spam_count_reporter.py @@ -29,8 +29,8 @@ def test_private_spam_metrics_reporter(): mock_akismet_get_flagged_count.return_value = 20 mock_akismet_get_hammed_count.return_value = 10 - reporter = PrivateSpamMetricsReporter() - report = reporter.report(report_yearmonth)[0] + reporter = PrivateSpamMetricsReporter(report_yearmonth) + report = reporter.report()[0] assert report.node_oopspam_flagged == 10, f"Expected 10, got {report.node_oopspam_flagged}" assert report.node_oopspam_hammed == 5, f"Expected 5, got {report.node_oopspam_hammed}" From 8997814196694447ddb2184acc580bb9cc460bd6 Mon Sep 17 00:00:00 2001 From: Longze Chen Date: Thu, 5 Dec 2024 10:20:22 -0500 Subject: [PATCH 25/35] Follow-up fix for target/next (start/end) month --- osf/metrics/reporters/private_spam_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/osf/metrics/reporters/private_spam_metrics.py b/osf/metrics/reporters/private_spam_metrics.py index 6e92f7f279b..32ed9ac57d3 100644 --- a/osf/metrics/reporters/private_spam_metrics.py +++ b/osf/metrics/reporters/private_spam_metrics.py @@ -7,8 +7,8 @@ class PrivateSpamMetricsReporter(MonthlyReporter): report_name = 'Private Spam Metrics' def report(self): - target_month = self.yearmonth.target_month() - next_month = self.yearmonth.next_month() + target_month = self.yearmonth.month_start() + next_month = self.yearmonth.month_end() oopspam_client = OOPSpamClient() akismet_client = AkismetClient() From 
40e7f269346a7b09ff470d40e52470a9d8056295 Mon Sep 17 00:00:00 2001 From: Longze Chen Date: Thu, 5 Dec 2024 11:23:05 -0500 Subject: [PATCH 26/35] Update changelog and bump versions --- CHANGELOG | 10 ++++++++++ package.json | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 32a02066ce0..082f35c282f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,16 @@ We follow the CalVer (https://calver.org/) versioning scheme: YY.MINOR.MICRO. +24.10.0 (2024-12-05) +==================== + +- Migrate Preprint Affilations +- Add OOPSpam and Akismet metrics to spam report +- Add PrivateSpamMetricsReport +- Update PrivateSpamMetricsReporter to work with refactored MonthlyReporter +- Fix duplicate reports when run for past years +- Fix counted-usage clobbers + 24.09.0 (2024-11-14) ==================== diff --git a/package.json b/package.json index 7fcf0590044..904ec2be4d2 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "OSF", - "version": "24.09.0", + "version": "24.10.0", "description": "Facilitating Open Science", "repository": "https://github.com/CenterForOpenScience/osf.io", "author": "Center for Open Science", From d9b459805d3185cc649ee15f804a028ab9994252 Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Fri, 6 Dec 2024 10:19:17 -0500 Subject: [PATCH 27/35] Fix backfill, report --- .../commands/migrate_preprint_affiliation.py | 5 +++ osf/metrics/reporters/private_spam_metrics.py | 2 +- .../test_migrate_preprint_affiliations.py | 38 ++++++++++++++++++- osf_tests/metrics/test_spam_count_reporter.py | 2 +- 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/osf/management/commands/migrate_preprint_affiliation.py b/osf/management/commands/migrate_preprint_affiliation.py index 78e7b2786ff..e34c6dc6b27 100644 --- a/osf/management/commands/migrate_preprint_affiliation.py +++ b/osf/management/commands/migrate_preprint_affiliation.py @@ -9,6 +9,8 @@ logger = logging.getLogger(__name__) +AFFILIATION_TARGET_DATE = datetime.datetime(2024, 9, 19, 14, 37, 48, tzinfo=datetime.timezone.utc) + class Command(BaseCommand): """Assign affiliations from users to preprints where they have write or admin permissions, with optional exclusion by user GUIDs.""" @@ -97,6 +99,9 @@ def assign_affiliations_to_preprints(exclude_guids=None, dry_run=True, batch_siz user = contributor.user preprint = contributor.preprint + if preprint.created > AFFILIATION_TARGET_DATE: + continue + user_institutions = user.get_affiliated_institutions() processed_count += 1 if not dry_run: diff --git a/osf/metrics/reporters/private_spam_metrics.py b/osf/metrics/reporters/private_spam_metrics.py index 32ed9ac57d3..40f259af325 100644 --- a/osf/metrics/reporters/private_spam_metrics.py +++ b/osf/metrics/reporters/private_spam_metrics.py @@ -25,4 +25,4 @@ def report(self): preprint_akismet_hammed=akismet_client.get_hammed_count(target_month, next_month, category='preprint') ) - return [report] + return report diff --git a/osf_tests/management_commands/test_migrate_preprint_affiliations.py b/osf_tests/management_commands/test_migrate_preprint_affiliations.py index 701638251f5..8c80737b3dd 100644 --- a/osf_tests/management_commands/test_migrate_preprint_affiliations.py +++ b/osf_tests/management_commands/test_migrate_preprint_affiliations.py @@ -1,5 +1,6 @@ import pytest -from osf.management.commands.migrate_preprint_affiliation import assign_affiliations_to_preprints +from datetime import timedelta +from osf.management.commands.migrate_preprint_affiliation import 
AFFILIATION_TARGET_DATE, assign_affiliations_to_preprints from osf_tests.factories import ( PreprintFactory, InstitutionFactory, @@ -33,6 +34,8 @@ def preprint_with_affiliated_contributor(self, user_with_affiliation): permissions='admin', visible=True ) + preprint.created = AFFILIATION_TARGET_DATE - timedelta(days=1) + preprint.save() return preprint @pytest.fixture() @@ -43,6 +46,20 @@ def preprint_with_non_affiliated_contributor(self, user_without_affiliation): permissions='admin', visible=True ) + preprint.created = AFFILIATION_TARGET_DATE - timedelta(days=1) + preprint.save() + return preprint + + @pytest.fixture() + def preprint_past_target_date_with_affiliated_contributor(self, user_with_affiliation): + preprint = PreprintFactory() + preprint.add_contributor( + user_with_affiliation, + permissions='admin', + visible=True + ) + preprint.created = AFFILIATION_TARGET_DATE + timedelta(days=1) + preprint.save() return preprint @pytest.mark.parametrize('dry_run', [True, False]) @@ -100,6 +117,7 @@ def test_affiliations_from_multiple_contributors(self, institution, dry_run): preprint = PreprintFactory() preprint.affiliated_institutions.clear() + preprint.created = AFFILIATION_TARGET_DATE - timedelta(days=1) preprint.add_contributor(read_contrib, permissions='read', visible=True) preprint.add_contributor(write_contrib, permissions='write', visible=True) preprint.add_contributor(admin_contrib, permissions='admin', visible=True) @@ -113,3 +131,21 @@ def test_affiliations_from_multiple_contributors(self, institution, dry_run): affiliations = set(preprint.affiliated_institutions.all()) assert affiliations == {institution, institution2} assert institution_not_include not in affiliations + + @pytest.mark.parametrize('dry_run', [True, False]) + def test_exclude_recent_preprints(self, preprint_past_target_date_with_affiliated_contributor, preprint_with_affiliated_contributor, institution, dry_run): + new_preprint = preprint_past_target_date_with_affiliated_contributor + new_preprint.affiliated_institutions.clear() + new_preprint.save() + + old_preprint = preprint_with_affiliated_contributor + old_preprint.affiliated_institutions.clear() + old_preprint.save() + + assign_affiliations_to_preprints(dry_run=dry_run) + + assert not new_preprint.affiliated_institutions.exists() + if dry_run: + assert not old_preprint.affiliated_institutions.exists() + else: + assert institution in old_preprint.affiliated_institutions.all() diff --git a/osf_tests/metrics/test_spam_count_reporter.py b/osf_tests/metrics/test_spam_count_reporter.py index 30d53cd4c1b..0e7ba6956bf 100644 --- a/osf_tests/metrics/test_spam_count_reporter.py +++ b/osf_tests/metrics/test_spam_count_reporter.py @@ -30,7 +30,7 @@ def test_private_spam_metrics_reporter(): mock_akismet_get_hammed_count.return_value = 10 reporter = PrivateSpamMetricsReporter(report_yearmonth) - report = reporter.report()[0] + report = reporter.report() assert report.node_oopspam_flagged == 10, f"Expected 10, got {report.node_oopspam_flagged}" assert report.node_oopspam_hammed == 5, f"Expected 5, got {report.node_oopspam_hammed}" From c966face6faa251419d64fcedc4da2e80fbf70aa Mon Sep 17 00:00:00 2001 From: mfraezz Date: Wed, 11 Dec 2024 13:07:18 -0500 Subject: [PATCH 28/35] [Feature] Dashboard B&I (#10843) Co-authored-by: abram axel booth --- api/institutions/serializers.py | 2 ++ .../institutions/views/test_institution_summary_metrics.py | 2 ++ .../institutions/views/test_institution_user_metric_list.py | 5 +++++ 3 files changed, 9 insertions(+) diff --git 
a/api/institutions/serializers.py b/api/institutions/serializers.py index e3679b2a9c5..1d1e0761715 100644 --- a/api/institutions/serializers.py +++ b/api/institutions/serializers.py @@ -330,6 +330,7 @@ class Meta: }) id = IDField(source='meta.id', read_only=True) + report_yearmonth = YearmonthField(read_only=True) user_name = ser.CharField(read_only=True) department = ser.CharField(read_only=True, source='department_name') orcid_id = ser.CharField(read_only=True) @@ -372,6 +373,7 @@ class Meta: id = IDField(read_only=True) + report_yearmonth = YearmonthField(read_only=True) user_count = ser.IntegerField(read_only=True) public_project_count = ser.IntegerField(read_only=True) private_project_count = ser.IntegerField(read_only=True) diff --git a/api_tests/institutions/views/test_institution_summary_metrics.py b/api_tests/institutions/views/test_institution_summary_metrics.py index d423663ea89..f1641ea923c 100644 --- a/api_tests/institutions/views/test_institution_summary_metrics.py +++ b/api_tests/institutions/views/test_institution_summary_metrics.py @@ -188,6 +188,7 @@ def test_get_report(self, app, url, institutional_admin, institution, reports, u assert data['type'] == 'institution-summary-metrics' attributes = data['attributes'] + assert attributes['report_yearmonth'] == '2024-08' assert attributes['user_count'] == 200 assert attributes['public_project_count'] == 150 assert attributes['private_project_count'] == 125 @@ -254,6 +255,7 @@ def test_get_report_with_multiple_months_and_institutions( attributes = data['attributes'] + assert attributes['report_yearmonth'] == '2024-09' assert attributes['user_count'] == 250 assert attributes['public_project_count'] == 200 assert attributes['private_project_count'] == 150 diff --git a/api_tests/institutions/views/test_institution_user_metric_list.py b/api_tests/institutions/views/test_institution_user_metric_list.py index f83fd7fc3fa..b1bf3490788 100644 --- a/api_tests/institutions/views/test_institution_user_metric_list.py +++ b/api_tests/institutions/views/test_institution_user_metric_list.py @@ -445,6 +445,7 @@ def test_get_report_formats_csv_tsv(self, app, url, institutional_admin, institu response_body = resp.text expected_response = [ [ + 'report_yearmonth', 'account_creation_date', 'department', 'embargoed_registration_count', @@ -460,6 +461,7 @@ def test_get_report_formats_csv_tsv(self, app, url, institutional_admin, institu 'user_name' ], [ + '2024-08', '2018-02', 'Center, \t Greatest Ever', '1', @@ -512,6 +514,7 @@ def test_csv_tsv_ignores_pagination(self, app, url, institutional_admin, institu month_last_login='2018-02', ) expected_data.append([ + '2024-08', '2018-02', 'QBatman', '1', @@ -552,6 +555,7 @@ def test_csv_tsv_ignores_pagination(self, app, url, institutional_admin, institu response_rows = list(reader) # Validate header row expected_header = [ + 'report_yearmonth', 'account_creation_date', 'department', 'embargoed_registration_count', @@ -606,6 +610,7 @@ def test_get_report_format_table_json(self, app, url, institutional_admin, insti response_data = json.loads(resp.body) expected_data = [ { + 'report_yearmonth': '2024-08', 'account_creation_date': '2018-02', 'department': 'Safety "The Wolverine" Weapon X', 'embargoed_registration_count': 1, From 4d1708f966e325dc017c2d47326c04c9878110cd Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Wed, 11 Dec 2024 13:09:28 -0500 Subject: [PATCH 29/35] Update CHANGELOG, bump version --- CHANGELOG | 4 ++++ package.json | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git 
a/CHANGELOG b/CHANGELOG index 082f35c282f..be32693ee55 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,10 @@ We follow the CalVer (https://calver.org/) versioning scheme: YY.MINOR.MICRO. +24.11.0 (2024-12-11) +==================== +- Institutional Dashboard Project Bugfix Release + 24.10.0 (2024-12-05) ==================== diff --git a/package.json b/package.json index 904ec2be4d2..fba6f9fe0b7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "OSF", - "version": "24.10.0", + "version": "24.11.0", "description": "Facilitating Open Science", "repository": "https://github.com/CenterForOpenScience/osf.io", "author": "Center for Open Science", From 6dce520bf30f138824d3eceedbfe42027c844307 Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Tue, 3 Dec 2024 15:23:38 -0500 Subject: [PATCH 30/35] Assume default for global_ notifications --- website/notifications/emails.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/notifications/emails.py b/website/notifications/emails.py index 245baf9f0af..d26d43351d5 100644 --- a/website/notifications/emails.py +++ b/website/notifications/emails.py @@ -176,7 +176,7 @@ def get_user_subscriptions(user, event): if user_subscription: return {key: list(getattr(user_subscription, key).all().values_list('guids___id', flat=True)) for key in constants.NOTIFICATION_TYPES} else: - return {key: [] for key in constants.NOTIFICATION_TYPES} + return {key: [user._id] if (event in constants.USER_SUBSCRIPTIONS_AVAILABLE and key == 'email_transactional') else [] for key in constants.NOTIFICATION_TYPES} def get_node_lineage(node): From 1f3be808830ae8e81d2501505102645d609029fa Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Thu, 19 Dec 2024 05:35:33 -0500 Subject: [PATCH 31/35] Avoid superfluous PrivateLink query --- website/routes.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/website/routes.py b/website/routes.py index 61d6c96c9aa..2acd71db1a6 100644 --- a/website/routes.py +++ b/website/routes.py @@ -176,8 +176,11 @@ def get_globals(): def is_private_link_anonymous_view(): # Avoid circular import from osf.models import PrivateLink + view_only = request.args.get('view_only') + if not view_only: + return False try: - return PrivateLink.objects.filter(key=request.args.get('view_only')).values_list('anonymous', flat=True).get() + return PrivateLink.objects.filter(key=view_only).values_list('anonymous', flat=True).get() except PrivateLink.DoesNotExist: return False From bf3c7d8b22f98355aed46a4c668ff97236dfde66 Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Thu, 19 Dec 2024 12:51:28 -0500 Subject: [PATCH 32/35] Improve script resumability, update template --- osf/management/commands/email_all_users.py | 19 ++++++++++--------- .../test_email_all_users.py | 8 ++++---- website/templates/emails/tou_notif.html.mako | 8 ++++---- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/osf/management/commands/email_all_users.py b/osf/management/commands/email_all_users.py index 334ad58933b..f5cbd677fb7 100644 --- a/osf/management/commands/email_all_users.py +++ b/osf/management/commands/email_all_users.py @@ -19,13 +19,13 @@ OFFSET = 500000 -def email_all_users(email_template, dry_run=False, ids=None, run=0, offset=OFFSET): +def email_all_users(email_template, dry_run=False, ids=None, start_id=0, offset=OFFSET): if ids: active_users = OSFUser.objects.filter(id__in=ids) else: - lower_bound = run * offset - upper_bound = (run + 1) * offset + lower_bound = start_id + upper_bound = start_id + offset base_query 
= OSFUser.objects.filter(date_confirmed__isnull=False, deleted=None).exclude(date_disabled__isnull=False).exclude(is_active=False) active_users = base_query.filter(id__gt=lower_bound, id__lte=upper_bound).order_by('id') @@ -42,11 +42,12 @@ def email_all_users(email_template, dry_run=False, ids=None, run=0, offset=OFFSE total_sent = 0 for user in active_users.iterator(): + logger.info(f'Sending email to {user.id}') try: mails.send_mail( to_addr=user.email, mail=template, - fullname=user.fullname, + given_name=user.given_name or user.fullname, ) except Exception as e: logger.error(f'Exception encountered sending email to {user.id}') @@ -80,11 +81,11 @@ def add_arguments(self, parser): ) parser.add_argument( - '--r', + '--start-id', type=int, - dest='run', + dest='start_id', default=0, - help='Specify which run this is' + help='Specify id to start from.' ) parser.add_argument( @@ -105,9 +106,9 @@ def add_arguments(self, parser): def handle(self, *args, **options): dry_run = options.get('dry_run', False) template = options.get('template') - run = options.get('run') + start_id = options.get('start_id') ids = options.get('ids') offset = options.get('offset', OFFSET) - email_all_users(template, dry_run, run=run, ids=ids, offset=offset) + email_all_users(template, dry_run, start_id=start_id, ids=ids, offset=offset) if dry_run: raise RuntimeError('Dry run, only superusers emailed') diff --git a/osf_tests/management_commands/test_email_all_users.py b/osf_tests/management_commands/test_email_all_users.py index 3392e77a470..c10c84b49d1 100644 --- a/osf_tests/management_commands/test_email_all_users.py +++ b/osf_tests/management_commands/test_email_all_users.py @@ -49,7 +49,7 @@ def test_email_all_users_dry(self, mock_email, superuser): mock_email.assert_called_with( to_addr=superuser.email, mail=mails.TOU_NOTIF, - fullname=superuser.fullname + given_name=superuser.given_name ) @pytest.mark.django_db @@ -64,10 +64,10 @@ def test_dont_email_inactive_users( @pytest.mark.django_db @mock.patch('website.mails.send_mail') def test_email_all_users_offset(self, mock_email, user, user2): - email_all_users('TOU_NOTIF', offset=1, run=0) + email_all_users('TOU_NOTIF', offset=1, start_id=0) - email_all_users('TOU_NOTIF', offset=1, run=1) + email_all_users('TOU_NOTIF', offset=1, start_id=1) - email_all_users('TOU_NOTIF', offset=1, run=2) + email_all_users('TOU_NOTIF', offset=1, start_id=2) assert mock_email.call_count == 2 diff --git a/website/templates/emails/tou_notif.html.mako b/website/templates/emails/tou_notif.html.mako index 1da8c0cbc07..56130626668 100644 --- a/website/templates/emails/tou_notif.html.mako +++ b/website/templates/emails/tou_notif.html.mako @@ -3,12 +3,12 @@ <%def name="content()"> - Hi ${fullname},
+ Hi ${given_name},

- On August 10, 2020 the COS Websites and Services Terms of Use will change. The updates to the Terms are necessary to support continued use of the Websites and Services by the public.
- To better understand what has changed, go here.
+ On Friday, January 10, 2025, the COS Websites and Services Terms of Use and Privacy Policy will change. The updates to the Terms are necessary to support continued use of the Websites and Services by the public.
+ To better understand what has changed, see the Terms of Use change summary and Privacy Policy change summary.

- If you have any questions email support@osf.io.
+ You do not need to take any action to acknowledge these updates. If you have any questions, please email support@osf.io.

Regards,

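Aside from the flag rename, the heart of this patch is the id-window query that makes reruns cheap to resume. A minimal sketch of that selection logic, assuming `from osf.models import OSFUser`; the helper name `active_users_window` is illustrative (the patch inlines this logic in `email_all_users`):

    from osf.models import OSFUser

    OFFSET = 500000

    def active_users_window(start_id, offset=OFFSET):
        # Users with id in (start_id, start_id + offset], ordered by id, so the
        # last "Sending email to {id}" log line is a valid --start-id for the
        # next invocation.
        return (
            OSFUser.objects
            .filter(date_confirmed__isnull=False, deleted=None)
            .exclude(date_disabled__isnull=False)
            .exclude(is_active=False)
            .filter(id__gt=start_id, id__lte=start_id + offset)
            .order_by('id')
        )
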
From 3bf4fc1861b35f535a4399467eea2c0bd61add77 Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Mon, 9 Dec 2024 15:17:49 -0500 Subject: [PATCH 33/35] Add internal policy views - Pull latest policy versions on build --- Dockerfile | 4 ++++ website/policies/views.py | 19 +++++++++++++++++++ website/routes.py | 13 +++++++++++++ website/settings/defaults.py | 11 ++++++++--- .../templates/policies/generic_policy.mako | 16 ++++++++++++++++ 5 files changed, 60 insertions(+), 3 deletions(-) create mode 100644 website/policies/views.py create mode 100644 website/templates/policies/generic_policy.mako diff --git a/Dockerfile b/Dockerfile index 189b0e998b9..d5dce5b303a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -52,6 +52,10 @@ RUN set -ex \ libffi-dev WORKDIR /code + +# Policies +ADD https://github.com/CenterForOpenScience/cos.io.git#master ./COS_POLICIES/ + COPY pyproject.toml . COPY poetry.lock . # Fix: https://github.com/CenterForOpenScience/osf.io/pull/6783 diff --git a/website/policies/views.py b/website/policies/views.py new file mode 100644 index 00000000000..c13ad197dae --- /dev/null +++ b/website/policies/views.py @@ -0,0 +1,19 @@ +import markdown + +from website.settings import \ + PRIVACY_POLICY_PATH, PRIVACY_POLICY_GITHUB_LINK, \ + TERMS_POLICY_PATH, TERMS_POLICY_GITHUB_LINK + +def privacy_policy(): + with open(PRIVACY_POLICY_PATH, 'r') as policy_file: + return { + 'policy_content': markdown.markdown(policy_file.read(), extensions=['toc']), + 'POLICY_GITHUB_LINK': PRIVACY_POLICY_GITHUB_LINK + } + +def terms_policy(): + with open(TERMS_POLICY_PATH, 'r') as policy_file: + return { + 'policy_content': markdown.markdown(policy_file.read(), extensions=['toc']), + 'POLICY_GITHUB_LINK': TERMS_POLICY_GITHUB_LINK + } diff --git a/website/routes.py b/website/routes.py index 2acd71db1a6..ce328c3dcd7 100644 --- a/website/routes.py +++ b/website/routes.py @@ -53,6 +53,7 @@ from addons.base import views as addon_views from website.discovery import views as discovery_views from website.conferences import views as conference_views +from website.policies import views as policy_views from website.preprints import views as preprint_views from website.registries import views as registries_views from website.reviews import views as reviews_views @@ -1145,6 +1146,18 @@ def make_url_map(app): Rule('/goodbye/', 'get', goodbye, notemplate), + Rule( + '/privacy_policy/', + 'get', + policy_views.privacy_policy, + OsfWebRenderer('policies/generic_policy.mako', trust=True) + ), + Rule( + '/terms_of_use/', + 'get', + policy_views.terms_policy, + OsfWebRenderer('policies/generic_policy.mako', trust=True) + ), Rule( [ '/project//', diff --git a/website/settings/defaults.py b/website/settings/defaults.py index 91e3c1bacc6..ee667f4130e 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -26,6 +26,9 @@ def parent_dir(path): STATIC_FOLDER = os.path.join(BASE_PATH, 'static') STATIC_URL_PATH = '/static' ASSET_HASH_PATH = os.path.join(APP_PATH, 'webpack-assets.json') +POLICY_PATH = os.path.join(APP_PATH, 'COS_POLICIES') +PRIVACY_POLICY_PATH = os.path.join(POLICY_PATH, 'PRIVACY_POLICY.md') +TERMS_POLICY_PATH = os.path.join(POLICY_PATH, 'TERMS_OF_USE.md') ROOT = os.path.join(BASE_PATH, '..') BCRYPT_LOG_ROUNDS = 12 LOG_LEVEL = logging.INFO @@ -2048,10 +2051,12 @@ class CeleryConfig: OSF_REGISTRIES_LOGO = 'osf_registries' OSF_LOGO_LIST = [OSF_LOGO, OSF_PREPRINTS_LOGO, OSF_MEETINGS_LOGO, OSF_PREREG_LOGO, OSF_REGISTRIES_LOGO] +PRIVACY_POLICY_GITHUB_LINK = 
'https://github.com/CenterForOpenScience/centerforopenscience.org/blob/master/PRIVACY_POLICY.md' +TERMS_POLICY_GITHUB_LINK = 'https://github.com/CenterForOpenScience/centerforopenscience.org/blob/master/TERMS_OF_USE.md' FOOTER_LINKS = { - 'terms': 'https://github.com/CenterForOpenScience/centerforopenscience.org/blob/master/TERMS_OF_USE.md', - 'privacyPolicy': 'https://github.com/CenterForOpenScience/centerforopenscience.org/blob/master/PRIVACY_POLICY.md', - 'cookies': 'https://github.com/CenterForOpenScience/centerforopenscience.org/blob/master/PRIVACY_POLICY.md#f-cookies', + 'terms': 'https://osf.io/terms_of_use/', + 'privacyPolicy': 'https://osf.io/privacy_policy/', + 'cookies': 'https://osf.io/privacy_policy/#f-cookies', 'cos': 'https://cos.io', 'statusPage': 'https://status.cos.io/', 'apiDocs': 'https://developer.osf.io/', diff --git a/website/templates/policies/generic_policy.mako b/website/templates/policies/generic_policy.mako new file mode 100644 index 00000000000..6ae8581d350 --- /dev/null +++ b/website/templates/policies/generic_policy.mako @@ -0,0 +1,16 @@ +<%inherit file="base.mako"/> + +<%def name="content()"> +
+    <div class="row">
+        <div class="col-md-12">
+            ${policy_content}
+        </div>
+    </div>
+    <div class="row">
+        <div class="col-md-12">
+            Version history for this policy is available <a href="${POLICY_GITHUB_LINK}">here</a>
+        </div>
+    </div>
+</div>
+</%def>

From d053a6289a17eb7cc34e53e1986c1659b1bca21e Mon Sep 17 00:00:00 2001
From: Bohdan Odintsov
Date: Tue, 7 Jan 2025 15:55:34 +0200
Subject: [PATCH 34/35] fixed yearmonth method

---
 osf_tests/metrics/test_monthly_report.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/osf_tests/metrics/test_monthly_report.py b/osf_tests/metrics/test_monthly_report.py
index 0c0302a7f08..3c841e6555c 100644
--- a/osf_tests/metrics/test_monthly_report.py
+++ b/osf_tests/metrics/test_monthly_report.py
@@ -135,7 +135,7 @@ def test_with_last_month(self, osfid, this_month_report, last_month_report, two_

 def _prior_yearmonth(ym: YearMonth) -> YearMonth:
     return (
-        YearMonth(ym.year - 1, 1)
+        YearMonth(ym.year - 1, 12)
         if ym.month == 1
         else YearMonth(ym.year, ym.month - 1)
     )

From a42ee32fe08f98765b8e6e2b3722a6fbd1bb3dee Mon Sep 17 00:00:00 2001
From: Matt Frazier
Date: Thu, 9 Jan 2025 10:36:41 -0500
Subject: [PATCH 35/35] Add view, form to update moderation state

---
 admin/nodes/urls.py             |  1 +
 admin/nodes/views.py            | 11 +++++++++++
 admin/templates/nodes/node.html |  7 ++++++-
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/admin/nodes/urls.py b/admin/nodes/urls.py
index 5036b9dd06d..d081b544d61 100644
--- a/admin/nodes/urls.py
+++ b/admin/nodes/urls.py
@@ -38,4 +38,5 @@
     re_path(r'^(?P<guid>[a-z0-9]+)/make_private/$', views.NodeMakePrivate.as_view(), name='make-private'),
     re_path(r'^(?P<guid>[a-z0-9]+)/make_public/$', views.NodeMakePublic.as_view(), name='make-public'),
     re_path(r'^(?P<guid>[a-z0-9]+)/remove_notifications/$', views.NodeRemoveNotificationView.as_view(), name='node-remove-notifications'),
+    re_path(r'^(?P<guid>[a-z0-9]+)/update_moderation_state/$', views.NodeUpdateModerationStateView.as_view(), name='node-update-mod-state'),
 ]

diff --git a/admin/nodes/views.py b/admin/nodes/views.py
index 74b6b08feae..f1e90d72c09 100644
--- a/admin/nodes/views.py
+++ b/admin/nodes/views.py
@@ -118,6 +118,17 @@ def post(self, request, *args, **kwargs):

         return redirect('nodes:node', guid=kwargs.get('guid'))


+class NodeUpdateModerationStateView(View):
+    def post(self, request, *args, **kwargs):
+        guid = kwargs.get('guid')
+        node = AbstractNode.load(guid)
+        node.update_moderation_state()
+        messages.success(request, 'Moderation state successfully updated.')
+
+        return redirect('nodes:node', guid=kwargs.get('guid'))
+
+
 class NodeSearchView(PermissionRequiredMixin, FormView):
     """ Allows authorized users to search for a node by it's guid.
     """

diff --git a/admin/templates/nodes/node.html b/admin/templates/nodes/node.html
index 6ec71e2dfdc..cb12f49c375 100644
--- a/admin/templates/nodes/node.html
+++ b/admin/templates/nodes/node.html
@@ -64,7 +64,12 @@

 {{ node.type|cut:'osf.'|title }}: {{ node.title }}
+<form method="post" action="{% url 'nodes:node-update-mod-state' guid=node.guid %}">
+    {% csrf_token %}
+    <button type="submit" class="btn btn-default">Update Moderation State</button>
+</form>
+
 Creator
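For reference on PATCH 34/35 above, the corrected `_prior_yearmonth` rollover behaves as in this sketch (the `YearMonth` import path is an assumption; the concrete values are illustrative):

    from osf.metrics.utils import YearMonth  # assumed import path

    # January now rolls back to December of the previous year
    # (previously YearMonth(ym.year - 1, 1), i.e. January of the prior year).
    assert _prior_yearmonth(YearMonth(2025, 1)) == YearMonth(2024, 12)

    # Any other month simply decrements within the same year.
    assert _prior_yearmonth(YearMonth(2025, 7)) == YearMonth(2025, 6)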