diff --git a/CHANGELOG b/CHANGELOG
index 205d47de13c..be32693ee55 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,30 @@
 We follow the CalVer (https://calver.org/) versioning scheme: YY.MINOR.MICRO.
+24.11.0 (2024-12-11)
+====================
+- Institutional Dashboard Project Bugfix Release
+
+24.10.0 (2024-12-05)
+====================
+
+- Migrate Preprint Affiliations
+- Add OOPSpam and Akismet metrics to spam report
+- Add PrivateSpamMetricsReport
+- Update PrivateSpamMetricsReporter to work with refactored MonthlyReporter
+- Fix duplicate reports when run for past years
+- Fix counted-usage clobbers
+
+24.09.0 (2024-11-14)
+====================
+
+- Institutional Dashboard Project BE Release
+
+24.08.0 (2024-10-30)
+====================
+
+- Fix admin confirmation link generation and handling
+
 24.07.0 (2024-09-19)
 ====================
diff --git a/Dockerfile b/Dockerfile
index 189b0e998b9..d5dce5b303a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -52,6 +52,10 @@ RUN set -ex \
     libffi-dev

 WORKDIR /code
+
+# Policies
+ADD https://github.com/CenterForOpenScience/cos.io.git#master ./COS_POLICIES/
+
 COPY pyproject.toml .
 COPY poetry.lock .
 # Fix: https://github.com/CenterForOpenScience/osf.io/pull/6783
diff --git a/admin/management/views.py b/admin/management/views.py
index 3bd675790dd..bb7065c1062 100644
--- a/admin/management/views.py
+++ b/admin/management/views.py
@@ -1,4 +1,3 @@
-import datetime
 from dateutil.parser import isoparse
 from django.views.generic import TemplateView, View
 from django.contrib import messages
@@ -13,6 +12,7 @@
 from scripts.find_spammy_content import manage_spammy_content
 from django.urls import reverse
 from django.shortcuts import redirect
+from osf.metrics.utils import YearMonth
 from osf.models import Preprint, Node, Registration
@@ -120,11 +120,14 @@ def post(self, request, *args, **kwargs):
         if monthly_report_date:
             report_date = isoparse(monthly_report_date).date()
         else:
-            report_date = datetime.datetime.now().date()
+            report_date = None

         errors = monthly_reporters_go(
-            report_month=report_date.month,
-            report_year=report_date.year
+            yearmonth=(
+                str(YearMonth.from_date(report_date))
+                if report_date is not None
+                else ''
+            ),
         )

         if errors:
diff --git a/admin/nodes/urls.py b/admin/nodes/urls.py
index 5036b9dd06d..d081b544d61 100644
--- a/admin/nodes/urls.py
+++ b/admin/nodes/urls.py
@@ -38,4 +38,5 @@
     re_path(r'^(?P<guid>[a-z0-9]+)/make_private/$', views.NodeMakePrivate.as_view(), name='make-private'),
     re_path(r'^(?P<guid>[a-z0-9]+)/make_public/$', views.NodeMakePublic.as_view(), name='make-public'),
     re_path(r'^(?P<guid>[a-z0-9]+)/remove_notifications/$', views.NodeRemoveNotificationView.as_view(), name='node-remove-notifications'),
+    re_path(r'^(?P<guid>[a-z0-9]+)/update_moderation_state/$', views.NodeUpdateModerationStateView.as_view(), name='node-update-mod-state'),
 ]
diff --git a/admin/nodes/views.py b/admin/nodes/views.py
index 74b6b08feae..f1e90d72c09 100644
--- a/admin/nodes/views.py
+++ b/admin/nodes/views.py
@@ -118,6 +118,17 @@ def post(self, request, *args, **kwargs):

         return redirect('nodes:node', guid=kwargs.get('guid'))

+
+class NodeUpdateModerationStateView(View):
+    def post(self, request, *args, **kwargs):
+        guid = kwargs.get('guid')
+        node = AbstractNode.load(guid)
+        node.update_moderation_state()
+        messages.success(request, 'Moderation state successfully updated.')
+
+        return redirect('nodes:node', guid=kwargs.get('guid'))
+
+
 class NodeSearchView(PermissionRequiredMixin, FormView):
     """ Allows authorized users to search for a node by its guid.
""" diff --git a/admin/templates/nodes/node.html b/admin/templates/nodes/node.html index 6ec71e2dfdc..cb12f49c375 100644 --- a/admin/templates/nodes/node.html +++ b/admin/templates/nodes/node.html @@ -64,7 +64,12 @@

{{ node.type|cut:'osf.'|title }}: {{ node.title }} + {% csrf_token %} + + + Creator diff --git a/admin/users/views.py b/admin/users/views.py index 69bfa821c5c..1e6d6e3b09a 100644 --- a/admin/users/views.py +++ b/admin/users/views.py @@ -16,6 +16,7 @@ from django.core.mail import send_mail from django.shortcuts import redirect from django.core.paginator import Paginator +from django.core.exceptions import ValidationError from osf.exceptions import UserStateError from osf.models.base import Guid @@ -456,10 +457,19 @@ def get_context_data(self, **kwargs): class GetUserConfirmationLink(GetUserLink): def get_link(self, user): + if user.is_confirmed: + return f'User {user._id} is already confirmed' + + if user.deleted or user.is_merged: + return f'User {user._id} is deleted or merged' + try: - return user.get_confirmation_url(user.username, force=True) - except KeyError as e: - return str(e) + confirmation_link = user.get_or_create_confirmation_url(user.username, force=True, renew=True) + return confirmation_link + except ValidationError: + return f'Invalid email for user {user._id}' + except KeyError: + return 'Could not generate or refresh confirmation link' def get_link_type(self): return 'User Confirmation' diff --git a/admin_tests/users/test_views.py b/admin_tests/users/test_views.py index 80da9721651..cd51459e134 100644 --- a/admin_tests/users/test_views.py +++ b/admin_tests/users/test_views.py @@ -486,10 +486,15 @@ def test_get_user_confirmation_link(self): view = views.GetUserConfirmationLink() view = setup_view(view, request, guid=user._id) + link = view.get_link(user) + + user.refresh_from_db() + user_token = list(user.email_verifications.keys())[0] + ideal_link_path = f'/confirm/{user._id}/{user_token}/' - link = view.get_link(user) - link_path = str(furl(link).path) + + link_path = str(furl(link).path).rstrip('/') + '/' assert link_path == ideal_link_path @@ -511,6 +516,45 @@ def test_get_user_confirmation_link_with_expired_token(self): assert link_path == ideal_link_path + def test_get_user_confirmation_link_generates_new_token_if_expired(self): + user = UnconfirmedUserFactory() + request = RequestFactory().get('/fake_path') + view = views.GetUserConfirmationLink() + view = setup_view(view, request, guid=user._id) + + old_user_token = list(user.email_verifications.keys())[0] + user.email_verifications[old_user_token]['expiration'] = datetime.utcnow().replace(tzinfo=pytz.utc) - timedelta(hours=24) + user.save() + + link = view.get_link(user) + user.refresh_from_db() + + new_user_token = list(user.email_verifications.keys())[0] + + assert new_user_token != old_user_token + + link_path = str(furl(link).path) + ideal_link_path = f'/confirm/{user._id}/{new_user_token}/' + assert link_path == ideal_link_path + + def test_get_user_confirmation_link_does_not_change_unexpired_token(self): + user = UnconfirmedUserFactory() + request = RequestFactory().get('/fake_path') + view = views.GetUserConfirmationLink() + view = setup_view(view, request, guid=user._id) + + user_token_before = list(user.email_verifications.keys())[0] + + user.email_verifications[user_token_before]['expiration'] = datetime.utcnow().replace(tzinfo=pytz.utc) + timedelta(hours=24) + user.save() + + with mock.patch('osf.models.user.OSFUser.get_or_create_confirmation_url') as mock_method: + mock_method.return_value = user.get_confirmation_url(user.username, force=False, renew=False) + + user_token_after = list(user.email_verifications.keys())[0] + + assert user_token_before == user_token_after + def 
test_get_password_reset_link(self): user = UnconfirmedUserFactory() request = RequestFactory().get('/fake_path') diff --git a/api/base/elasticsearch_dsl_views.py b/api/base/elasticsearch_dsl_views.py new file mode 100644 index 00000000000..6199fd82d0e --- /dev/null +++ b/api/base/elasticsearch_dsl_views.py @@ -0,0 +1,172 @@ +from __future__ import annotations +import abc +import datetime +import typing + +import elasticsearch_dsl as edsl +from rest_framework import generics, exceptions as drf_exceptions +from rest_framework.settings import api_settings as drf_settings +from api.base.settings.defaults import REPORT_FILENAME_FORMAT + +if typing.TYPE_CHECKING: + from rest_framework import serializers + +from api.base.filters import FilterMixin +from api.base.views import JSONAPIBaseView +from api.metrics.renderers import ( + MetricsReportsCsvRenderer, + MetricsReportsTsvRenderer, + MetricsReportsJsonRenderer, +) +from api.base.pagination import ElasticsearchQuerySizeMaximumPagination, JSONAPIPagination +from api.base.renderers import JSONAPIRenderer + + +class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView, abc.ABC): + '''abstract view class using `elasticsearch_dsl.Search` as a queryset-analogue + + builds a `Search` based on `self.get_default_search()` and the request's + query parameters for filtering, sorting, and pagination -- fetches only + the data required for the response, just like with a queryset! + ''' + serializer_class: type[serializers.BaseSerializer] # required on subclasses + + default_ordering: str | None = None # name of a serializer field, prepended with "-" for descending sort + ordering_fields: frozenset[str] = frozenset() # serializer field names + + @abc.abstractmethod + def get_default_search(self) -> edsl.Search | None: + '''the base `elasticsearch_dsl.Search` for this list, based on url path + + (common jsonapi query parameters will be considered automatically) + ''' + ... + + FILE_RENDERER_CLASSES = { + MetricsReportsCsvRenderer, + MetricsReportsTsvRenderer, + MetricsReportsJsonRenderer, + } + + def set_content_disposition(self, response, renderer: str): + """Set the Content-Disposition header to prompt a file download with the appropriate filename. + + Args: + response: The HTTP response object to modify. + renderer: The renderer instance used for the response, which determines the file extension. + """ + current_date = datetime.datetime.now().strftime('%Y-%m') + + if isinstance(renderer, JSONAPIRenderer): + extension = 'json' + else: + extension = getattr(renderer, 'extension', renderer.format) + + filename = REPORT_FILENAME_FORMAT.format( + view_name=self.view_name, + date_created=current_date, + extension=extension, + ) + + response['Content-Disposition'] = f'attachment; filename="{filename}"' + + def finalize_response(self, request, response, *args, **kwargs): + # Call the parent method to finalize the response first + response = super().finalize_response(request, response, *args, **kwargs) + # Check if this is a direct download request or file renderer classes, set to the Content-Disposition header + # so filename and attachment for browser download + if isinstance(request.accepted_renderer, tuple(self.FILE_RENDERER_CLASSES)): + self.set_content_disposition(response, request.accepted_renderer) + + return response + + ### + # beware! 
inheritance shenanigans below + + # override FilterMixin to disable all operators besides 'eq' and 'ne' + MATCHABLE_FIELDS = () + COMPARABLE_FIELDS = () + DEFAULT_OPERATOR_OVERRIDES = {} + # (if you want to add fulltext-search or range-filter support, remove the override + # and update `__add_search_filter` to handle those operators -- tho note that the + # underlying elasticsearch field mapping will need to be compatible with the query) + + # override DEFAULT_FILTER_BACKENDS rest_framework setting + # (filtering handled in-view to reuse logic from FilterMixin) + filter_backends = () + + # note: because elasticsearch_dsl.Search supports slicing and gives results when iterated on, + # it works fine with default pagination + + # override rest_framework.generics.GenericAPIView + @property + def pagination_class(self): + """ + When downloading a file assume no pagination is necessary unless the user specifies + """ + is_file_download = any( + self.request.accepted_renderer.format == renderer.format + for renderer in self.FILE_RENDERER_CLASSES + ) + # if it's a file download of the JSON respect default page size + if is_file_download: + return ElasticsearchQuerySizeMaximumPagination + return JSONAPIPagination + + def get_queryset(self): + _search = self.get_default_search() + if _search is None: + return [] + # using parsing logic from FilterMixin (oddly nested dict and all) + for _parsed_param in self.parse_query_params(self.request.query_params).values(): + for _parsed_filter in _parsed_param.values(): + _search = self.__add_search_filter( + _search, + elastic_field_name=_parsed_filter['source_field_name'], + operator=_parsed_filter['op'], + value=_parsed_filter['value'], + ) + return self.__add_sort(_search) + + ### + # private methods + + def __add_sort(self, search: edsl.Search) -> edsl.Search: + _elastic_sort = self.__get_elastic_sort() + return (search if _elastic_sort is None else search.sort(_elastic_sort)) + + def __get_elastic_sort(self) -> str | None: + _sort_param = self.request.query_params.get(drf_settings.ORDERING_PARAM, self.default_ordering) + if not _sort_param: + return None + _sort_field, _ascending = ( + (_sort_param[1:], False) + if _sort_param.startswith('-') + else (_sort_param, True) + ) + if _sort_field not in self.ordering_fields: + raise drf_exceptions.ValidationError( + f'invalid value for {drf_settings.ORDERING_PARAM} query param (valid values: {", ".join(self.ordering_fields)})', + ) + _serializer_field = self.get_serializer().fields[_sort_field] + _elastic_sort_field = _serializer_field.source + return (_elastic_sort_field if _ascending else f'-{_elastic_sort_field}') + + def __add_search_filter( + self, + search: edsl.Search, + elastic_field_name: str, + operator: str, + value: str, + ) -> edsl.Search: + match operator: # operators from FilterMixin + case 'eq': + if value == '': + return search.exclude('exists', field=elastic_field_name) + return search.filter('term', **{elastic_field_name: value}) + case 'ne': + if value == '': + return search.filter('exists', field=elastic_field_name) + return search.exclude('term', **{elastic_field_name: value}) + case _: + raise NotImplementedError(f'unsupported filter operator "{operator}"') diff --git a/api/base/pagination.py b/api/base/pagination.py index 7ed3db5f6e3..676f0baa8fb 100644 --- a/api/base/pagination.py +++ b/api/base/pagination.py @@ -10,7 +10,7 @@ replace_query_param, remove_query_param, ) from api.base.serializers import is_anonymized -from api.base.settings import MAX_PAGE_SIZE +from api.base.settings 
import MAX_PAGE_SIZE, MAX_SIZE_OF_ES_QUERY from api.base.utils import absolute_reverse from osf.models import AbstractNode, Comment, Preprint, Guid, DraftRegistration @@ -172,6 +172,13 @@ class MaxSizePagination(JSONAPIPagination): max_page_size = None page_size_query_param = None + +class ElasticsearchQuerySizeMaximumPagination(JSONAPIPagination): + page_size = MAX_SIZE_OF_ES_QUERY + max_page_size = MAX_SIZE_OF_ES_QUERY + page_size_query_param = None + + class NoMaxPageSizePagination(JSONAPIPagination): max_page_size = None diff --git a/api/base/serializers.py b/api/base/serializers.py index ac28139da97..3c8c518ea16 100644 --- a/api/base/serializers.py +++ b/api/base/serializers.py @@ -17,6 +17,7 @@ from api.base import utils from api.base.exceptions import EnumFieldMemberError +from osf.metrics.utils import YearMonth from osf.utils import permissions as osf_permissions from osf.utils import sanitize from osf.utils import functional @@ -171,6 +172,18 @@ def should_show(self, instance): return request and (request.user.is_anonymous or has_admin_scope) +class ShowIfObjectPermission(ConditionalField): + """Show the field only for users with a given object permission + """ + def __init__(self, field, *, permission: str, **kwargs): + super().__init__(field, **kwargs) + self._required_object_permission = permission + + def should_show(self, instance): + _request = self.context.get('request') + return _request.user.has_perm(self._required_object_permission, obj=instance) + + class HideIfRegistration(ConditionalField): """ If node is a registration, this field will return None. @@ -2012,3 +2025,18 @@ def to_internal_value(self, data): return self._enum_class[data.upper()].value except KeyError: raise EnumFieldMemberError(self._enum_class, data) + + +class YearmonthField(ser.Field): + def to_representation(self, value: YearMonth | None) -> str | None: + if value is None: + return None + return str(value) + + def to_internal_value(self, data: str | None) -> YearMonth | None: + if data is None: + return None + try: + return YearMonth.from_str(data) + except ValueError as e: + raise ser.ValidationError(str(e)) diff --git a/api/base/settings/defaults.py b/api/base/settings/defaults.py index 136f7f48b6b..367ca1b04f9 100644 --- a/api/base/settings/defaults.py +++ b/api/base/settings/defaults.py @@ -359,8 +359,11 @@ MAX_SIZE_OF_ES_QUERY = 10000 DEFAULT_ES_NULL_VALUE = 'N/A' +REPORT_FILENAME_FORMAT = '{view_name}_{date_created}.{extension}' CI_ENV = False CITATION_STYLES_REPO_URL = 'https://github.com/CenterForOpenScience/styles/archive/88e6ed31a91e9f5a480b486029cda97b535935d4.zip' DEFAULT_AUTO_FIELD = 'django.db.models.AutoField' + +WAFFLE_ENABLE_ADMIN_PAGES = False # instead, customized waffle admins in osf/admin.py diff --git a/api/base/utils.py b/api/base/utils.py index 9e0dcbc7e8c..1da52026d7e 100644 --- a/api/base/utils.py +++ b/api/base/utils.py @@ -2,6 +2,7 @@ from urllib.parse import urlunsplit, urlsplit, parse_qs, urlencode from packaging.version import Version from hashids import Hashids +import waffle from django.apps import apps from django.core.exceptions import ObjectDoesNotExist @@ -275,3 +276,21 @@ def __len__(self): def add_dict_as_item(self, dict): item = type('item', (object,), dict) self.append(item) + + +def toggle_view_by_flag(flag_name, old_view, new_view): + '''toggle between view implementations based on a feature flag + + returns a wrapper view function that: + - when the given flag is inactive, passes thru to `old_view` + - when the given flag is active, passes thru to 
`new_view` + ''' + def _view_by_flag(request, *args, **kwargs): + if waffle.flag_is_active(request, flag_name): + return new_view(request, *args, **kwargs) + return old_view(request, *args, **kwargs) + if hasattr(new_view, 'view_class'): + # set view_class to masquerade as a class-based view, for sake of assumptions + # in `api_tests.base.test_views` and `api.base.serializers.RelationshipField` + _view_by_flag.view_class = new_view.view_class # type: ignore[attr-defined] + return _view_by_flag diff --git a/api/caching/tasks.py b/api/caching/tasks.py index 0b7a4b6670f..b3afba02c2e 100644 --- a/api/caching/tasks.py +++ b/api/caching/tasks.py @@ -1,11 +1,11 @@ +import logging from urllib.parse import urlparse + +from django.apps import apps from django.db import connection from django.db.models import Sum - import requests -import logging -from django.apps import apps from api.caching.utils import storage_usage_cache from framework.postcommit_tasks.handlers import enqueue_postcommit_task @@ -16,6 +16,9 @@ logger = logging.getLogger(__name__) +_DEFAULT_FILEVERSION_PAGE_SIZE = 500000 + + def get_varnish_servers(): # TODO: this should get the varnish servers from HAProxy or a setting return settings.VARNISH_SERVERS @@ -111,35 +114,60 @@ def ban_url(instance): @app.task(max_retries=5, default_retry_delay=10) -def update_storage_usage_cache(target_id, target_guid, per_page=500000): +def update_storage_usage_cache(target_id, target_guid, per_page=_DEFAULT_FILEVERSION_PAGE_SIZE): if not settings.ENABLE_STORAGE_USAGE_CACHE: return + from osf.models import Guid + storage_usage_total = compute_storage_usage_total(Guid.load(target_guid).referent, per_page=per_page) + key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_guid) + storage_usage_cache.set(key, storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT) + + +def compute_storage_usage_total(target_obj, per_page=_DEFAULT_FILEVERSION_PAGE_SIZE): + from django.contrib.contenttypes.models import ContentType sql = """ SELECT count(size), sum(size) from (SELECT size FROM osf_basefileversionsthrough AS obfnv LEFT JOIN osf_basefilenode file ON obfnv.basefilenode_id = file.id LEFT JOIN osf_fileversion version ON obfnv.fileversion_id = version.id - LEFT JOIN django_content_type type on file.target_content_type_id = type.id WHERE file.provider = 'osfstorage' - AND type.model = 'abstractnode' AND file.deleted_on IS NULL - AND file.target_object_id=%s + AND file.target_object_id=%(target_pk)s + AND file.target_content_type_id=%(target_content_type_pk)s ORDER BY version.id - LIMIT %s OFFSET %s) file_page + LIMIT %(per_page)s OFFSET %(offset)s + ) file_page """ - count = per_page + last_count = 1 # initialize non-zero offset = 0 storage_usage_total = 0 + content_type_pk = ContentType.objects.get_for_model(target_obj).pk with connection.cursor() as cursor: - while count: - cursor.execute(sql, [target_id, per_page, offset]) - result = cursor.fetchall() - storage_usage_total += int(result[0][1]) if result[0][1] else 0 - count = int(result[0][0]) if result[0][0] else 0 - offset += count - - key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_guid) - storage_usage_cache.set(key, storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT) + while last_count: + cursor.execute( + sql, { + 'target_pk': target_obj.pk, + 'target_content_type_pk': content_type_pk, + 'per_page': per_page, + 'offset': offset, + }, + ) + this_count, size_sum = cursor.fetchall()[0] + storage_usage_total += int(size_sum or 0) + last_count = (this_count or 0) + offset += 
last_count + return storage_usage_total + + +def get_storage_usage_total(target_obj): + if not settings.ENABLE_STORAGE_USAGE_CACHE: + return compute_storage_usage_total(target_obj) + _cache_key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_obj._id) + _storage_usage_total = storage_usage_cache.get(_cache_key) + if _storage_usage_total is None: + _storage_usage_total = compute_storage_usage_total(target_obj) + storage_usage_cache.set(_cache_key, _storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT) + return _storage_usage_total def update_storage_usage(target): diff --git a/api/institutions/serializers.py b/api/institutions/serializers.py index f1124d896f8..1d1e0761715 100644 --- a/api/institutions/serializers.py +++ b/api/institutions/serializers.py @@ -12,8 +12,10 @@ BaseAPISerializer, ShowIfVersion, IDField, + ShowIfObjectPermission, ) +from api.base.serializers import YearmonthField from api.nodes.serializers import CompoundIDField from api.base.exceptions import RelationshipPostMakesNoChanges from api.base.utils import absolute_reverse @@ -35,6 +37,10 @@ class InstitutionSerializer(JSONAPISerializer): ror_iri = ser.CharField(read_only=True, source='ror_uri') iris = ser.SerializerMethodField(read_only=True) assets = ser.SerializerMethodField(read_only=True) + link_to_external_reports_archive = ShowIfObjectPermission( + ser.CharField(read_only=True), + permission='view_institutional_metrics', + ) links = LinksField({ 'self': 'get_api_url', 'html': 'get_absolute_html_url', @@ -55,19 +61,28 @@ class InstitutionSerializer(JSONAPISerializer): related_view_kwargs={'institution_id': '<_id>'}, ) - department_metrics = RelationshipField( - related_view='institutions:institution-department-metrics', - related_view_kwargs={'institution_id': '<_id>'}, + department_metrics = ShowIfObjectPermission( + RelationshipField( + related_view='institutions:institution-department-metrics', + related_view_kwargs={'institution_id': '<_id>'}, + ), + permission='view_institutional_metrics', ) - user_metrics = RelationshipField( - related_view='institutions:institution-user-metrics', - related_view_kwargs={'institution_id': '<_id>'}, + user_metrics = ShowIfObjectPermission( + RelationshipField( + related_view='institutions:institution-user-metrics', + related_view_kwargs={'institution_id': '<_id>'}, + ), + permission='view_institutional_metrics', ) - summary_metrics = RelationshipField( - related_view='institutions:institution-summary-metrics', - related_view_kwargs={'institution_id': '<_id>'}, + summary_metrics = ShowIfObjectPermission( + RelationshipField( + related_view='institutions:institution-summary-metrics', + related_view_kwargs={'institution_id': '<_id>'}, + ), + permission='view_institutional_metrics', ) def get_api_url(self, obj): @@ -256,7 +271,12 @@ def get_absolute_url(self, obj): ) -class InstitutionUserMetricsSerializer(JSONAPISerializer): +class OldInstitutionUserMetricsSerializer(JSONAPISerializer): + '''serializer for institution-users metrics + + used only when the INSTITUTIONAL_DASHBOARD_2024 feature flag is NOT active + (and should be removed when that flag is permanently active) + ''' class Meta: type_ = 'institution-users' @@ -294,6 +314,92 @@ def get_absolute_url(self, obj): ) +class NewInstitutionUserMetricsSerializer(JSONAPISerializer): + '''serializer for institution-users metrics + + used only when the INSTITUTIONAL_DASHBOARD_2024 feature flag is active + (and should be renamed without "New" when that flag is permanently active) + ''' + + class Meta: + type_ = 
'institution-users' + + filterable_fields = frozenset({ + 'department', + 'orcid_id', + }) + + id = IDField(source='meta.id', read_only=True) + report_yearmonth = YearmonthField(read_only=True) + user_name = ser.CharField(read_only=True) + department = ser.CharField(read_only=True, source='department_name') + orcid_id = ser.CharField(read_only=True) + month_last_login = YearmonthField(read_only=True) + month_last_active = YearmonthField(read_only=True) + account_creation_date = YearmonthField(read_only=True) + + public_projects = ser.IntegerField(read_only=True, source='public_project_count') + private_projects = ser.IntegerField(read_only=True, source='private_project_count') + public_registration_count = ser.IntegerField(read_only=True) + embargoed_registration_count = ser.IntegerField(read_only=True) + published_preprint_count = ser.IntegerField(read_only=True) + public_file_count = ser.IntegerField(read_only=True) + storage_byte_count = ser.IntegerField(read_only=True) + + user = RelationshipField( + related_view='users:user-detail', + related_view_kwargs={'user_id': ''}, + ) + institution = RelationshipField( + related_view='institutions:institution-detail', + related_view_kwargs={'institution_id': ''}, + ) + + links = LinksField({}) + + def get_absolute_url(self): + return None # there is no detail view for institution-users + + +class NewInstitutionSummaryMetricsSerializer(JSONAPISerializer): + '''serializer for institution-summary metrics + + used only when the INSTITUTIONAL_DASHBOARD_2024 feature flag is active + (and should be renamed without "New" when that flag is permanently active) + ''' + + class Meta: + type_ = 'institution-summary-metrics' + + id = IDField(read_only=True) + + report_yearmonth = YearmonthField(read_only=True) + user_count = ser.IntegerField(read_only=True) + public_project_count = ser.IntegerField(read_only=True) + private_project_count = ser.IntegerField(read_only=True) + public_registration_count = ser.IntegerField(read_only=True) + embargoed_registration_count = ser.IntegerField(read_only=True) + published_preprint_count = ser.IntegerField(read_only=True) + public_file_count = ser.IntegerField(read_only=True) + storage_byte_count = ser.IntegerField(read_only=True) + monthly_logged_in_user_count = ser.IntegerField(read_only=True) + monthly_active_user_count = ser.IntegerField(read_only=True) + + user = RelationshipField( + related_view='users:user-detail', + related_view_kwargs={'user_id': ''}, + ) + institution = RelationshipField( + related_view='institutions:institution-detail', + related_view_kwargs={'institution_id': ''}, + ) + + links = LinksField({}) + + def get_absolute_url(self): + return None # there is no detail view for institution-users + + class InstitutionRelated(JSONAPIRelationshipSerializer): id = ser.CharField(source='_id', required=False, allow_null=True) class Meta: diff --git a/api/institutions/urls.py b/api/institutions/urls.py index be4f9ca0b43..477fe8d9377 100644 --- a/api/institutions/urls.py +++ b/api/institutions/urls.py @@ -13,7 +13,7 @@ re_path(r'^(?P\w+)/relationships/registrations/$', views.InstitutionRegistrationsRelationship.as_view(), name=views.InstitutionRegistrationsRelationship.view_name), re_path(r'^(?P\w+)/relationships/nodes/$', views.InstitutionNodesRelationship.as_view(), name=views.InstitutionNodesRelationship.view_name), re_path(r'^(?P\w+)/users/$', views.InstitutionUserList.as_view(), name=views.InstitutionUserList.view_name), - re_path(r'^(?P\w+)/metrics/summary/$', views.InstitutionSummaryMetrics.as_view(), 
name=views.InstitutionSummaryMetrics.view_name), + re_path(r'^(?P\w+)/metrics/summary/$', views.institution_summary_metrics_detail_view, name=views.institution_summary_metrics_detail_view.view_name), re_path(r'^(?P\w+)/metrics/departments/$', views.InstitutionDepartmentList.as_view(), name=views.InstitutionDepartmentList.view_name), - re_path(r'^(?P\w+)/metrics/users/$', views.InstitutionUserMetricsList.as_view(), name=views.InstitutionUserMetricsList.view_name), + re_path(r'^(?P\w+)/metrics/users/$', views.institution_user_metrics_list_view, name=views.institution_user_metrics_list_view.view_name), ] diff --git a/api/institutions/views.py b/api/institutions/views.py index d21c15e0746..124e523c7e8 100644 --- a/api/institutions/views.py +++ b/api/institutions/views.py @@ -8,12 +8,16 @@ from framework.auth.oauth_scopes import CoreScopes +import osf.features from osf.metrics import InstitutionProjectCounts from osf.models import OSFUser, Node, Institution, Registration from osf.metrics import UserInstitutionProjectCounts +from osf.metrics.reports import InstitutionalUserReport, InstitutionMonthlySummaryReport +from osf.metrics.utils import YearMonth from osf.utils import permissions as osf_permissions from api.base import permissions as base_permissions +from api.base.elasticsearch_dsl_views import ElasticsearchListView from api.base.filters import ListFilterMixin from api.base.views import JSONAPIBaseView from api.base.serializers import JSONAPISerializer @@ -25,9 +29,17 @@ ) from api.base.settings import MAX_SIZE_OF_ES_QUERY from api.base.exceptions import RelationshipPostMakesNoChanges -from api.base.utils import MockQueryset +from api.base.utils import ( + MockQueryset, + toggle_view_by_flag, +) from api.base.settings import DEFAULT_ES_NULL_VALUE from api.metrics.permissions import IsInstitutionalMetricsUser +from api.metrics.renderers import ( + MetricsReportsCsvRenderer, + MetricsReportsTsvRenderer, + MetricsReportsJsonRenderer, +) from api.nodes.serializers import NodeSerializer from api.nodes.filters import NodesFilterMixin from api.users.serializers import UserSerializer @@ -40,7 +52,9 @@ InstitutionRegistrationsRelationshipSerializer, InstitutionSummaryMetricSerializer, InstitutionDepartmentMetricsSerializer, - InstitutionUserMetricsSerializer, + NewInstitutionUserMetricsSerializer, + OldInstitutionUserMetricsSerializer, + NewInstitutionSummaryMetricsSerializer, ) from api.institutions.permissions import UserIsAffiliated from api.institutions.renderers import InstitutionDepartmentMetricsCSVRenderer, InstitutionUserMetricsCSVRenderer, MetricsCSVRenderer @@ -384,7 +398,7 @@ def create(self, *args, **kwargs): return ret -class InstitutionSummaryMetrics(JSONAPIBaseView, generics.RetrieveAPIView, InstitutionMixin): +class _OldInstitutionSummaryMetrics(JSONAPIBaseView, generics.RetrieveAPIView, InstitutionMixin): permission_classes = ( drf_permissions.IsAuthenticatedOrReadOnly, base_permissions.TokenHasScope, @@ -493,10 +507,15 @@ def get_default_queryset(self): return self._make_elasticsearch_results_filterable(search, id=institution._id) -class InstitutionUserMetricsList(InstitutionImpactList): +class _OldInstitutionUserMetricsList(InstitutionImpactList): + '''list view for institution-users metrics + + used only when the INSTITUTIONAL_DASHBOARD_2024 feature flag is NOT active + (and should be removed when that flag is permanently active) + ''' view_name = 'institution-user-metrics' - serializer_class = InstitutionUserMetricsSerializer + serializer_class = 
OldInstitutionUserMetricsSerializer renderer_classes = tuple(api_settings.DEFAULT_RENDERER_CLASSES) + (InstitutionUserMetricsCSVRenderer,) ordering_fields = ('user_name', 'department') @@ -521,3 +540,120 @@ def get_default_queryset(self): institution = self.get_institution() search = UserInstitutionProjectCounts.get_current_user_metrics(institution) return self._make_elasticsearch_results_filterable(search, id=institution._id, department=DEFAULT_ES_NULL_VALUE) + + +class _NewInstitutionUserMetricsList(InstitutionMixin, ElasticsearchListView): + '''list view for institution-users metrics + + used only when the INSTITUTIONAL_DASHBOARD_2024 feature flag is active + (and should be renamed without "New" when that flag is permanently active) + ''' + permission_classes = ( + drf_permissions.IsAuthenticatedOrReadOnly, + base_permissions.TokenHasScope, + IsInstitutionalMetricsUser, + ) + + required_read_scopes = [CoreScopes.INSTITUTION_METRICS_READ] + required_write_scopes = [CoreScopes.NULL] + + view_category = 'institutions' + view_name = 'institution-user-metrics' + renderer_classes = ( + *api_settings.DEFAULT_RENDERER_CLASSES, + MetricsReportsCsvRenderer, + MetricsReportsTsvRenderer, + MetricsReportsJsonRenderer, + ) + + serializer_class = NewInstitutionUserMetricsSerializer + + default_ordering = '-storage_byte_count' + ordering_fields = frozenset(( + 'user_name', + 'department', + 'month_last_login', + 'month_last_active', + 'account_creation_date', + 'public_projects', + 'private_projects', + 'public_registration_count', + 'embargoed_registration_count', + 'published_preprint_count', + 'public_file_count', + 'storage_byte_count', + )) + + def get_default_search(self): + _yearmonth = InstitutionalUserReport.most_recent_yearmonth() + if _yearmonth is None: + return None + return ( + InstitutionalUserReport.search() + .filter('term', report_yearmonth=str(_yearmonth)) + .filter('term', institution_id=self.get_institution()._id) + ) + + +class _NewInstitutionSummaryMetricsDetail(JSONAPIBaseView, generics.RetrieveAPIView, InstitutionMixin): + '''detail view for institution-summary metrics + + used only when the INSTITUTIONAL_DASHBOARD_2024 feature flag is active + (and should be renamed without "New" when that flag is permanently active) + ''' + permission_classes = ( + drf_permissions.IsAuthenticatedOrReadOnly, + base_permissions.TokenHasScope, + IsInstitutionalMetricsUser, + ) + + required_read_scopes = [CoreScopes.INSTITUTION_METRICS_READ] + required_write_scopes = [CoreScopes.NULL] + + view_category = 'institutions' + view_name = 'institution-summary-metrics' + + serializer_class = NewInstitutionSummaryMetricsSerializer + + def get_object(self): + institution = self.get_institution() + search_object = self.get_default_search() + if search_object: + object = search_object.execute()[0] + object.id = institution._id + return object + + def get_default_search(self): + yearmonth = InstitutionMonthlySummaryReport.most_recent_yearmonth() + if report_date_str := self.request.query_params.get('report_yearmonth'): + try: + yearmonth = YearMonth.from_str(report_date_str) + except ValueError: + pass + + if yearmonth is None: + return None + + return InstitutionMonthlySummaryReport.search().filter( + 'term', + report_yearmonth=str(yearmonth), + ).filter( + 'term', + institution_id=self.get_institution()._id, + ) + + +institution_summary_metrics_detail_view = toggle_view_by_flag( + flag_name=osf.features.INSTITUTIONAL_DASHBOARD_2024, + old_view=_OldInstitutionSummaryMetrics.as_view(), + 
new_view=_NewInstitutionSummaryMetricsDetail.as_view(), +) +institution_summary_metrics_detail_view.view_name = 'institution-summary-metrics' + + +institution_user_metrics_list_view = toggle_view_by_flag( + flag_name=osf.features.INSTITUTIONAL_DASHBOARD_2024, + old_view=_OldInstitutionUserMetricsList.as_view(), + new_view=_NewInstitutionUserMetricsList.as_view(), +) +institution_user_metrics_list_view.view_name = 'institution-user-metrics' diff --git a/api/metrics/renderers.py b/api/metrics/renderers.py index fd4bdc78da2..1e33515b68c 100644 --- a/api/metrics/renderers.py +++ b/api/metrics/renderers.py @@ -1,6 +1,6 @@ import csv import io - +import json from django.http import Http404 from rest_framework import renderers @@ -42,11 +42,7 @@ def get_csv_row(keys_list, report_attrs): ] -class MetricsReportsCsvRenderer(renderers.BaseRenderer): - media_type = 'text/csv' - format = 'csv' - CSV_DIALECT = csv.excel - +class MetricsReportsRenderer(renderers.BaseRenderer): def render(self, json_response, accepted_media_type=None, renderer_context=None): serialized_reports = ( jsonapi_resource['attributes'] @@ -67,7 +63,24 @@ def render(self, json_response, accepted_media_type=None, renderer_context=None) return csv_filecontent.getvalue() -class MetricsReportsTsvRenderer(MetricsReportsCsvRenderer): +class MetricsReportsCsvRenderer(MetricsReportsRenderer): + format = 'csv' + extension = 'csv' + media_type = 'text/csv' + CSV_DIALECT = csv.excel + + +class MetricsReportsTsvRenderer(MetricsReportsRenderer): format = 'tsv' + extension = 'tsv' media_type = 'text/tab-separated-values' CSV_DIALECT = csv.excel_tab + + +class MetricsReportsJsonRenderer(renderers.BaseRenderer): + format = 'json_report' + extension = 'json' + media_type = 'application/json' + + def render(self, json_response, accepted_media_type=None, renderer_context=None): + return json.dumps([item['attributes'] for item in json_response['data']]) diff --git a/api/preprints/serializers.py b/api/preprints/serializers.py index 97cc3f3fb7c..b8ad259aa3e 100644 --- a/api/preprints/serializers.py +++ b/api/preprints/serializers.py @@ -289,6 +289,135 @@ def update(self, preprint, validated_data): if not preprint.has_permission(auth.user, osf_permissions.WRITE): raise exceptions.PermissionDenied(detail='User must have admin or write permissions to update a preprint.') + save_preprint = False + recently_published = False + + for field in ['conflict_of_interest_statement', 'why_no_data', 'why_no_prereg']: + if field in validated_data: + value = validated_data[field] + if isinstance(value, str) and not value.strip(): + validated_data[field] = None + + updated_has_coi = validated_data.get('has_coi', preprint.has_coi) + updated_conflict_statement = validated_data.get('conflict_of_interest_statement', preprint.conflict_of_interest_statement) + + updated_has_data_links = validated_data.get('has_data_links', preprint.has_data_links) + updated_why_no_data = validated_data.get('why_no_data', preprint.why_no_data) + + updated_has_prereg_links = validated_data.get('has_prereg_links', preprint.has_prereg_links) + updated_why_no_prereg = validated_data.get('why_no_prereg', preprint.why_no_prereg) + + if updated_has_coi is False and updated_conflict_statement: + raise exceptions.ValidationError( + detail='Cannot provide conflict of interest statement when has_coi is set to False.', + ) + + if updated_has_data_links != 'no' and updated_why_no_data: + raise exceptions.ValidationError( + detail='You cannot edit this statement while your data links availability is set to 
true or is unanswered.', + ) + + if updated_has_data_links == 'no' and 'data_links' in validated_data and validated_data['data_links']: + raise exceptions.ValidationError( + detail='Cannot provide data links when has_data_links is set to "no".', + ) + + if updated_has_prereg_links != 'no' and updated_why_no_prereg: + raise exceptions.ValidationError( + detail='You cannot edit this statement while your prereg links availability is set to true or is unanswered.', + ) + + if updated_has_prereg_links != 'available': + if 'prereg_links' in validated_data and validated_data['prereg_links']: + raise exceptions.ValidationError( + detail='You cannot edit this field while your prereg links availability is set to false or is unanswered.', + ) + if 'prereg_link_info' in validated_data and validated_data['prereg_link_info']: + raise exceptions.ValidationError( + detail='You cannot edit this field while your prereg links availability is set to false or is unanswered.', + ) + + def require_admin_permission(): + if not preprint.has_permission(auth.user, osf_permissions.ADMIN): + raise exceptions.PermissionDenied(detail='Must have admin permissions to update author assertion fields.') + + if 'has_coi' in validated_data: + require_admin_permission() + try: + preprint.update_has_coi(auth, validated_data['has_coi']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'conflict_of_interest_statement' in validated_data: + require_admin_permission() + try: + preprint.update_conflict_of_interest_statement(auth, validated_data['conflict_of_interest_statement']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'has_data_links' in validated_data: + require_admin_permission() + try: + preprint.update_has_data_links(auth, validated_data['has_data_links']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'why_no_data' in validated_data: + require_admin_permission() + try: + preprint.update_why_no_data(auth, validated_data['why_no_data']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'data_links' in validated_data: + require_admin_permission() + try: + preprint.update_data_links(auth, validated_data['data_links']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + else: + if updated_has_data_links == 'no' and preprint.data_links: + preprint.update_data_links(auth, []) + save_preprint = True + + if 'has_prereg_links' in validated_data: + require_admin_permission() + + try: + preprint.update_has_prereg_links(auth, validated_data['has_prereg_links']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'why_no_prereg' in validated_data: + require_admin_permission() + try: + preprint.update_why_no_prereg(auth, validated_data['why_no_prereg']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'prereg_links' in validated_data: + require_admin_permission() + try: + preprint.update_prereg_links(auth, validated_data['prereg_links']) + save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + + if 'prereg_link_info' in validated_data: + require_admin_permission() + try: + preprint.update_prereg_link_info(auth, validated_data['prereg_link_info']) + 
save_preprint = True + except PreprintStateError as e: + raise exceptions.ValidationError(detail=str(e)) + published = validated_data.pop('is_published', None) if published and preprint.provider.is_reviewed: url = absolute_reverse( @@ -369,8 +498,6 @@ def update(self, preprint, validated_data): preprint.custom_publication_citation = validated_data['custom_publication_citation'] or None save_preprint = True - self.handle_author_assertions(preprint, validated_data, auth) - if published is not None: if not preprint.primary_file: raise exceptions.ValidationError( @@ -396,76 +523,6 @@ def update(self, preprint, validated_data): return preprint - def handle_author_assertions(self, preprint, validated_data, auth): - author_assertions = { - 'has_coi', - 'conflict_of_interest_statement', - 'has_data_links', - 'why_no_data', - 'data_links', - 'why_no_prereg', - 'prereg_links', - 'has_prereg_links', - 'prereg_link_info', - } - if author_assertions & validated_data.keys(): - if not preprint.is_admin_contributor(auth.user): - raise exceptions.PermissionDenied('User must be admin to add author assertions') - - if 'has_coi' in validated_data: - try: - preprint.update_has_coi(auth, validated_data['has_coi']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'conflict_of_interest_statement' in validated_data: - try: - preprint.update_conflict_of_interest_statement(auth, validated_data['conflict_of_interest_statement']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'has_data_links' in validated_data: - try: - preprint.update_has_data_links(auth, validated_data['has_data_links']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'why_no_data' in validated_data: - try: - preprint.update_why_no_data(auth, validated_data['why_no_data']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'data_links' in validated_data: - try: - preprint.update_data_links(auth, validated_data['data_links']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'has_prereg_links' in validated_data: - try: - preprint.update_has_prereg_links(auth, validated_data['has_prereg_links']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'why_no_prereg' in validated_data: - try: - preprint.update_why_no_prereg(auth, validated_data['why_no_prereg']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'prereg_links' in validated_data: - try: - preprint.update_prereg_links(auth, validated_data['prereg_links']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - - if 'prereg_link_info' in validated_data: - try: - preprint.update_prereg_link_info(auth, validated_data['prereg_link_info']) - except PreprintStateError as e: - raise exceptions.ValidationError(detail=str(e)) - def set_field(self, func, val, auth, save=False): try: func(val, auth) diff --git a/api/share/utils.py b/api/share/utils.py index 34c9be4609c..4f4137dcf58 100644 --- a/api/share/utils.py +++ b/api/share/utils.py @@ -3,6 +3,7 @@ SHARE/Trove accepts metadata records as "indexcards" in turtle format: https://www.w3.org/TR/turtle/ """ from functools import partial +from http import HTTPStatus import logging import random from urllib.parse import urljoin @@ -17,7 +18,11 @@ from framework.encryption import ensure_bytes from framework.sentry import log_exception from osf 
import models as osf_db -from osf.metadata.tools import pls_gather_metadata_file +from osf.metadata.osf_gathering import ( + OsfmapPartition, + pls_get_magic_metadata_basket, +) +from osf.metadata.serializers import get_metadata_serializer from website import settings @@ -25,7 +30,7 @@ def shtrove_ingest_url(): - return f'{settings.SHARE_URL}api/v3/ingest' + return f'{settings.SHARE_URL}trove/ingest' def sharev2_push_url(): @@ -69,83 +74,100 @@ def _enqueue_update_share(osfresource): enqueue_task(async_update_resource_share.s(_osfguid_value)) -@celery_app.task(bind=True, max_retries=4, acks_late=True) -def task__update_share(self, guid: str, is_backfill=False): +@celery_app.task( + bind=True, + acks_late=True, + max_retries=4, + retry_backoff=True, +) +def task__update_share(self, guid: str, is_backfill=False, osfmap_partition_name='MAIN'): """ - This function updates share takes Preprints, Projects and Registrations. - :param self: - :param guid: - :return: + Send SHARE/trove current metadata record(s) for the osf-guid-identified object """ - resp = _do_update_share(guid, is_backfill=is_backfill) + _osfmap_partition = OsfmapPartition[osfmap_partition_name] + _osfid_instance = apps.get_model('osf.Guid').load(guid) + if _osfid_instance is None: + raise ValueError(f'unknown osfguid "{guid}"') + _resource = _osfid_instance.referent + _is_deletion = _should_delete_indexcard(_resource) + _response = ( + pls_delete_trove_record(_resource, osfmap_partition=_osfmap_partition) + if _is_deletion + else pls_send_trove_record( + _resource, + is_backfill=is_backfill, + osfmap_partition=_osfmap_partition, + ) + ) try: - resp.raise_for_status() + _response.raise_for_status() except Exception as e: - if self.request.retries == self.max_retries: - log_exception(e) - elif resp.status_code >= 500: - try: - self.retry( - exc=e, - countdown=(random.random() + 1) * min(60 + settings.CELERY_RETRY_BACKOFF_BASE ** self.request.retries, 60 * 10), + log_exception(e) + if HTTPStatus(_response.status_code).is_server_error: + raise self.retry(exc=e) + else: # success response + if not _is_deletion: + # enqueue followup task for supplementary metadata + _next_partition = _next_osfmap_partition(_osfmap_partition) + if _next_partition is not None: + task__update_share.delay( + guid, + is_backfill=is_backfill, + osfmap_partition_name=_next_partition.name, ) - except Retry as e: # Retry is only raise after > 5 retries - log_exception(e) - else: - log_exception(e) - - return resp -def pls_send_trove_indexcard(osf_item, *, is_backfill=False): +def pls_send_trove_record(osf_item, *, is_backfill: bool, osfmap_partition: OsfmapPartition): try: _iri = osf_item.get_semantic_iri() except (AttributeError, ValueError): raise ValueError(f'could not get iri for {osf_item}') - _metadata_record = pls_gather_metadata_file(osf_item, 'turtle') + _basket = pls_get_magic_metadata_basket(osf_item) + _serializer = get_metadata_serializer( + format_key='turtle', + basket=_basket, + serializer_config={'osfmap_partition': osfmap_partition}, + ) + _serialized_record = _serializer.serialize() _queryparams = { 'focus_iri': _iri, - 'record_identifier': _shtrove_record_identifier(osf_item), + 'record_identifier': _shtrove_record_identifier(osf_item, osfmap_partition), } if is_backfill: - _queryparams['nonurgent'] = True + _queryparams['nonurgent'] = '' + if osfmap_partition.is_supplementary: + _queryparams['is_supplementary'] = '' + _expiration_date = osfmap_partition.get_expiration_date(_basket) + if _expiration_date is not None: + 
_queryparams['expiration_date'] = str(_expiration_date) return requests.post( shtrove_ingest_url(), params=_queryparams, headers={ - 'Content-Type': _metadata_record.mediatype, + 'Content-Type': _serializer.mediatype, **_shtrove_auth_headers(osf_item), }, - data=ensure_bytes(_metadata_record.serialized_metadata), + data=ensure_bytes(_serialized_record), ) -def pls_delete_trove_indexcard(osf_item): +def pls_delete_trove_record(osf_item, osfmap_partition: OsfmapPartition): return requests.delete( shtrove_ingest_url(), params={ - 'record_identifier': _shtrove_record_identifier(osf_item), + 'record_identifier': _shtrove_record_identifier(osf_item, osfmap_partition), }, headers=_shtrove_auth_headers(osf_item), ) -def _do_update_share(osfguid: str, *, is_backfill=False): - logger.debug('%s._do_update_share("%s", is_backfill=%s)', __name__, osfguid, is_backfill) - _guid_instance = apps.get_model('osf.Guid').load(osfguid) - if _guid_instance is None: - raise ValueError(f'unknown osfguid "{osfguid}"') - _resource = _guid_instance.referent - _response = ( - pls_delete_trove_indexcard(_resource) - if _should_delete_indexcard(_resource) - else pls_send_trove_indexcard(_resource, is_backfill=is_backfill) +def _shtrove_record_identifier(osf_item, osfmap_partition: OsfmapPartition): + _id = osf_item.guids.values_list('_id', flat=True).first() + return ( + f'{_id}/{osfmap_partition.name}' + if osfmap_partition.is_supplementary + else _id ) - return _response - - -def _shtrove_record_identifier(osf_item): - return osf_item.guids.values_list('_id', flat=True).first() def _shtrove_auth_headers(osf_item): @@ -182,6 +204,16 @@ def _is_item_public(guid_referent) -> bool: return getattr(guid_referent, 'is_public', False) # quacks like AbstractNode +def _next_osfmap_partition(partition: OsfmapPartition) -> OsfmapPartition | None: + match partition: + case OsfmapPartition.MAIN: + return OsfmapPartition.SUPPLEMENT + case OsfmapPartition.SUPPLEMENT: + return OsfmapPartition.MONTHLY_SUPPLEMENT + case _: + return None + + ### # BEGIN soon-to-be-deleted (🤞) legacy sharev2 push # (until dust has settled on iri-centric (rdf-based) search) diff --git a/api_tests/base/test_views.py b/api_tests/base/test_views.py index 6d4a35c07e0..212ebed351a 100644 --- a/api_tests/base/test_views.py +++ b/api_tests/base/test_views.py @@ -43,9 +43,9 @@ if hasattr(patt, 'url_patterns'): # Namespaced list of patterns for subpatt in patt.url_patterns: - VIEW_CLASSES.append(subpatt.callback.cls) + VIEW_CLASSES.append(subpatt.callback.view_class) else: - VIEW_CLASSES.append(patt.callback.cls) + VIEW_CLASSES.append(patt.callback.view_class) class TestApiBaseViews(ApiTestCase): diff --git a/api_tests/institutions/views/test_institution_department_list.py b/api_tests/institutions/views/test_institution_department_list.py index 5a22d17fdff..f2a335eed85 100644 --- a/api_tests/institutions/views/test_institution_department_list.py +++ b/api_tests/institutions/views/test_institution_department_list.py @@ -10,7 +10,7 @@ from osf.metrics import UserInstitutionProjectCounts -@pytest.mark.es +@pytest.mark.es_metrics @pytest.mark.django_db class TestInstitutionDepartmentList: diff --git a/api_tests/institutions/views/test_institution_detail.py b/api_tests/institutions/views/test_institution_detail.py index e21e3a7087b..a8d81f7138f 100644 --- a/api_tests/institutions/views/test_institution_detail.py +++ b/api_tests/institutions/views/test_institution_detail.py @@ -1,6 +1,9 @@ import pytest -from osf_tests.factories import InstitutionFactory +from 
osf_tests.factories import ( + AuthUserFactory, + InstitutionFactory, +) from api.base.settings.defaults import API_BASE from django.core.validators import URLValidator @@ -11,6 +14,8 @@ class TestInstitutionDetail: 'nodes', 'registrations', 'users', + } + expected_metrics_relationships = { 'department_metrics', 'user_metrics', 'summary_metrics' @@ -26,34 +31,55 @@ def institution(self): def url(self, institution): return f'/{API_BASE}institutions/{institution._id}/' - def test_detail_response(self, app, institution, url): - - # 404 on wrong _id - res = app.get(f'/{institution}institutions/1PO/', expect_errors=True) - assert res.status_code == 404 - - res = app.get(url) - assert res.status_code == 200 - attrs = res.json['data']['attributes'] - assert attrs['name'] == institution.name - assert attrs['iri'] == institution.identifier_domain - assert attrs['ror_iri'] == institution.ror_uri - assert set(attrs['iris']) == { - institution.ror_uri, - institution.identifier_domain, - institution.absolute_url, - } - assert 'logo_path' in attrs - assert set(attrs['assets'].keys()) == {'logo', 'logo_rounded', 'banner'} - assert res.json['data']['links']['self'].endswith(url) - - relationships = res.json['data']['relationships'] - assert self.expected_relationships == set(relationships.keys()) - for relationships in list(relationships.values()): - # ↓ returns None if url is valid else throws error. - assert self.is_valid_url(relationships['links']['related']['href']) is None - - # test_return_without_logo_path - res = app.get(f'{url}?version=2.14') - assert res.status_code == 200 - assert 'logo_path' not in res.json['data']['attributes'] + @pytest.fixture() + def rando(self): + return AuthUserFactory() + + @pytest.fixture() + def institutional_admin(self, institution): + _admin_user = AuthUserFactory() + institution.get_group('institutional_admins').user_set.add(_admin_user) + return _admin_user + + def test_detail_response(self, app, institution, url, rando, institutional_admin): + + for _user in (None, rando, institutional_admin): + _auth = (None if _user is None else _user.auth) + # 404 on wrong _id + res = app.get(f'/{institution}institutions/1PO/', expect_errors=True, auth=_auth) + assert res.status_code == 404 + + res = app.get(url, auth=_auth) + assert res.status_code == 200 + attrs = res.json['data']['attributes'] + assert attrs['name'] == institution.name + assert attrs['iri'] == institution.identifier_domain + assert attrs['ror_iri'] == institution.ror_uri + assert set(attrs['iris']) == { + institution.ror_uri, + institution.identifier_domain, + institution.absolute_url, + } + assert 'logo_path' in attrs + assert set(attrs['assets'].keys()) == {'logo', 'logo_rounded', 'banner'} + if _user is institutional_admin: + assert attrs['link_to_external_reports_archive'] == institution.link_to_external_reports_archive + else: + assert 'link_to_external_reports_archive' not in attrs + assert res.json['data']['links']['self'].endswith(url) + + relationships = res.json['data']['relationships'] + _expected_relationships = ( + self.expected_relationships | self.expected_metrics_relationships + if _user is institutional_admin + else self.expected_relationships + ) + assert _expected_relationships == set(relationships.keys()) + for relationships in list(relationships.values()): + # ↓ returns None if url is valid else throws error. 
+ assert self.is_valid_url(relationships['links']['related']['href']) is None + + # test_return_without_logo_path + res = app.get(f'{url}?version=2.14', auth=_auth) + assert res.status_code == 200 + assert 'logo_path' not in res.json['data']['attributes'] diff --git a/api_tests/institutions/views/test_institution_summary_metrics.py b/api_tests/institutions/views/test_institution_summary_metrics.py index b29998d5561..f1641ea923c 100644 --- a/api_tests/institutions/views/test_institution_summary_metrics.py +++ b/api_tests/institutions/views/test_institution_summary_metrics.py @@ -1,15 +1,19 @@ import pytest import datetime +from waffle.testutils import override_flag +from osf.metrics import InstitutionProjectCounts + from api.base.settings.defaults import API_BASE from osf_tests.factories import ( + InstitutionFactory, AuthUserFactory, - InstitutionFactory ) -from osf.metrics import InstitutionProjectCounts +from osf.metrics.reports import InstitutionMonthlySummaryReport +from osf import features -@pytest.mark.es +@pytest.mark.es_metrics @pytest.mark.django_db class TestInstitutionSummaryMetrics: @@ -92,3 +96,251 @@ def test_get(self, app, url, institution, user, admin): 'self': f'http://localhost:8000/v2/institutions/{institution._id}/metrics/summary/' } } + + +@pytest.mark.es_metrics +@pytest.mark.django_db +class TestNewInstitutionSummaryMetricsList: + @pytest.fixture(autouse=True) + def _waffled(self): + with override_flag(features.INSTITUTIONAL_DASHBOARD_2024, active=True): + yield + + @pytest.fixture() + def institution(self): + return InstitutionFactory() + + @pytest.fixture() + def rando(self): + return AuthUserFactory() + + @pytest.fixture() + def institutional_admin(self, institution): + admin_user = AuthUserFactory() + institution.get_group('institutional_admins').user_set.add(admin_user) + return admin_user + + @pytest.fixture() + def unshown_reports(self, institution): + # Reports that should not be shown in the results + # Report from another institution + another_institution = InstitutionFactory() + _summary_report_factory('2024-08', another_institution) + # Old report from the same institution + _summary_report_factory('2024-07', institution) + _summary_report_factory('2018-02', institution) + + @pytest.fixture() + def reports(self, institution): + return [ + _summary_report_factory( + '2024-08', institution, + user_count=100, + public_project_count=50, + private_project_count=25, + public_registration_count=10, + embargoed_registration_count=5, + published_preprint_count=15, + public_file_count=20, + storage_byte_count=5000000000, + monthly_logged_in_user_count=80, + monthly_active_user_count=60, + ), + _summary_report_factory( + '2024-08', institution, + user_count=200, + public_project_count=150, + private_project_count=125, + public_registration_count=110, + embargoed_registration_count=105, + published_preprint_count=115, + public_file_count=120, + storage_byte_count=15000000000, + monthly_logged_in_user_count=180, + monthly_active_user_count=160, + ), + ] + + @pytest.fixture() + def url(self, institution): + return f'/{API_BASE}institutions/{institution._id}/metrics/summary/' + + def test_anon(self, app, url): + resp = app.get(url, expect_errors=True) + assert resp.status_code == 401 + + def test_rando(self, app, url, rando): + resp = app.get(url, auth=rando.auth, expect_errors=True) + assert resp.status_code == 403 + + def test_get_empty(self, app, url, institutional_admin): + resp = app.get(url, auth=institutional_admin.auth) + assert resp.status_code == 200 + assert 
resp.json['meta'] == {'version': '2.0'} + + def test_get_report(self, app, url, institutional_admin, institution, reports, unshown_reports): + resp = app.get(url, auth=institutional_admin.auth) + assert resp.status_code == 200 + + data = resp.json['data'] + + assert data['id'] == institution._id + assert data['type'] == 'institution-summary-metrics' + + attributes = data['attributes'] + assert attributes['report_yearmonth'] == '2024-08' + assert attributes['user_count'] == 200 + assert attributes['public_project_count'] == 150 + assert attributes['private_project_count'] == 125 + assert attributes['public_registration_count'] == 110 + assert attributes['embargoed_registration_count'] == 105 + assert attributes['published_preprint_count'] == 115 + assert attributes['public_file_count'] == 120 + assert attributes['storage_byte_count'] == 15000000000 + assert attributes['monthly_logged_in_user_count'] == 180 + assert attributes['monthly_active_user_count'] == 160 + + def test_get_report_with_multiple_months_and_institutions( + self, app, url, institutional_admin, institution + ): + # Create reports for multiple months and institutions + other_institution = InstitutionFactory() + _summary_report_factory( + '2024-09', institution, + user_count=250, + public_project_count=200, + private_project_count=150, + public_registration_count=120, + embargoed_registration_count=110, + published_preprint_count=130, + public_file_count=140, + storage_byte_count=20000000000, + monthly_logged_in_user_count=220, + monthly_active_user_count=200, + ) + _summary_report_factory( + '2024-08', institution, + user_count=200, + public_project_count=150, + private_project_count=125, + public_registration_count=110, + embargoed_registration_count=105, + published_preprint_count=115, + public_file_count=120, + storage_byte_count=15000000000, + monthly_logged_in_user_count=180, + monthly_active_user_count=160, + ) + _summary_report_factory( + '2024-09', other_institution, + user_count=300, + public_project_count=250, + private_project_count=200, + public_registration_count=180, + embargoed_registration_count=170, + published_preprint_count=190, + public_file_count=210, + storage_byte_count=25000000000, + monthly_logged_in_user_count=270, + monthly_active_user_count=260, + ) + + resp = app.get(url, auth=institutional_admin.auth) + assert resp.status_code == 200 + + data = resp.json['data'] + + assert data['id'] == institution._id + assert data['type'] == 'institution-summary-metrics' + + attributes = data['attributes'] + + assert attributes['report_yearmonth'] == '2024-09' + assert attributes['user_count'] == 250 + assert attributes['public_project_count'] == 200 + assert attributes['private_project_count'] == 150 + assert attributes['public_registration_count'] == 120 + assert attributes['embargoed_registration_count'] == 110 + assert attributes['published_preprint_count'] == 130 + assert attributes['public_file_count'] == 140 + assert attributes['storage_byte_count'] == 20000000000 + assert attributes['monthly_logged_in_user_count'] == 220 + assert attributes['monthly_active_user_count'] == 200 + + def test_get_with_valid_report_dates(self, app, url, institution, institutional_admin): + _summary_report_factory( + '2024-08', + institution, + user_count=0, + ) + _summary_report_factory( + '2024-09', + institution, + user_count=999, + + ) + _summary_report_factory( + '2018-02', + institution, + user_count=4133, + ) + + resp = app.get(f'{url}?report_yearmonth=2024-08', auth=institutional_admin.auth) + assert resp.status_code 
== 200 + + attributes = resp.json['data']['attributes'] + assert attributes['user_count'] == 0 + + resp = app.get(f'{url}?report_yearmonth=2018-02', auth=institutional_admin.auth) + assert resp.status_code == 200 + + attributes = resp.json['data']['attributes'] + assert attributes['user_count'] == 4133 + + def test_get_with_invalid_report_date(self, app, url, institution, institutional_admin): + _summary_report_factory( + '2024-08', + institution, + user_count=0, + ) + _summary_report_factory( + '2024-09', + institution, + user_count=999, + ) + + # Request with an invalid report_date format + resp = app.get(f'{url}?report_yearmonth=invalid-date', auth=institutional_admin.auth) + assert resp.status_code == 200 + + # Verify it defaults to the most recent report data + attributes = resp.json['data']['attributes'] + assert attributes['user_count'] == 999 + + def test_get_without_report_date_uses_most_recent(self, app, url, institution, institutional_admin): + _summary_report_factory( + '2024-08', + institution, + user_count=0, + ) + _summary_report_factory( + '2024-09', + institution, + user_count=999, + ) + + resp = app.get(url, auth=institutional_admin.auth) + assert resp.status_code == 200 + + attributes = resp.json['data']['attributes'] + assert attributes['user_count'] == 999 + + +def _summary_report_factory(yearmonth, institution, **kwargs): + report = InstitutionMonthlySummaryReport( + report_yearmonth=yearmonth, + institution_id=institution._id, + **kwargs, + ) + report.save(refresh=True) + return report diff --git a/api_tests/institutions/views/test_institution_user_metric_list.py b/api_tests/institutions/views/test_institution_user_metric_list.py index dfee4d178f5..b1bf3490788 100644 --- a/api_tests/institutions/views/test_institution_user_metric_list.py +++ b/api_tests/institutions/views/test_institution_user_metric_list.py @@ -1,22 +1,31 @@ -import pytest import datetime import csv +import json from io import StringIO from random import random -import time +from urllib.parse import urlencode + +import pytest +from waffle.testutils import override_flag -from api.base.settings.defaults import API_BASE, DEFAULT_ES_NULL_VALUE +from api.base.settings.defaults import API_BASE, DEFAULT_ES_NULL_VALUE, REPORT_FILENAME_FORMAT +import osf.features from osf_tests.factories import ( InstitutionFactory, AuthUserFactory, ) from osf.metrics import UserInstitutionProjectCounts -from api.base import settings +from osf.metrics.reports import InstitutionalUserReport -@pytest.mark.es +@pytest.mark.es_metrics @pytest.mark.django_db -class TestInstitutionUserMetricList: +class TestOldInstitutionUserMetricList: + + @pytest.fixture(autouse=True) + def _waffled(self): + with override_flag(osf.features.INSTITUTIONAL_DASHBOARD_2024, active=False): + yield # these tests apply only before institution dashboard improvements @pytest.fixture() def institution(self): @@ -52,33 +61,31 @@ def admin(self, institution): @pytest.fixture() def populate_counts(self, institution, user, user2): # Old data that shouldn't appear in responses - UserInstitutionProjectCounts.record( + UserInstitutionProjectCounts( user_id=user._id, institution_id=institution._id, department='Biology dept', public_project_count=4, private_project_count=4, timestamp=datetime.date(2019, 6, 4) - ).save() + ).save(refresh=True) # New data - UserInstitutionProjectCounts.record( + UserInstitutionProjectCounts( user_id=user._id, institution_id=institution._id, department='Biology dept', public_project_count=6, private_project_count=5, - ).save() + 
).save(refresh=True) - UserInstitutionProjectCounts.record( + UserInstitutionProjectCounts( user_id=user2._id, institution_id=institution._id, department='Psychology dept', public_project_count=3, private_project_count=2, - ).save() - - time.sleep(10) + ).save(refresh=True) @pytest.fixture() def populate_more_counts(self, institution, user, user2, user3, populate_counts): @@ -89,34 +96,30 @@ def populate_more_counts(self, institution, user, user2, user3, populate_counts) users.append(AuthUserFactory()) for test_user in users: - UserInstitutionProjectCounts.record( + UserInstitutionProjectCounts( user_id=test_user._id, institution_id=institution._id, department='Psychology dept', public_project_count=int(10 * random()), private_project_count=int(10 * random()), - ).save() + ).save(refresh=True) - UserInstitutionProjectCounts.record( + UserInstitutionProjectCounts( user_id=user3._id, institution_id=institution._id, department='Psychology dept', public_project_count=int(10 * random()), private_project_count=int(10 * random()), - ).save() - - time.sleep(10) + ).save(refresh=True) @pytest.fixture() def populate_na_department(self, institution, user4): - UserInstitutionProjectCounts.record( + UserInstitutionProjectCounts( user_id=user4._id, institution_id=institution._id, public_project_count=1, private_project_count=1, - ).save() - - time.sleep(10) + ).save(refresh=True) @pytest.fixture() def url(self, institution): @@ -218,7 +221,6 @@ def test_filter(self, app, url, admin, populate_counts): resp = app.get(f'{url}?filter[department]=Psychology dept', auth=admin.auth) assert resp.json['data'][0]['attributes']['department'] == 'Psychology dept' - @pytest.mark.skipif(settings.CI_ENV, reason='Non-deterministic fails on CI') def test_sort_and_pagination(self, app, url, user, user2, user3, admin, populate_counts, populate_more_counts, institution): resp = app.get(f'{url}?sort=user_name&page[size]=1&page=2', auth=admin.auth) assert resp.status_code == 200 @@ -229,7 +231,6 @@ def test_sort_and_pagination(self, app, url, user, user2, user3, admin, populate assert resp.json['links']['meta']['total'] == 11 assert resp.json['data'][-1]['attributes']['user_name'] == 'Zedd' - @pytest.mark.skipif(settings.CI_ENV, reason='Non-deterministic fails on CI') def test_filter_and_pagination(self, app, user, user2, user3, url, admin, populate_counts, populate_more_counts, institution): resp = app.get(f'{url}?page=2', auth=admin.auth) assert resp.json['links']['meta']['total'] == 11 @@ -238,7 +239,6 @@ def test_filter_and_pagination(self, app, user, user2, user3, url, admin, popula assert resp.json['links']['meta']['total'] == 1 assert resp.json['data'][0]['attributes']['user_name'] == 'Zedd' - @pytest.mark.skipif(settings.CI_ENV, reason='Non-deterministic fails on CI') def test_filter_and_sort(self, app, url, user, user2, user3, admin, user4, populate_counts, populate_na_department, institution): """ Testing for bug where sorting and filtering would throw 502. 
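# A minimal sketch (not code from this changeset) of the pattern the rewritten
# fixtures above now follow: UserInstitutionProjectCounts documents are constructed
# directly and saved with refresh=True, which refreshes the Elasticsearch metrics
# index so the write is searchable immediately. Assumes the `institution` and `user`
# fixtures defined earlier in this class and the module-level import of
# UserInstitutionProjectCounts shown at the top of the file.
UserInstitutionProjectCounts(
    user_id=user._id,
    institution_id=institution._id,
    department='Biology dept',
    public_project_count=6,
    private_project_count=5,
).save(refresh=True)  # immediately visible to queries; no time.sleep(10) or CI skip markers needed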
@@ -265,3 +265,379 @@ def test_filter_and_sort(self, app, url, user, user2, user3, admin, user4, popul assert data[0]['attributes']['department'] == 'Biology dept' assert data[1]['attributes']['department'] == 'N/A' assert data[2]['attributes']['department'] == 'Psychology dept' + + +@pytest.mark.es_metrics +@pytest.mark.django_db +class TestNewInstitutionUserMetricList: + @pytest.fixture(autouse=True) + def _waffled(self): + with override_flag(osf.features.INSTITUTIONAL_DASHBOARD_2024, active=True): + yield # these tests apply only after institution dashboard improvements + + @pytest.fixture() + def institution(self): + return InstitutionFactory() + + @pytest.fixture() + def rando(self): + return AuthUserFactory() + + @pytest.fixture() + def institutional_admin(self, institution): + _admin_user = AuthUserFactory() + institution.get_group('institutional_admins').user_set.add(_admin_user) + return _admin_user + + @pytest.fixture() + def unshown_reports(self, institution): + # unshown because another institution + _another_institution = InstitutionFactory() + _report_factory('2024-08', _another_institution, user_id='nother_inst') + # unshown because old + _report_factory('2024-07', institution, user_id='old') + + @pytest.fixture() + def reports(self, institution): + return [ + _report_factory( + '2024-08', institution, + user_id='u_sparse', + storage_byte_count=53, + ), + _report_factory( + '2024-08', institution, + user_id='u_orc', + orcid_id='5555-4444-3333-2222', + storage_byte_count=8277, + ), + _report_factory( + '2024-08', institution, + user_id='u_blargl', + department_name='blargl', + storage_byte_count=34834834, + ), + _report_factory( + '2024-08', institution, + user_id='u_orcomma', + orcid_id='4444-3333-2222-1111', + department_name='a department, or so, that happens, incidentally, to have commas', + storage_byte_count=736662999298, + ), + ] + + @pytest.fixture() + def url(self, institution): + return f'/{API_BASE}institutions/{institution._id}/metrics/users/' + + def test_anon(self, app, url): + _resp = app.get(url, expect_errors=True) + assert _resp.status_code == 401 + + def test_rando(self, app, url, rando): + _resp = app.get(url, auth=rando.auth, expect_errors=True) + assert _resp.status_code == 403 + + def test_get_empty(self, app, url, institutional_admin): + _resp = app.get(url, auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert _resp.json['data'] == [] + + def test_get_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + _resp = app.get(url, auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert len(_resp.json['data']) == len(reports) + _expected_user_ids = {_report.user_id for _report in reports} + assert set(_user_ids(_resp)) == _expected_user_ids + + def test_filter_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + for _query, _expected_user_ids in ( + ({'filter[department]': 'nunavum'}, set()), + ({'filter[department]': 'incidentally'}, set()), + ({'filter[department]': 'blargl'}, {'u_blargl'}), + ({'filter[department]': 'a department, or so, that happens, incidentally, to have commas'}, {'u_orcomma'}), + ({'filter[department][eq]': 'nunavum'}, set()), + ({'filter[department][eq]': 'blargl'}, {'u_blargl'}), + ({'filter[department][eq]': 'a department, or so, that happens, incidentally, to have commas'}, {'u_orcomma'}), + ({'filter[department][ne]': 'nunavum'}, {'u_sparse', 'u_blargl', 'u_orc', 'u_orcomma'}), + + ({'filter[orcid_id][eq]': 
'5555-4444-3333-2222'}, {'u_orc'}), + ({'filter[orcid_id][ne]': ''}, {'u_orc', 'u_orcomma'}), + ({'filter[orcid_id][eq]': ''}, {'u_sparse', 'u_blargl'}), + ({ + 'filter[orcid_id]': '', + 'filter[department]': 'blargl', + }, {'u_blargl'}), + ({ + 'filter[orcid_id]': '', + 'filter[department][ne]': 'blargl', + }, {'u_sparse'}), + ({ + 'filter[orcid_id]': '5555-4444-3333-2222', + 'filter[department][ne]': 'blargl', + }, {'u_orc'}), + ({ + 'filter[orcid_id]': '5555-4444-3333-2222', + 'filter[department][ne]': '', + }, set()), + ): + _resp = app.get(f'{url}?{urlencode(_query)}', auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert set(_user_ids(_resp)) == _expected_user_ids + + def test_sort_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + for _query, _expected_user_id_list in ( + ({'sort': 'storage_byte_count'}, ['u_sparse', 'u_orc', 'u_blargl', 'u_orcomma']), + ({'sort': '-storage_byte_count'}, ['u_orcomma', 'u_blargl', 'u_orc', 'u_sparse']), + ): + _resp = app.get(f'{url}?{urlencode(_query)}', auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert list(_user_ids(_resp)) == _expected_user_id_list + + def test_paginate_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + for _query, _expected_user_id_list in ( + ({'sort': 'storage_byte_count', 'page[size]': 2}, ['u_sparse', 'u_orc']), + ({'sort': 'storage_byte_count', 'page[size]': 2, 'page': 2}, ['u_blargl', 'u_orcomma']), + ({'sort': '-storage_byte_count', 'page[size]': 3}, ['u_orcomma', 'u_blargl', 'u_orc']), + ({'sort': '-storage_byte_count', 'page[size]': 3, 'page': 2}, ['u_sparse']), + ): + _resp = app.get(f'{url}?{urlencode(_query)}', auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert list(_user_ids(_resp)) == _expected_user_id_list + + @pytest.mark.parametrize('format_type, delimiter, content_type', [ + ('csv', ',', 'text/csv; charset=utf-8'), + ('tsv', '\t', 'text/tab-separated-values; charset=utf-8') + ]) + def test_get_report_formats_csv_tsv(self, app, url, institutional_admin, institution, format_type, delimiter, + content_type): + _report_factory( + '2024-08', + institution, + user_id='u_orcomma', + account_creation_date='2018-02', + user_name='Jason Kelce', + orcid_id='4444-3333-2222-1111', + department_name='Center, \t Greatest Ever', + storage_byte_count=736662999298, + embargoed_registration_count=1, + published_preprint_count=1, + public_registration_count=2, + public_project_count=3, + public_file_count=4, + private_project_count=5, + month_last_active='2018-02', + month_last_login='2018-02', + ) + + resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth) + assert resp.status_code == 200 + assert resp.headers['Content-Type'] == content_type + + current_date = datetime.datetime.now().strftime('%Y-%m') + expected_filename = REPORT_FILENAME_FORMAT.format( + view_name='institution-user-metrics', + date_created=current_date, + extension=format_type + ) + assert resp.headers['Content-Disposition'] == f'attachment; filename="{expected_filename}"' + + response_body = resp.text + expected_response = [ + [ + 'report_yearmonth', + 'account_creation_date', + 'department', + 'embargoed_registration_count', + 'month_last_active', + 'month_last_login', + 'orcid_id', + 'private_projects', + 'public_file_count', + 'public_projects', + 'public_registration_count', + 'published_preprint_count', + 'storage_byte_count', + 'user_name' + ], + [ + '2024-08', + '2018-02', + 'Center, \t Greatest 
Ever', + '1', + '2018-02', + '2018-02', + '4444-3333-2222-1111', + '5', + '4', + '3', + '2', + '1', + '736662999298', + 'Jason Kelce' + ] + ] + + if delimiter: + with StringIO(response_body) as file: + reader = csv.reader(file, delimiter=delimiter) + response_rows = list(reader) + assert response_rows[0] == expected_response[0] + assert sorted(response_rows[1:]) == sorted(expected_response[1:]) + + @pytest.mark.parametrize('format_type, delimiter, content_type', [ + ('csv', ',', 'text/csv; charset=utf-8'), + ('tsv', '\t', 'text/tab-separated-values; charset=utf-8') + ]) + def test_csv_tsv_ignores_pagination(self, app, url, institutional_admin, institution, format_type, delimiter, + content_type): + # Create 15 records, exceeding the default page size of 10 + num_records = 15 + expected_data = [] + for i in range(num_records): + _report_factory( + '2024-08', + institution, + user_id=f'u_orcomma_{i}', + account_creation_date='2018-02', + user_name=f'Jalen Hurts #{i}', + orcid_id=f'4444-3333-2222-111{i}', + department_name='QBatman', + storage_byte_count=736662999298 + i, + embargoed_registration_count=1, + published_preprint_count=1, + public_registration_count=2, + public_project_count=3, + public_file_count=4, + private_project_count=5, + month_last_active='2018-02', + month_last_login='2018-02', + ) + expected_data.append([ + '2024-08', + '2018-02', + 'QBatman', + '1', + '2018-02', + '2018-02', + f'4444-3333-2222-111{i}', + '5', + '4', + '3', + '2', + '1', + str(736662999298 + i), + f'Jalen Hurts #{i}', + ]) + + # Make request for CSV format with page[size]=10 + resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth) + assert resp.status_code == 200 + assert resp.headers['Content-Type'] == content_type + + current_date = datetime.datetime.now().strftime('%Y-%m') + expected_filename = REPORT_FILENAME_FORMAT.format( + view_name='institution-user-metrics', + date_created=current_date, + extension=format_type + ) + assert resp.headers['Content-Disposition'] == f'attachment; filename="{expected_filename}"' + + # Validate the CSV content contains all 15 records, ignoring the default pagination of 10 + response_body = resp.text + rows = response_body.splitlines() + + assert len(rows) == num_records + 1 == 16 # 1 header + 15 records + + if delimiter: + with StringIO(response_body) as file: + reader = csv.reader(file, delimiter=delimiter) + response_rows = list(reader) + # Validate header row + expected_header = [ + 'report_yearmonth', + 'account_creation_date', + 'department', + 'embargoed_registration_count', + 'month_last_active', + 'month_last_login', + 'orcid_id', + 'private_projects', + 'public_file_count', + 'public_projects', + 'public_registration_count', + 'published_preprint_count', + 'storage_byte_count', + 'user_name' + ] + assert response_rows[0] == expected_header + # Sort both expected and actual rows (ignoring the header) before comparison + assert sorted(response_rows[1:]) == sorted(expected_data) + + def test_get_report_format_table_json(self, app, url, institutional_admin, institution): + _report_factory( + '2024-08', + institution, + user_id='u_orcomma', + account_creation_date='2018-02', + user_name='Brian Dawkins', + orcid_id='4444-3333-2222-1111', + department_name='Safety "The Wolverine" Weapon X', + storage_byte_count=736662999298, + embargoed_registration_count=1, + published_preprint_count=1, + public_registration_count=2, + public_project_count=3, + public_file_count=4, + private_project_count=5, + month_last_active='2018-02', + 
month_last_login='2018-02', + ) + + resp = app.get(f'{url}?format=json_report', auth=institutional_admin.auth) + assert resp.status_code == 200 + assert resp.headers['Content-Type'] == 'application/json; charset=utf-8' + + current_date = datetime.datetime.now().strftime('%Y-%m') + expected_filename = REPORT_FILENAME_FORMAT.format( + view_name='institution-user-metrics', + date_created=current_date, + extension='json' + ) + assert resp.headers['Content-Disposition'] == f'attachment; filename="{expected_filename}"' + + # Validate JSON structure and content + response_data = json.loads(resp.body) + expected_data = [ + { + 'report_yearmonth': '2024-08', + 'account_creation_date': '2018-02', + 'department': 'Safety "The Wolverine" Weapon X', + 'embargoed_registration_count': 1, + 'month_last_active': '2018-02', + 'month_last_login': '2018-02', + 'orcid_id': '4444-3333-2222-1111', + 'private_projects': 5, + 'public_file_count': 4, + 'public_projects': 3, + 'public_registration_count': 2, + 'published_preprint_count': 1, + 'storage_byte_count': 736662999298, + 'user_name': 'Brian Dawkins' + } + ] + assert response_data == expected_data + + +def _user_ids(api_response): + for _datum in api_response.json['data']: + yield _datum['relationships']['user']['data']['id'] + +def _report_factory(yearmonth, institution, **kwargs): + _report = InstitutionalUserReport( + report_yearmonth=yearmonth, + institution_id=institution._id, + **kwargs, + ) + _report.save(refresh=True) + return _report diff --git a/api_tests/metrics/test_composite_query.py b/api_tests/metrics/test_composite_query.py index fd36c0c5f24..0cd0b3bb180 100644 --- a/api_tests/metrics/test_composite_query.py +++ b/api_tests/metrics/test_composite_query.py @@ -29,7 +29,7 @@ def base_url(): return f'/{API_BASE}metrics/preprints/' -@pytest.mark.es +@pytest.mark.es_metrics @pytest.mark.django_db class TestElasticSearch: diff --git a/api_tests/metrics/test_counted_usage.py b/api_tests/metrics/test_counted_usage.py index 9e20f2c0238..568d663be9e 100644 --- a/api_tests/metrics/test_counted_usage.py +++ b/api_tests/metrics/test_counted_usage.py @@ -99,8 +99,8 @@ def test_by_client_session_id(self, app, mock_save, user): assert resp.status_code == 201 assert_saved_with( mock_save, - # doc_id: sha256(b'http://example.foo/|http://example.foo/blahblah/blee|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3').hexdigest() - expected_doc_id='55fffffdc0d674d15a5e8763d14e4ae90f658fbfb6fbf94f88a5d24978f02e72', + # doc_id: sha256(b'http://example.foo/|http://example.foo/blahblah/blee|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3|api,view').hexdigest() + expected_doc_id='3239044c7462dd318edd0522a0ed7d84b9c6502ef16cb40dfcae6c1f456d57a2', expected_attrs={ 'platform_iri': 'http://example.foo/', 'item_guid': 'zyxwv', @@ -132,8 +132,8 @@ def test_by_client_session_id_anon(self, app, mock_save): assert resp.status_code == 201 assert_saved_with( mock_save, - # doc_id: sha256(b'http://example.foo/|http://example.foo/bliz/|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3').hexdigest() - expected_doc_id='e559ffbc4bd3e3e69252d34c273f0e771ec89ee455ec9b60fbbadf3944e4af4e', + # doc_id: sha256(b'http://example.foo/|http://example.foo/bliz/|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3|view,web').hexdigest() + expected_doc_id='d01759e963893f9dc9b2ccf016a5ef29135673779802b5578f31449543677e82', expected_attrs={ 'platform_iri': 'http://example.foo/', 'item_guid': 
'zyxwv', @@ -166,8 +166,8 @@ def test_by_user_auth(self, app, mock_save, user): assert resp.status_code == 201 assert_saved_with( mock_save, - # doc_id: sha256(b'http://example.foo/|http://osf.io/mst3k|ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a|1981-01-01|3').hexdigest() - expected_doc_id='743494d8a55079b91e202da1dbdfce5aea72e310c57a34b36df2c2af5ed4d362', + # doc_id: sha256(b'http://example.foo/|http://osf.io/mst3k|ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a|1981-01-01|3|view,web').hexdigest() + expected_doc_id='7b8bc27c6d90fb45aa5bbd02deceba9f7384ed61b9a6e7253317c262020b94c2', expected_attrs={ 'platform_iri': 'http://example.foo/', 'item_guid': 'yxwvu', @@ -196,8 +196,8 @@ def test_by_useragent_header(self, app, mock_save): assert resp.status_code == 201 assert_saved_with( mock_save, - # doc_id: sha256(b'http://example.foo/|yxwvu|97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a|1981-01-01|3').hexdigest() - expected_doc_id='a50ac1b2dc1c918cdea7be50b005117fdb6ee00ea069ca3aa4aaf03c0f905fa0', + # doc_id: sha256(b'http://example.foo/|yxwvu|97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a|1981-01-01|3|api,view').hexdigest() + expected_doc_id='d669528b30f443ffe506e183537af9624ef290090e90a200ecce7b7ca19c77f7', expected_attrs={ 'platform_iri': 'http://example.foo/', 'item_guid': 'yxwvu', diff --git a/api_tests/metrics/test_preprint_metrics.py b/api_tests/metrics/test_preprint_metrics.py index 57e31655c40..1bde8719b75 100644 --- a/api_tests/metrics/test_preprint_metrics.py +++ b/api_tests/metrics/test_preprint_metrics.py @@ -116,7 +116,7 @@ def test_custom_metric_malformed_query(self, mock_execute, app, user, base_url): assert res.status_code == 400 assert res.json['errors'][0]['detail'] == 'Malformed elasticsearch query.' 
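# A minimal sketch (not code from this changeset) of how the expected_doc_id values
# updated in test_counted_usage.py above can be recomputed: per the revised comments,
# the hashed key now ends with the action labels (e.g. 'api,view'), which is why every
# expected hash changed. Standard library only.
import hashlib

_key = b'http://example.foo/|http://example.foo/blahblah/blee|5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34|1981-01-01|3|api,view'
# Per the comment in test_by_client_session_id, this prints
# '3239044c7462dd318edd0522a0ed7d84b9c6502ef16cb40dfcae6c1f456d57a2'.
print(hashlib.sha256(_key).hexdigest())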
- @pytest.mark.es + @pytest.mark.es_metrics def test_agg_query(self, app, user, base_url): post_url = f'{base_url}downloads/' diff --git a/api_tests/metrics/test_raw_metrics.py b/api_tests/metrics/test_raw_metrics.py index c7feb69426b..6a3b9b8f8c5 100644 --- a/api_tests/metrics/test_raw_metrics.py +++ b/api_tests/metrics/test_raw_metrics.py @@ -14,7 +14,7 @@ pytestmark = pytest.mark.django_db -@pytest.mark.es +@pytest.mark.es_metrics class TestRawMetrics: @pytest.fixture(autouse=True) @@ -22,6 +22,12 @@ def enable_elasticsearch_metrics(self): with override_switch(features.ENABLE_RAW_METRICS, active=True): yield + @pytest.fixture(autouse=True) + def teardown_customer_index(self, es6_client): + es6_client.indices.delete(index='customer', ignore_unavailable=True) + yield + es6_client.indices.delete(index='customer', ignore_unavailable=True) + @pytest.fixture def user(self): user = AuthUserFactory() @@ -132,7 +138,7 @@ def test_post_and_get(self, app, user, base_url): time.sleep(3) - get_url = f'{base_url}_search?q=*' + get_url = f'{base_url}customer/_search?q=*' res = app.get(get_url, auth=user.auth) assert res.json['hits']['total'] == 1 diff --git a/api_tests/metrics/test_registries_moderation_metrics.py b/api_tests/metrics/test_registries_moderation_metrics.py index d8b78cdf5ad..93cde9f1121 100644 --- a/api_tests/metrics/test_registries_moderation_metrics.py +++ b/api_tests/metrics/test_registries_moderation_metrics.py @@ -22,7 +22,7 @@ def enable_elasticsearch_metrics(self): with override_switch(features.ELASTICSEARCH_METRICS, active=True): yield - @pytest.mark.es + @pytest.mark.es_metrics def test_record_transitions(self, registration): registration._write_registration_action( RegistrationModerationStates.INITIAL, @@ -70,7 +70,7 @@ def other_user(self): def base_url(self): return '/_/metrics/registries_moderation/transitions/' - @pytest.mark.es + @pytest.mark.es_metrics def test_registries_moderation_view(self, app, user, base_url, registration): registration._write_registration_action( RegistrationModerationStates.INITIAL, diff --git a/api_tests/preprints/views/test_preprint_detail.py b/api_tests/preprints/views/test_preprint_detail.py index 7e3b279c406..ffec9722514 100644 --- a/api_tests/preprints/views/test_preprint_detail.py +++ b/api_tests/preprints/views/test_preprint_detail.py @@ -18,7 +18,9 @@ from osf.models import ( NodeLicense, PreprintContributor, + PreprintLog ) +from osf.utils import permissions as osf_permissions from osf.utils.permissions import WRITE from osf.utils.workflows import DefaultStates from osf_tests.factories import ( @@ -835,6 +837,472 @@ def test_update_preprint_task_called_on_api_update( assert mock_on_preprint_updated.called + def test_update_has_coi(self, app, user, preprint, url): + update_payload = build_preprint_update_payload( + preprint._id, + attributes={'has_coi': True} + ) + + contrib = AuthUserFactory() + preprint.add_contributor(contrib, READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' 
+ + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_coi'] + + preprint.reload() + assert preprint.has_coi + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_HAS_COI + assert log.params == {'preprint': preprint._id, 'user': user._id, 'value': True} + + def test_update_conflict_of_interest_statement(self, app, user, preprint, url): + update_payload = build_preprint_update_payload( + preprint._id, + attributes={'conflict_of_interest_statement': 'Owns shares in Closed Science Corporation.'} + ) + contrib = AuthUserFactory() + preprint.add_contributor(contrib, READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' + + preprint.has_coi = False + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'Cannot provide conflict of interest statement when has_coi is set to False.' + + preprint.has_coi = True + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['conflict_of_interest_statement'] ==\ + 'Owns shares in Closed Science Corporation.' + + preprint.reload() + assert preprint.conflict_of_interest_statement == 'Owns shares in Closed Science Corporation.' + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_COI_STATEMENT + assert log.params == { + 'preprint': preprint._id, + 'user': user._id, + 'value': 'Owns shares in Closed Science Corporation.' + } + + def test_update_has_data_links(self, app, user, preprint, url): + update_payload = build_preprint_update_payload(preprint._id, attributes={'has_data_links': 'available'}) + + contrib = AuthUserFactory() + preprint.add_contributor(contrib, READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' + + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_data_links'] == 'available' + + preprint.reload() + assert preprint.has_data_links + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_HAS_DATA_LINKS + assert log.params == {'value': 'available', 'user': user._id, 'preprint': preprint._id} + + def test_update_why_no_data(self, app, user, preprint, url): + update_payload = build_preprint_update_payload(preprint._id, attributes={'why_no_data': 'My dog ate it.'}) + + contrib = AuthUserFactory() + preprint.add_contributor(contrib, READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' + + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'You cannot edit this statement while your data links availability is set to true or is unanswered.' 
+ + preprint.has_data_links = 'no' + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['why_no_data'] == 'My dog ate it.' + + preprint.reload() + assert preprint.why_no_data + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_WHY_NO_DATA + assert log.params == {'user': user._id, 'preprint': preprint._id} + + def test_update_data_links(self, app, user, preprint, url): + data_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] + update_payload = build_preprint_update_payload(preprint._id, attributes={'data_links': data_links}) + + contrib = AuthUserFactory() + preprint.add_contributor(contrib, READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' + + preprint.has_data_links = 'no' + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'Cannot provide data links when has_data_links is set to "no".' + + preprint.has_data_links = 'available' + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['data_links'] == data_links + + preprint.reload() + assert preprint.data_links == data_links + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_DATA_LINKS + assert log.params == {'user': user._id, 'preprint': preprint._id} + + update_payload = build_preprint_update_payload(preprint._id, attributes={'data_links': 'maformed payload'}) + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'Expected a list of items but got type "str".' + + def test_invalid_data_links(self, app, user, preprint, url): + preprint.has_data_links = 'available' + preprint.save() + + update_payload = build_preprint_update_payload(preprint._id, attributes={'data_links': ['thisaintright']}) + + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'Enter a valid URL.' + + def test_update_has_prereg_links(self, app, user, preprint, url): + update_payload = build_preprint_update_payload(preprint._id, attributes={'has_prereg_links': 'available'}) + + contrib = AuthUserFactory() + preprint.add_contributor(contrib, READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' 
+ + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_prereg_links'] == 'available' + + preprint.reload() + assert preprint.has_prereg_links + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_HAS_PREREG_LINKS + assert log.params == {'value': 'available', 'user': user._id, 'preprint': preprint._id} + + def test_invalid_prereg_links(self, app, user, preprint, url): + preprint.has_prereg_links = 'available' + preprint.save() + + update_payload = build_preprint_update_payload(preprint._id, attributes={'prereg_links': ['thisaintright']}) + + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'Enter a valid URL.' + + def test_no_data_links_clears_links(self, app, user, preprint, url): + preprint.has_data_links = 'available' + preprint.data_links = ['http://www.apple.com'] + preprint.save() + + update_payload = build_preprint_update_payload(preprint._id, attributes={'has_data_links': 'no'}) + + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_data_links'] == 'no' + assert res.json['data']['attributes']['data_links'] == [] + + preprint.reload() + assert preprint.has_data_links == 'no' + assert preprint.data_links == [] + + def test_no_prereg_links_clears_links(self, app, user, preprint, url): + preprint.has_prereg_links = 'available' + preprint.prereg_links = ['http://example.com'] + preprint.prereg_link_info = 'prereg_analysis' + preprint.save() + + update_payload = build_preprint_update_payload(preprint._id, attributes={'has_prereg_links': 'no'}) + + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_prereg_links'] == 'no' + preprint.reload() + assert res.json['data']['attributes']['prereg_links'] == [] + assert not res.json['data']['attributes']['prereg_link_info'] + + assert preprint.prereg_links == [] + assert preprint.prereg_link_info is None + + def test_update_why_no_prereg(self, app, user, preprint, url): + update_payload = build_preprint_update_payload(preprint._id, attributes={'why_no_prereg': 'My dog ate it.'}) + + contrib = AuthUserFactory() + preprint.add_contributor(contrib, READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' + + preprint.has_prereg_links = 'available' + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'You cannot edit this statement while your prereg links availability is set to true or is unanswered.' + + update_payload = build_preprint_update_payload(preprint._id, attributes={ + 'why_no_prereg': 'My dog ate it.', + 'has_prereg_links': 'no' + }) + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['why_no_prereg'] == 'My dog ate it.' + + preprint.reload() + assert preprint.why_no_prereg == 'My dog ate it.' + + log = preprint.logs.filter(action=PreprintLog.UPDATE_WHY_NO_PREREG).first() + assert log is not None, 'Expected log entry for why_no_prereg_updated not found.' 
+ assert log.action == PreprintLog.UPDATE_WHY_NO_PREREG + assert log.params == {'user': user._id, 'preprint': preprint._id} + + def test_update_prereg_links(self, app, user, preprint, url): + + prereg_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] + update_payload = build_preprint_update_payload(preprint._id, attributes={'prereg_links': prereg_links}) + + contrib = AuthUserFactory() + preprint.add_contributor(contrib, READ) + res = app.patch_json_api(url, update_payload, auth=contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'You do not have permission to perform this action.' + + preprint.has_prereg_links = 'no' + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'You cannot edit this field while your prereg links availability is set to false or is unanswered.' + + preprint.has_prereg_links = 'available' + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['prereg_links'] == prereg_links + + preprint.reload() + assert preprint.prereg_links == prereg_links + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_PREREG_LINKS + assert log.params == {'user': user._id, 'preprint': preprint._id} + + update_payload = build_preprint_update_payload(preprint._id, attributes={'prereg_links': 'maformed payload'}) + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'Expected a list of items but got type "str".' + + def test_update_prereg_link_info(self, app, user, preprint, url): + update_payload = build_preprint_update_payload( + preprint._id, + attributes={'prereg_link_info': 'prereg_designs'} + ) + + preprint.has_prereg_links = 'no' + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'You cannot edit this field while your prereg links availability is set to false or is unanswered.' + + preprint.has_prereg_links = 'available' + preprint.save() + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['prereg_link_info'] == 'prereg_designs' + + preprint.reload() + assert preprint.prereg_link_info == 'prereg_designs' + log = preprint.logs.first() + assert log.action == PreprintLog.UPDATE_PREREG_LINKS_INFO + assert log.params == {'user': user._id, 'preprint': preprint._id} + + update_payload = build_preprint_update_payload( + preprint._id, + attributes={'prereg_link_info': 'maformed payload'} + ) + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == '"maformed payload" is not a valid choice.' 
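# A minimal sketch (not code from this changeset) of the request shape these PATCH
# tests exercise: build_preprint_update_payload wraps the attributes in a JSON:API
# body (the helper's definition is visible in the removed author-assertion module
# further below). Assumes the `app`, `preprint`, `user`, and `url` fixtures used above,
# where `user` has admin permissions on the preprint.
payload = {
    'data': {
        'id': preprint._id,
        'type': 'preprints',
        'attributes': {'has_coi': True},
        'relationships': None,
    }
}
res = app.patch_json_api(url, payload, auth=user.auth)
assert res.status_code == 200  # admins may update author assertion fields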
+ + def test_update_has_coi_false_with_null_conflict_statement(self, app, user, preprint, url): + update_payload = build_preprint_update_payload( + preprint._id, + attributes={ + 'has_coi': False, + 'conflict_of_interest_statement': None + } + ) + + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_coi'] is False + assert res.json['data']['attributes']['conflict_of_interest_statement'] is None + + preprint.reload() + assert preprint.has_coi is False + assert preprint.conflict_of_interest_statement is None + + def test_update_has_data_links_no_with_data_links_provided(self, app, user, preprint, url): + update_payload = build_preprint_update_payload( + preprint._id, + attributes={ + 'has_data_links': 'no', + 'data_links': ['http://example.com/data'] + } + ) + + initial_has_data_links = preprint.has_data_links + initial_data_links = preprint.data_links + + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 400 + assert res.json['errors'][0]['detail'] == 'Cannot provide data links when has_data_links is set to "no".' + + preprint.reload() + + assert preprint.has_data_links == initial_has_data_links + assert preprint.data_links == initial_data_links + + assert preprint.has_data_links != 'no' + + def test_update_has_data_links_no_with_empty_data_links(self, app, user, preprint, url): + update_payload = build_preprint_update_payload( + preprint._id, + attributes={ + 'has_data_links': 'no', + 'data_links': [] + } + ) + + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_data_links'] == 'no' + assert res.json['data']['attributes']['data_links'] == [] + + preprint.reload() + assert preprint.has_data_links == 'no' + assert preprint.data_links == [] + + def test_update_has_prereg_links_no_with_empty_prereg_links(self, app, user, preprint, url): + update_payload = build_preprint_update_payload( + preprint._id, + attributes={ + 'has_prereg_links': 'no', + 'prereg_links': [], + 'prereg_link_info': '' + } + ) + + res = app.patch_json_api(url, update_payload, auth=user.auth) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_prereg_links'] == 'no' + assert res.json['data']['attributes']['prereg_links'] == [] + assert res.json['data']['attributes']['prereg_link_info'] == '' + + preprint.reload() + assert preprint.has_prereg_links == 'no' + assert preprint.prereg_links == [] + assert preprint.prereg_link_info == '' + + def test_non_admin_cannot_update_has_coi(self, app, user, preprint, url): + write_contrib = AuthUserFactory() + preprint.add_contributor(write_contrib, permissions=osf_permissions.WRITE, auth=Auth(user), save=True) + + update_payload = build_preprint_update_payload( + preprint._id, + attributes={'has_coi': True} + ) + + res = app.patch_json_api(url, update_payload, auth=write_contrib.auth, expect_errors=True) + assert res.status_code == 403 + assert res.json['errors'][0]['detail'] == 'Must have admin permissions to update author assertion fields.' + + preprint.reload() + assert preprint.has_coi is None + + def test_sloan_updates(self, app, user, preprint, url): + """ + - Tests to ensure updating a preprint with unchanged data does not create superfluous log statements. + - Tests to ensure various dependent fields can be updated in a single request. 
+ """ + preprint.has_prereg_links = 'available' + preprint.prereg_links = ['http://no-sf.io'] + preprint.prereg_link_info = 'prereg_designs' + preprint.save() + + update_payload = build_preprint_update_payload( + preprint._id, + attributes={ + 'has_prereg_links': 'available', + 'prereg_link_info': 'prereg_designs', + 'prereg_links': ['http://osf.io'], # changing here should be only non-factory created log. + } + ) + app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + # Any superfluous log statements? + logs = preprint.logs.all().values_list('action', 'params') + assert logs.count() == 3 # actions should be: 'subjects_updated', 'published', 'prereg_links_updated' + assert logs.latest() == ('prereg_links_updated', {'user': user._id, 'preprint': preprint._id}) + + # Can we set `has_prereg_links` to false and update `why_no_prereg` in a single request? + update_payload = build_preprint_update_payload( + preprint._id, + attributes={ + 'has_prereg_links': 'no', + 'why_no_prereg': 'My dog ate it.' + } + ) + res = app.patch_json_api(url, update_payload, auth=user.auth, expect_errors=True) + + assert res.status_code == 200 + assert res.json['data']['attributes']['has_prereg_links'] == 'no' + assert res.json['data']['attributes']['why_no_prereg'] == 'My dog ate it.' + + preprint.refresh_from_db() + assert preprint.has_prereg_links == 'no' + assert preprint.why_no_prereg == 'My dog ate it.' + @pytest.mark.django_db class TestPreprintUpdateSubjects(UpdateSubjectsMixin): diff --git a/api_tests/preprints/views/test_preprint_detail_author_assertions.py b/api_tests/preprints/views/test_preprint_detail_author_assertions.py deleted file mode 100644 index 63dc8696d41..00000000000 --- a/api_tests/preprints/views/test_preprint_detail_author_assertions.py +++ /dev/null @@ -1,300 +0,0 @@ -import pytest - -from osf.utils.permissions import READ, WRITE, ADMIN -from api.base.settings.defaults import API_BASE -from osf.models import PreprintLog -from osf_tests.factories import PreprintFactory, AuthUserFactory - - -def build_preprint_update_payload( - node_id, attributes=None, relationships=None, - jsonapi_type='preprints'): - payload = { - 'data': { - 'id': node_id, - 'type': jsonapi_type, - 'attributes': attributes, - 'relationships': relationships - } - } - return payload - - -@pytest.mark.django_db -@pytest.mark.enable_enqueue_task -class TestPreprintUpdateWithAuthorAssertion: - - @pytest.fixture() - def user(self): - return AuthUserFactory() - - @pytest.fixture() - def preprint(self, user): - """ - Creator is not admin permission - """ - preprint = PreprintFactory(creator=user) - admin = AuthUserFactory() - preprint.add_contributor(admin, ADMIN) - preprint.add_contributor(user, READ) - return preprint - - @pytest.fixture() - def url(self, preprint): - return f'/{API_BASE}preprints/{preprint._id}/' - - @pytest.fixture() - def read_contrib(self, preprint): - contrib = AuthUserFactory() - preprint.add_contributor(contrib, READ) - return contrib - - @pytest.fixture() - def write_contrib(self, preprint): - contrib = AuthUserFactory() - preprint.add_contributor(contrib, WRITE) - return contrib - - @pytest.fixture() - def admin_contrib(self, preprint): - contrib = AuthUserFactory() - preprint.add_contributor(contrib, ADMIN) - return contrib - - def assert_permission(self, app, url, contrib, attributes, expected_status): - update_payload = build_preprint_update_payload(node_id=contrib._id, attributes=attributes) - res = app.patch_json_api(url, update_payload, auth=contrib.auth, 
expect_errors=True) - assert res.status_code == expected_status - - # Testing permissions for updating has_coi - def test_update_has_coi_permission_denied(self, app, read_contrib, url): - self.assert_permission(app, url, read_contrib, {'has_coi': True}, 403) - - def test_update_has_coi_permission_granted_write(self, app, write_contrib, url): - self.assert_permission(app, url, write_contrib, {'has_coi': True}, 403) - - def test_update_has_coi_permission_granted_admin(self, app, admin_contrib, url): - self.assert_permission(app, url, admin_contrib, {'has_coi': True}, 200) - - def test_update_has_coi_permission_granted_creator(self, app, user, url): - self.assert_permission(app, url, user, {'has_coi': True}, 403) - - # Testing permissions for updating conflict_of_interest_statement - def test_update_conflict_of_interest_statement_permission_denied(self, app, read_contrib, url): - self.assert_permission(app, url, read_contrib, {'conflict_of_interest_statement': 'Test'}, 403) - - def test_update_conflict_of_interest_statement_permission_granted_write(self, app, write_contrib, preprint, url): - preprint.has_coi = True - preprint.save() - self.assert_permission(app, url, write_contrib, {'conflict_of_interest_statement': 'Test'}, 403) - - def test_update_conflict_of_interest_statement_permission_granted_admin(self, app, admin_contrib, preprint, url): - preprint.has_coi = True - preprint.save() - self.assert_permission(app, url, admin_contrib, {'conflict_of_interest_statement': 'Test'}, 200) - - def test_update_conflict_of_interest_statement_permission_granted_creator(self, app, user, preprint, url): - preprint.has_coi = True - preprint.save() - self.assert_permission(app, url, user, {'conflict_of_interest_statement': 'Test'}, 403) - - # Testing permissions for updating has_data_links - def test_update_has_data_links_permission_denied(self, app, read_contrib, url): - self.assert_permission(app, url, read_contrib, {'has_data_links': 'available'}, 403) - - def test_update_has_data_links_permission_granted_write(self, app, write_contrib, url): - self.assert_permission(app, url, write_contrib, {'has_data_links': 'available'}, 403) - - def test_update_has_data_links_permission_granted_admin(self, app, admin_contrib, url): - self.assert_permission(app, url, admin_contrib, {'has_data_links': 'available'}, 200) - - def test_update_has_data_links_permission_granted_creator(self, app, user, url): - self.assert_permission(app, url, user, {'has_data_links': 'available'}, 403) - - # Testing permissions for updating why_no_data - def test_update_why_no_data_permission_denied(self, app, read_contrib, url): - self.assert_permission(app, url, read_contrib, {'why_no_data': 'My dog ate it.'}, 403) - - def test_update_why_no_data_permission_granted_write(self, app, write_contrib, preprint, url): - preprint.has_data_links = 'no' - preprint.save() - self.assert_permission(app, url, write_contrib, {'why_no_data': 'My dog ate it.'}, 403) - - def test_update_why_no_data_permission_granted_admin(self, app, admin_contrib, preprint, url): - preprint.has_data_links = 'no' - preprint.save() - self.assert_permission(app, url, admin_contrib, {'why_no_data': 'My dog ate it.'}, 200) - - def test_update_why_no_data_permission_granted_creator(self, app, user, preprint, url): - preprint.has_data_links = 'no' - preprint.save() - self.assert_permission(app, url, user, {'why_no_data': 'My dog ate it.'}, 403) - - # Testing permissions for updating data_links - def test_update_data_links_permission_denied(self, app, read_contrib, url): - 
data_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - self.assert_permission(app, url, read_contrib, {'data_links': data_links}, 403) - - def test_update_data_links_permission_granted_write(self, app, write_contrib, preprint, url): - data_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - preprint.has_data_links = 'available' - preprint.save() - self.assert_permission(app, url, write_contrib, {'data_links': data_links}, 403) - - def test_update_data_links_permission_granted_admin(self, app, admin_contrib, preprint, url): - data_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - preprint.has_data_links = 'available' - preprint.save() - self.assert_permission(app, url, admin_contrib, {'data_links': data_links}, 200) - - def test_update_data_links_permission_granted_creator(self, app, user, preprint, url): - data_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - preprint.has_data_links = 'available' - preprint.save() - self.assert_permission(app, url, user, {'data_links': data_links}, 403) - - def test_update_data_links_invalid_payload(self, app, admin_contrib, url): - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'data_links': 'maformed payload'}) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == 'Expected a list of items but got type "str".' - - def test_update_data_links_invalid_url(self, app, admin_contrib, preprint, url): - preprint.has_data_links = 'available' - preprint.save() - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'data_links': ['thisaintright']}) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == 'Enter a valid URL.' 
- - # Testing permissions for updating has_prereg_links - def test_update_has_prereg_links_permission_denied(self, app, read_contrib, url): - self.assert_permission(app, url, read_contrib, {'has_prereg_links': 'available'}, 403) - - def test_update_has_prereg_links_permission_granted_write(self, app, write_contrib, url): - self.assert_permission(app, url, write_contrib, {'has_prereg_links': 'available'}, 403) - - def test_update_has_prereg_links_permission_granted_admin(self, app, admin_contrib, url): - self.assert_permission(app, url, admin_contrib, {'has_prereg_links': 'available'}, 200) - - def test_update_has_prereg_links_permission_granted_creator(self, app, user, url): - self.assert_permission(app, url, user, {'has_prereg_links': 'available'}, 403) - - # Testing permissions for updating prereg_links - def test_update_prereg_links_permission_denied(self, app, read_contrib, url): - prereg_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - self.assert_permission(app, url, read_contrib, {'prereg_links': prereg_links}, 403) - - def test_update_prereg_links_permission_granted_write(self, app, write_contrib, preprint, url): - prereg_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - preprint.has_prereg_links = 'available' - preprint.save() - self.assert_permission(app, url, write_contrib, {'prereg_links': prereg_links}, 403) - - def test_update_prereg_links_permission_granted_admin(self, app, admin_contrib, preprint, url): - prereg_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - preprint.has_prereg_links = 'available' - preprint.save() - self.assert_permission(app, url, admin_contrib, {'prereg_links': prereg_links}, 200) - - def test_update_prereg_links_permission_granted_creator(self, app, user, preprint, url): - prereg_links = ['http://www.JasonKelce.com', 'http://www.ItsTheWholeTeam.com/'] - preprint.has_prereg_links = 'available' - preprint.save() - self.assert_permission(app, url, user, {'prereg_links': prereg_links}, 403) - - def test_update_prereg_links_invalid_payload(self, app, admin_contrib, url): - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'prereg_links': 'maformed payload'}) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == 'Expected a list of items but got type "str".' - - def test_update_prereg_links_invalid_url(self, app, admin_contrib, preprint, url): - preprint.has_prereg_links = 'available' - preprint.save() - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'prereg_links': ['thisaintright']}) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == 'Enter a valid URL.' - - def test_update_prereg_link_info_fail_prereg_links(self, app, admin_contrib, preprint, url): - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'prereg_link_info': 'prereg_designs'}) - preprint.has_prereg_links = 'no' - preprint.save() - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == 'You cannot edit this field while your prereg links availability is set to false or is unanswered.' 
- - def test_update_prereg_link_info_success(self, app, admin_contrib, preprint, url): - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'prereg_link_info': 'prereg_designs'}) - preprint.has_prereg_links = 'available' - preprint.save() - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth) - assert res.status_code == 200 - assert res.json['data']['attributes']['prereg_link_info'] == 'prereg_designs' - preprint.reload() - assert preprint.prereg_link_info == 'prereg_designs' - log = preprint.logs.first() - assert log.action == PreprintLog.UPDATE_PREREG_LINKS_INFO - assert log.params == {'user': admin_contrib._id, 'preprint': preprint._id} - - def test_update_prereg_link_info_invalid_payload(self, app, admin_contrib, url): - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'prereg_link_info': 'maformed payload'}) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - assert res.status_code == 400 - assert res.json['errors'][0]['detail'] == '"maformed payload" is not a valid choice.' - - def test_no_prereg_links_clears_links(self, app, admin_contrib, preprint, url): - preprint.has_prereg_links = 'available' - preprint.prereg_links = ['http://example.com'] - preprint.prereg_link_info = 'prereg_analysis' - preprint.save() - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'has_prereg_links': 'no'}) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth) - assert res.status_code == 200 - assert res.json['data']['attributes']['has_prereg_links'] == 'no' - assert res.json['data']['attributes']['prereg_links'] == [] - assert not res.json['data']['attributes']['prereg_link_info'] - - def test_no_data_links_clears_links(self, app, admin_contrib, preprint, url): - preprint.has_data_links = 'available' - preprint.data_links = ['http://www.apple.com'] - preprint.save() - update_payload = build_preprint_update_payload(node_id=admin_contrib._id, attributes={'has_data_links': 'no'}) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth) - assert res.status_code == 200 - assert res.json['data']['attributes']['has_data_links'] == 'no' - assert res.json['data']['attributes']['data_links'] == [] - - def test_sloan_updates(self, app, admin_contrib, preprint, url): - preprint.has_prereg_links = 'available' - preprint.prereg_links = ['http://no-sf.io'] - preprint.prereg_link_info = 'prereg_designs' - preprint.save() - update_payload = build_preprint_update_payload( - node_id=preprint._id, - attributes={ - 'has_prereg_links': 'available', - 'prereg_link_info': 'prereg_designs', - 'prereg_links': ['http://osf.io'], - } - ) - app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - logs = preprint.logs.all().values_list('action', 'params') - assert logs.count() == 5 - assert logs.latest() == ('prereg_links_updated', {'user': admin_contrib._id, 'preprint': preprint._id}) - - update_payload = build_preprint_update_payload( - node_id=preprint._id, - attributes={ - 'has_prereg_links': 'no', - 'why_no_prereg': 'My dog ate it.' - } - ) - res = app.patch_json_api(url, update_payload, auth=admin_contrib.auth, expect_errors=True) - assert res.status_code == 200 - assert res.json['data']['attributes']['has_prereg_links'] == 'no' - assert res.json['data']['attributes']['why_no_prereg'] == 'My dog ate it.' 
- preprint.refresh_from_db() - assert preprint.has_prereg_links == 'no' - assert preprint.why_no_prereg == 'My dog ate it.' diff --git a/api_tests/share/_utils.py b/api_tests/share/_utils.py index 9595aaf1b81..a04808cac3c 100644 --- a/api_tests/share/_utils.py +++ b/api_tests/share/_utils.py @@ -12,6 +12,7 @@ ) from website import settings as website_settings from api.share.utils import shtrove_ingest_url, sharev2_push_url +from osf.metadata.osf_gathering import OsfmapPartition @contextlib.contextmanager @@ -40,36 +41,67 @@ def mock_update_share(): @contextlib.contextmanager -def expect_ingest_request(mock_share_responses, osfguid, *, token=None, delete=False, count=1): +def expect_ingest_request(mock_share_responses, osfguid, *, token=None, delete=False, count=1, error_response=False): mock_share_responses._calls.reset() yield - _double_count = count * 2 # pushing to share two ways - assert len(mock_share_responses.calls) == _double_count, ( - f'expected {_double_count} call(s), got {len(mock_share_responses.calls)}: {list(mock_share_responses.calls)}' + _legacy_count_per_item = 1 + _trove_main_count_per_item = 1 + _trove_supplementary_count_per_item = ( + 0 + if (error_response or delete) + else (len(OsfmapPartition) - 1) ) + _total_count = count * ( + _legacy_count_per_item + + _trove_main_count_per_item + + _trove_supplementary_count_per_item + ) + assert len(mock_share_responses.calls) == _total_count, ( + f'expected {_total_count} call(s), got {len(mock_share_responses.calls)}: {list(mock_share_responses.calls)}' + ) + _trove_ingest_calls = [] + _trove_supp_ingest_calls = [] + _legacy_push_calls = [] for _call in mock_share_responses.calls: if _call.request.url.startswith(shtrove_ingest_url()): - assert_ingest_request(_call.request, osfguid, token=token, delete=delete) + if 'is_supplementary' in _call.request.url: + _trove_supp_ingest_calls.append(_call) + else: + _trove_ingest_calls.append(_call) else: - assert _call.request.url.startswith(sharev2_push_url()) + _legacy_push_calls.append(_call) + assert len(_trove_ingest_calls) == count + assert len(_trove_supp_ingest_calls) == count * _trove_supplementary_count_per_item + assert len(_legacy_push_calls) == count + for _call in _trove_ingest_calls: + assert_ingest_request(_call.request, osfguid, token=token, delete=delete) + for _call in _trove_supp_ingest_calls: + assert_ingest_request(_call.request, osfguid, token=token, delete=delete, supp=True) + for _call in _legacy_push_calls: + assert _call.request.url.startswith(sharev2_push_url()) -def assert_ingest_request(request, expected_osfguid, *, token=None, delete=False): +def assert_ingest_request(request, expected_osfguid, *, token=None, delete=False, supp=False): _querydict = QueryDict(urlsplit(request.path_url).query) - assert _querydict['record_identifier'] == expected_osfguid + if supp: + assert _querydict['record_identifier'].startswith(expected_osfguid) + assert _querydict['record_identifier'] != expected_osfguid + else: + assert _querydict['record_identifier'] == expected_osfguid if delete: assert request.method == 'DELETE' else: assert request.method == 'POST' _focus_iri = _querydict['focus_iri'] assert _focus_iri == f'{website_settings.DOMAIN}{expected_osfguid}' - assert _focus_iri in request.body.decode('utf-8') + _request_body = request.body.decode('utf-8') + assert (_focus_iri in _request_body) or (supp and not _request_body.strip()) _token = token or website_settings.SHARE_API_TOKEN assert request.headers['Authorization'] == f'Bearer {_token}' @contextlib.contextmanager 
-def expect_preprint_ingest_request(mock_share_responses, preprint, *, delete=False, count=1): +def expect_preprint_ingest_request(mock_share_responses, preprint, *, delete=False, count=1, error_response=False): # same as expect_ingest_request, but with convenience for preprint specifics # and postcommit-task handling (so on_preprint_updated actually runs) with expect_ingest_request( @@ -78,6 +110,7 @@ def expect_preprint_ingest_request(mock_share_responses, preprint, *, delete=Fal token=preprint.provider.access_token, delete=delete, count=count, + error_response=error_response, ): # clear out postcommit tasks from factories postcommit_queue().clear() diff --git a/api_tests/share/test_share_preprint.py b/api_tests/share/test_share_preprint.py index aa4d769d1f7..4ab47963bc8 100644 --- a/api_tests/share/test_share_preprint.py +++ b/api_tests/share/test_share_preprint.py @@ -133,7 +133,7 @@ def test_no_call_async_update_on_400_failure(self, mock_share_responses, preprin mock_share_responses.replace(responses.POST, shtrove_ingest_url(), status=400) mock_share_responses.replace(responses.POST, sharev2_push_url(), status=400) preprint.set_published(True, auth=auth, save=True) - with expect_preprint_ingest_request(mock_share_responses, preprint, count=1): + with expect_preprint_ingest_request(mock_share_responses, preprint, count=1, error_response=True): preprint.update_search() def test_delete_from_share(self, mock_share_responses): diff --git a/conftest.py b/conftest.py index 2eb51df076e..6f870093ed4 100644 --- a/conftest.py +++ b/conftest.py @@ -1,3 +1,4 @@ +import contextlib from unittest import mock import logging import os @@ -5,7 +6,9 @@ from django.core.management import call_command from django.db import transaction +from elasticsearch import exceptions as es_exceptions from elasticsearch_dsl.connections import connections +from elasticsearch_metrics.registry import registry as es_metrics_registry from faker import Factory import pytest import responses @@ -133,22 +136,44 @@ def es6_client(setup_connections): @pytest.fixture(scope='function', autouse=True) -def _es_marker(request): +def _es_metrics_marker(request): """Clear out all indices and index templates before and after - tests marked with ``es``. + tests marked with `es_metrics`. 
""" - marker = request.node.get_closest_marker('es') + marker = request.node.get_closest_marker('es_metrics') if marker: es6_client = request.getfixturevalue('es6_client') - - def teardown_es(): - es6_client.indices.delete(index='*') - es6_client.indices.delete_template('*') - - teardown_es() - call_command('sync_metrics') - yield - teardown_es() + _temp_prefix = 'temp_metrics_' + _temp_wildcard = f'{_temp_prefix}*' + + def _teardown_es_temps(): + es6_client.indices.delete(index=_temp_wildcard) + try: + es6_client.indices.delete_template(_temp_wildcard) + except es_exceptions.NotFoundError: + pass + + @contextlib.contextmanager + def _mock_metric_names(): + with contextlib.ExitStack() as _exit: + for _metric_class in es_metrics_registry.get_metrics(): + _exit.enter_context(mock.patch.object( + _metric_class, + '_template_name', # also used to construct index names + f'{_temp_prefix}{_metric_class._template_name}', + )) + _exit.enter_context(mock.patch.object( + _metric_class, + '_template', # a wildcard string for indexes and templates + f'{_temp_prefix}{_metric_class._template}', + )) + yield + + _teardown_es_temps() + with _mock_metric_names(): + call_command('sync_metrics') + yield + _teardown_es_temps() else: yield diff --git a/framework/auth/campaigns.py b/framework/auth/campaigns.py index 8a902245817..a47b3cf637b 100644 --- a/framework/auth/campaigns.py +++ b/framework/auth/campaigns.py @@ -100,6 +100,15 @@ def get_campaigns(): } }) + newest_campaigns.update({ + 'agu_conference': { + 'system_tag': CampaignSourceTags.AguConference.value, + 'redirect_url': furl(DOMAIN).add(path='dashboard/').url, + 'confirmation_email_template': mails.CONFIRM_EMAIL_AGU_CONFERENCE, + 'login_type': 'native', + } + }) + CAMPAIGNS = newest_campaigns CAMPAIGNS_LAST_REFRESHED = timezone.now() diff --git a/framework/auth/views.py b/framework/auth/views.py index e398a6db0a5..5f999aaaca6 100644 --- a/framework/auth/views.py +++ b/framework/auth/views.py @@ -944,7 +944,7 @@ def register_user(**kwargs): ) if settings.CONFIRM_REGISTRATIONS_BY_EMAIL: - send_confirm_email_async(user, email=user.username) + send_confirm_email(user, email=user.username) message = language.REGISTRATION_SUCCESS.format(email=user.username) return {'message': message} else: diff --git a/osf/admin.py b/osf/admin.py index 2bfd8c2cc35..71c0ae8172b 100644 --- a/osf/admin.py +++ b/osf/admin.py @@ -6,6 +6,7 @@ from django.db.models import Q, Count from django.http import HttpResponseRedirect from django.urls import reverse +import waffle from osf.external.spam.tasks import reclassify_domain_references from osf.models import OSFUser, Node, NotableDomain, NodeLicense @@ -140,7 +141,24 @@ def get_queryset(self, request): qs = super().get_queryset(request).annotate(number_of_references=Count('domainreference')) return qs + +class _ManygroupWaffleFlagAdmin(waffle.admin.FlagAdmin): + '''customized `waffle.admin.FlagAdmin` to support many groups + + waffle assumes "there are likely not that many" groups [0], + but in osf there are, in fact, that many groups. 
+ + [0]: https://github.com/jazzband/django-waffle/commit/bf36c19ee03baf1c5850ffe0b284900a5c416f53 + ''' + raw_id_fields = (*waffle.admin.FlagAdmin.raw_id_fields, 'groups') + + admin.site.register(OSFUser, OSFUserAdmin) admin.site.register(Node, NodeAdmin) admin.site.register(NotableDomain, NotableDomainAdmin) admin.site.register(NodeLicense, LicenseAdmin) + +# waffle admins, with Flag admin override +admin.site.register(waffle.models.Flag, _ManygroupWaffleFlagAdmin) +admin.site.register(waffle.models.Sample, waffle.admin.SampleAdmin) +admin.site.register(waffle.models.Switch, waffle.admin.SwitchAdmin) diff --git a/osf/external/askismet/client.py b/osf/external/askismet/client.py index 877f7ec4c23..db57b1d3cfa 100644 --- a/osf/external/askismet/client.py +++ b/osf/external/askismet/client.py @@ -133,3 +133,37 @@ def submit_ham(self, user_ip, user_agent, **kwargs): ) if res.status_code != requests.codes.ok: raise AkismetClientError(reason=res.text) + + def get_flagged_count(self, start_date, end_date, category='node'): + from osf.models import NodeLog, PreprintLog + + if category not in ['node', 'preprint']: + raise ValueError(f"Invalid category '{category}'. Expected 'node' or 'preprint'.") + + log_model = NodeLog if category == 'node' else PreprintLog + + flagged_count = log_model.objects.filter( + action=log_model.FLAG_SPAM, + created__gt=start_date, + created__lt=end_date, + **{f'{category}__spam_data__who_flagged__in': ['akismet', 'both']} + ).count() + + return flagged_count + + def get_hammed_count(self, start_date, end_date, category='node'): + from osf.models import NodeLog, PreprintLog + + if category not in ['node', 'preprint']: + raise ValueError(f"Invalid category '{category}'. Expected 'node' or 'preprint'.") + + log_model = NodeLog if category == 'node' else PreprintLog + + hammed_count = log_model.objects.filter( + action=log_model.CONFIRM_HAM, + created__gt=start_date, + created__lt=end_date, + **{f'{category}__spam_data__who_flagged__in': ['akismet', 'both']} + ).count() + + return hammed_count diff --git a/osf/external/oopspam/client.py b/osf/external/oopspam/client.py index ef22864a43d..0abdfdd021f 100644 --- a/osf/external/oopspam/client.py +++ b/osf/external/oopspam/client.py @@ -45,3 +45,37 @@ def check_content(self, user_ip, content, **kwargs): # OOPSpam returns a spam score out of 6. 3 or higher indicates spam return spam_score >= settings.OOPSPAM_SPAM_LEVEL, resp_json + + def get_flagged_count(self, start_date, end_date, category='node'): + from osf.models import NodeLog, PreprintLog + + if category not in ['node', 'preprint']: + raise ValueError(f"Invalid category '{category}'. Expected 'node' or 'preprint'.") + + log_model = NodeLog if category == 'node' else PreprintLog + + flagged_count = log_model.objects.filter( + action=log_model.FLAG_SPAM, + created__gt=start_date, + created__lt=end_date, + **{f'{category}__spam_data__who_flagged__in': ['oopspam', 'both']} + ).count() + + return flagged_count + + def get_hammed_count(self, start_date, end_date, category='node'): + from osf.models import NodeLog, PreprintLog + + if category not in ['node', 'preprint']: + raise ValueError(f"Invalid category '{category}'. 
Expected 'node' or 'preprint'.") + + log_model = NodeLog if category == 'node' else PreprintLog + + hammed_count = log_model.objects.filter( + action=log_model.CONFIRM_HAM, + created__gt=start_date, + created__lt=end_date, + **{f'{category}__spam_data__who_flagged__in': ['oopspam', 'both']} + ).count() + + return hammed_count diff --git a/osf/features.yaml b/osf/features.yaml index bbae37e3b1a..9872b142c2a 100644 --- a/osf/features.yaml +++ b/osf/features.yaml @@ -189,6 +189,10 @@ flags: note: This is not used everyone: true + - flag_name: INSTITUTIONAL_DASHBOARD_2024 + name: institutional_dashboard_2024 + note: whether to surface older or updated (in 2024) institutional metrics + switches: - flag_name: DISABLE_ENGAGEMENT_EMAILS name: disable_engagement_emails @@ -217,3 +221,8 @@ switches: - flag_name: ENABLE_INACTIVE_SCHEMAS name: enable_inactive_schemas note: This is no longer used + + - flag_name: COUNTEDUSAGE_UNIFIED_METRICS_2024 + name: countedusage_unified_metrics_2024 + note: use only `osf.metrics.counted_usage`-based metrics where possible; un-use PageCounter, PreprintView, PreprintDownload, etc + active: false diff --git a/osf/management/commands/change_node_region.py b/osf/management/commands/change_node_region.py new file mode 100644 index 00000000000..abce28672bf --- /dev/null +++ b/osf/management/commands/change_node_region.py @@ -0,0 +1,160 @@ +import logging +import json + +from django.core.management.base import BaseCommand +from django.db import transaction +from google.cloud.storage.client import Client +from google.oauth2.service_account import Credentials + +from osf.models import AbstractNode +from osf.utils.migrations import disable_auto_now_fields +from addons.osfstorage.models import Region + +logger = logging.getLogger(__name__) + +def _get_file_block_map(node): + file_block_map = {} + file_input_qids = node.registration_schema.schema_blocks.filter( + block_type='file-input' + ).values_list('registration_response_key', flat=True) + for schema_response in node.schema_responses.all(): + for block in schema_response.response_blocks.filter(schema_key__in=file_input_qids): + for file_response in block.response: + if file_block_map.get(file_response['file_id'], False): + file_block_map[file_response['file_id']].append(block) + else: + file_block_map[file_response['file_id']] = [block] + return file_block_map + +def _update_blocks(file_block_map, original_id, cloned_id): + for block in file_block_map[original_id]: + logger.info(f'Updating block {block._id} file info') + response = [] + for file_response in block.response: + if original_id == file_response['file_id']: + for key in file_response['file_urls'].keys(): + file_response['file_urls'][key] = file_response['file_urls'][key].replace(original_id, cloned_id) + response.append(file_response) + block.response = response + block.save() + +def _update_schema_meta(node): + logger.info('Updating legacy schema information...') + node.registration_responses = node.schema_responses.latest('-created').all_responses + node.registered_meta[node.registration_schema._id] = node.expand_registration_responses() + node.save() + logger.info('Updated legacy schema information.') + +def _copy_and_clone_versions(original_file, cloned_file, src_bucket, dest_bucket, dest_bucket_name, dest_region): + for v in original_file.versions.order_by('identifier').all(): + blob_hash = v.location['object'] + logger.info(f'Preparing to move version {blob_hash}') + # Copy each version to dest_bucket + src_blob = src_bucket.get_blob(blob_hash) + 
src_bucket.copy_blob(src_blob, dest_bucket) + logger.info(f'Blob {blob_hash} copied to destination, cloning version object.') + # Clone each version, update location + cloned_v = v.clone() + cloned_v.location['bucket'] = dest_bucket_name + # Set FKs + cloned_v.creator = v.creator + cloned_v.region = dest_region + # Save before M2M's can be set + cloned_v.save() + cloned_file.add_version(cloned_v) + # Retain original timestamps + cloned_v.created = v.created + cloned_v.modified = v.modified + cloned_v.save() + logger.info(f'Version {blob_hash} cloned.') + +def _clone_file(file_obj): + # Clone each file, so that the originals will be purged from src_region + cloned_f = file_obj.clone() + # Set (G)FKs + cloned_f.target = file_obj.target + cloned_f.parent = file_obj.parent + cloned_f.checkout = file_obj.checkout + cloned_f.copied_from = file_obj.copied_from + # Save before M2M's can be set, assigning both id and _id + cloned_f.save() + # Repoint Guids + assert cloned_f.id, f'Cloned file ID not assigned for {file_obj._id}' + file_obj.guids.update(object_id=cloned_f.id) + # Retain original timestamps + cloned_f.created = file_obj.created + cloned_f.modified = file_obj.modified + cloned_f.save() + return cloned_f + +def change_node_region(node, dest_region, gcs_creds): + creds = Credentials.from_service_account_info(gcs_creds) + client = Client(credentials=creds) + osfstorage_addon = node.get_addon('osfstorage') + src_region = osfstorage_addon.region + if src_region.id == dest_region.id: + logger.warning(f'Source and destination regions match: {src_region._id}. Exiting.') + return + src_bucket_name = src_region.waterbutler_settings['storage']['bucket'] + dest_bucket_name = dest_region.waterbutler_settings['storage']['bucket'] + src_bucket = client.get_bucket(src_bucket_name) + dest_bucket = client.get_bucket(dest_bucket_name) + response_blocks_by_file_id = {} + with transaction.atomic(): + with disable_auto_now_fields(): + if node.type == 'osf.registration': + response_blocks_by_file_id = _get_file_block_map(node) + for f in node.files.all(): + logger.info(f'Preparing to move file {f._id}') + cloned_f = _clone_file(f) + if f._id in response_blocks_by_file_id: + logger.info(f'Preparing to update ResponseBlocks for file {f._id}') + _update_blocks(response_blocks_by_file_id, f._id, cloned_f._id) + logger.info(f'File {f._id} cloned, copying versions...') + _copy_and_clone_versions(f, cloned_f, src_bucket, dest_bucket, dest_bucket_name, dest_region) + # Trash original file + f.delete() + logger.info('All files complete.') + if response_blocks_by_file_id: + _update_schema_meta(node) + osfstorage_addon.region = dest_region + osfstorage_addon.save() + logger.info('Region updated. Exiting.') + +class Command(BaseCommand): + + def add_arguments(self, parser): + super().add_arguments(parser) + parser.add_argument( + '-n', + '--node', + type=str, + action='store', + dest='node', + help='Node._id to migrate.', + ) + parser.add_argument( + '-r', + '--region', + type=str, + action='store', + dest='region', + help='Region._id to migrate files to.', + ) + parser.add_argument( + '-c', + '--credentials', + type=str, + action='store', + dest='gcs_creds', + help='GCS Credentials to use. 
JSON string.', + ) + + def handle(self, *args, **options): + node = AbstractNode.load(options.get('node', None)) + region = Region.load(options.get('region', None)) + gcs_creds = json.loads(options.get('gcs_creds', '{}')) + assert node, 'Node not found' + assert region, 'Region not found' + assert gcs_creds, 'Credentials required' + change_node_region(node, region, gcs_creds) diff --git a/osf/management/commands/email_all_users.py b/osf/management/commands/email_all_users.py index 334ad58933b..f5cbd677fb7 100644 --- a/osf/management/commands/email_all_users.py +++ b/osf/management/commands/email_all_users.py @@ -19,13 +19,13 @@ OFFSET = 500000 -def email_all_users(email_template, dry_run=False, ids=None, run=0, offset=OFFSET): +def email_all_users(email_template, dry_run=False, ids=None, start_id=0, offset=OFFSET): if ids: active_users = OSFUser.objects.filter(id__in=ids) else: - lower_bound = run * offset - upper_bound = (run + 1) * offset + lower_bound = start_id + upper_bound = start_id + offset base_query = OSFUser.objects.filter(date_confirmed__isnull=False, deleted=None).exclude(date_disabled__isnull=False).exclude(is_active=False) active_users = base_query.filter(id__gt=lower_bound, id__lte=upper_bound).order_by('id') @@ -42,11 +42,12 @@ def email_all_users(email_template, dry_run=False, ids=None, run=0, offset=OFFSE total_sent = 0 for user in active_users.iterator(): + logger.info(f'Sending email to {user.id}') try: mails.send_mail( to_addr=user.email, mail=template, - fullname=user.fullname, + given_name=user.given_name or user.fullname, ) except Exception as e: logger.error(f'Exception encountered sending email to {user.id}') @@ -80,11 +81,11 @@ def add_arguments(self, parser): ) parser.add_argument( - '--r', + '--start-id', type=int, - dest='run', + dest='start_id', default=0, - help='Specify which run this is' + help='Specify id to start from.' 
) parser.add_argument( @@ -105,9 +106,9 @@ def add_arguments(self, parser): def handle(self, *args, **options): dry_run = options.get('dry_run', False) template = options.get('template') - run = options.get('run') + start_id = options.get('start_id') ids = options.get('ids') offset = options.get('offset', OFFSET) - email_all_users(template, dry_run, run=run, ids=ids, offset=offset) + email_all_users(template, dry_run, start_id=start_id, ids=ids, offset=offset) if dry_run: raise RuntimeError('Dry run, only superusers emailed') diff --git a/osf/management/commands/make_dummy_pageviews_for_metrics.py b/osf/management/commands/make_dummy_pageviews_for_metrics.py index 11ff9ca69c9..09de34bf7a8 100644 --- a/osf/management/commands/make_dummy_pageviews_for_metrics.py +++ b/osf/management/commands/make_dummy_pageviews_for_metrics.py @@ -74,6 +74,8 @@ def _generate_random_countedusage(self, n, max_age): item_guid=ITEM_GUID, session_id='freshen by key', user_is_authenticated=bool(random.randint(0, 1)), + item_public=bool(random.randint(0, 1)), + action_labels=[['view', 'download'][random.randint(0, 1)]], ) def _run_date_query(self, time_range_filter): @@ -103,8 +105,8 @@ def _run_date_query(self, time_range_filter): }, }) return { - 'min': result.aggs['min-timestamp'].value_as_string, - 'max': result.aggs['max-timestamp'].value_as_string, + 'min': result.aggs['min-timestamp'].value, + 'max': result.aggs['max-timestamp'].value, **{ str(bucket.key.date()): bucket.doc_count for bucket in result.aggs['by-date'] diff --git a/osf/management/commands/migrate_preprint_affiliation.py b/osf/management/commands/migrate_preprint_affiliation.py new file mode 100644 index 00000000000..e34c6dc6b27 --- /dev/null +++ b/osf/management/commands/migrate_preprint_affiliation.py @@ -0,0 +1,118 @@ +import datetime +import logging + +from django.core.management.base import BaseCommand +from django.db import transaction +from django.db.models import F, Exists, OuterRef + +from osf.models import PreprintContributor, InstitutionAffiliation + +logger = logging.getLogger(__name__) + +AFFILIATION_TARGET_DATE = datetime.datetime(2024, 9, 19, 14, 37, 48, tzinfo=datetime.timezone.utc) + + +class Command(BaseCommand): + """Assign affiliations from users to preprints where they have write or admin permissions, with optional exclusion by user GUIDs.""" + + help = 'Assign affiliations from users to preprints where they have write or admin permissions.' 
+ + def add_arguments(self, parser): + parser.add_argument( + '--exclude-guids', + nargs='+', + dest='exclude_guids', + help='List of user GUIDs to exclude from affiliation assignment' + ) + parser.add_argument( + '--dry-run', + action='store_true', + dest='dry_run', + help='If true, performs a dry run without making changes' + ) + parser.add_argument( + '--batch-size', + type=int, + default=1000, + dest='batch_size', + help='Number of contributors to process in each batch' + ) + + def handle(self, *args, **options): + start_time = datetime.datetime.now() + logger.info(f'Script started at: {start_time}') + + exclude_guids = set(options.get('exclude_guids') or []) + dry_run = options.get('dry_run', False) + batch_size = options.get('batch_size', 1000) + + if dry_run: + logger.info('Dry run mode activated.') + + processed_count, updated_count = assign_affiliations_to_preprints( + exclude_guids=exclude_guids, + dry_run=dry_run, + batch_size=batch_size + ) + + finish_time = datetime.datetime.now() + logger.info(f'Script finished at: {finish_time}') + logger.info(f'Total processed: {processed_count}, Updated: {updated_count}') + logger.info(f'Total run time: {finish_time - start_time}') + + +def assign_affiliations_to_preprints(exclude_guids=None, dry_run=True, batch_size=1000): + exclude_guids = exclude_guids or set() + processed_count = updated_count = 0 + + # Subquery to check if the user has any affiliated institutions + user_has_affiliations = Exists( + InstitutionAffiliation.objects.filter( + user=OuterRef('user') + ) + ) + + contributors_qs = PreprintContributor.objects.filter( + preprint__preprintgroupobjectpermission__permission__codename__in=['write_preprint'], + preprint__preprintgroupobjectpermission__group__user=F('user'), + ).filter( + user_has_affiliations + ).select_related( + 'user', + 'preprint' + ).exclude( + user__guids___id__in=exclude_guids + ).order_by('pk') # Ensure consistent ordering for batching + + total_contributors = contributors_qs.count() + logger.info(f'Total contributors to process: {total_contributors}') + + # Process contributors in batches + with transaction.atomic(): + for offset in range(0, total_contributors, batch_size): + # Use select_for_update() to ensure query hits the primary database + batch_contributors = contributors_qs[offset:offset + batch_size].select_for_update() + + logger.info(f'Processing contributors {offset + 1} to {min(offset + batch_size, total_contributors)}') + + for contributor in batch_contributors: + user = contributor.user + preprint = contributor.preprint + + if preprint.created > AFFILIATION_TARGET_DATE: + continue + + user_institutions = user.get_affiliated_institutions() + processed_count += 1 + if not dry_run: + preprint.affiliated_institutions.add(*user_institutions) + updated_count += 1 + logger.info( + f'Assigned {len(user_institutions)} affiliations from user <{user._id}> to preprint <{preprint._id}>.' + ) + else: + logger.info( + f'Dry run: Would assign {len(user_institutions)} affiliations from user <{user._id}> to preprint <{preprint._id}>.' 
+ ) + + return processed_count, updated_count diff --git a/osf/management/commands/monthly_reporters_go.py b/osf/management/commands/monthly_reporters_go.py index 8f9854a722b..7ab7b843434 100644 --- a/osf/management/commands/monthly_reporters_go.py +++ b/osf/management/commands/monthly_reporters_go.py @@ -1,67 +1,125 @@ +import datetime import logging from django.core.management.base import BaseCommand -from django.db.utils import OperationalError -from django.utils import timezone +from django.db import OperationalError as DjangoOperationalError +from elasticsearch.exceptions import ConnectionError as ElasticConnectionError +from psycopg2 import OperationalError as PostgresOperationalError from framework.celery_tasks import app as celery_app +import framework.sentry from osf.metrics.reporters import AllMonthlyReporters from osf.metrics.utils import YearMonth -from website.app import init_app logger = logging.getLogger(__name__) -MAXMONTH = 12 - +_CONTINUE_AFTER_ERRORS = ( + DjangoOperationalError, + ElasticConnectionError, + PostgresOperationalError, +) @celery_app.task(name='management.commands.monthly_reporters_go') -def monthly_reporters_go(report_year=None, report_month=None): - init_app() # OSF-specific setup - - if report_year and report_month: - report_yearmonth = YearMonth(report_year, report_month) - else: # default to last month if year and month not provided - today = timezone.now().date() - report_yearmonth = YearMonth( - year=today.year if today.month > 1 else today.year - 1, - month=today.month - 1 or MAXMONTH, - ) - for _reporter_key in AllMonthlyReporters.__members__.keys(): - monthly_reporter_go.apply_async(kwargs={ +def monthly_reporters_go(yearmonth: str = '', reporter_key: str = ''): + _yearmonth = ( + YearMonth.from_str(yearmonth) + if yearmonth + else YearMonth.from_date(datetime.date.today()).prior() # default last month + ) + _reporter_keys = ( + [reporter_key] + if reporter_key + else _enum_names(AllMonthlyReporters) + ) + for _reporter_key in _reporter_keys: + schedule_monthly_reporter.apply_async(kwargs={ + 'yearmonth': str(_yearmonth), 'reporter_key': _reporter_key, - 'yearmonth': str(report_yearmonth), }) +@celery_app.task(name='management.commands.schedule_monthly_reporter') +def schedule_monthly_reporter( + yearmonth: str, + reporter_key: str, + continue_after: dict | None = None, +): + _reporter = _get_reporter(reporter_key, yearmonth) + _last_kwargs = None + try: + for _kwargs in _reporter.iter_report_kwargs(continue_after=continue_after): + monthly_reporter_do.apply_async(kwargs={ + 'yearmonth': yearmonth, + 'reporter_key': reporter_key, + 'report_kwargs': _kwargs, + }) + _last_kwargs = _kwargs + except _CONTINUE_AFTER_ERRORS as _error: + # let the celery task succeed but log the error + framework.sentry.log_exception(_error) + # schedule another task to continue scheduling + if _last_kwargs is not None: + schedule_monthly_reporter.apply_async(kwargs={ + 'yearmonth': yearmonth, + 'reporter_key': reporter_key, + 'continue_after': _last_kwargs, + }) + + @celery_app.task( - name='management.commands.monthly_reporter_go', - autoretry_for=(OperationalError,), + name='management.commands.monthly_reporter_do', + autoretry_for=( + DjangoOperationalError, + ElasticConnectionError, + PostgresOperationalError, + ), max_retries=5, retry_backoff=True, - bind=True, ) -def monthly_reporter_go(task, reporter_key: str, yearmonth: str): - _reporter_class = AllMonthlyReporters[reporter_key].value - _parsed_yearmonth = YearMonth.from_str(yearmonth) - 
_reporter_class().run_and_record_for_month(_parsed_yearmonth) +def monthly_reporter_do(reporter_key: str, yearmonth: str, report_kwargs: dict): + _reporter = _get_reporter(reporter_key, yearmonth) + _report = _reporter.report(**report_kwargs) + if _report is not None: + _report.report_yearmonth = _reporter.yearmonth + _report.save() + _followup_task = _reporter.followup_task(_report) + if _followup_task is not None: + _followup_task.apply_async() class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument( 'yearmonth', - type=YearMonth.from_str, - default={'year': None, 'month': None}, + type=str, help='year and month (YYYY-MM)', ) + parser.add_argument( + '-r', '--reporter', + type=str, + choices={_name.lower() for _name in _enum_names(AllMonthlyReporters)}, + default='', + help='name of the reporter to run (default all)', + ) - def handle(self, *args, **options): - errors = monthly_reporters_go( - report_year=getattr(options.get('yearmonth'), 'year', None), - report_month=getattr(options.get('yearmonth'), 'month', None), + def handle(self, *args, **kwargs): + monthly_reporters_go( + yearmonth=kwargs['yearmonth'], + reporter_key=kwargs['reporter'].upper(), ) - for error_key, error_val in errors.items(): - self.stdout.write(self.style.ERROR(f'error running {error_key}: ') + error_val) - self.stdout.write(self.style.SUCCESS('done.')) + self.stdout.write(self.style.SUCCESS( + f'scheduling tasks for monthly reporter "{kwargs['reporter']}"...' + if kwargs['reporter'] + else 'scheduling tasks for all monthly reporters...' + )) + + +def _get_reporter(reporter_key: str, yearmonth: str): + _reporter_class = AllMonthlyReporters[reporter_key].value + return _reporter_class(YearMonth.from_str(yearmonth)) + + +def _enum_names(enum_cls) -> list[str]: + return list(enum_cls.__members__.keys()) diff --git a/osf/management/commands/recatalog_metadata.py b/osf/management/commands/recatalog_metadata.py index 43c647e5861..be52e9b0a0e 100644 --- a/osf/management/commands/recatalog_metadata.py +++ b/osf/management/commands/recatalog_metadata.py @@ -55,21 +55,6 @@ def _recatalog_all(queryset, chunk_size): recatalog(queryset, start_id=0, chunk_count=int(9e9), chunk_size=chunk_size) -def _recatalog_datacite_custom_types(chunk_size): - logger.info('recataloguing items with datacite custom type...') - # all preprints - _recatalog_all(Preprint.objects, chunk_size) - # objects with custom resource_type_general - for _model in {Registration, Node, OsfStorageFile}: - _queryset = ( - _model.objects - .exclude(guids__metadata_record__isnull=True) - .exclude(guids__metadata_record__resource_type_general='') - ) - _recatalog_all(_queryset, chunk_size) - logger.info('done recataloguing items with datacite custom type!') - - class Command(BaseCommand): def add_arguments(self, parser): type_group = parser.add_mutually_exclusive_group(required=True) @@ -103,14 +88,6 @@ def add_arguments(self, parser): action='store_true', help='recatalog metadata for users', ) - type_group.add_argument( - '--datacite-custom-types', - action='store_true', - help='''recatalog metadata for items with a specific datacite type, - including all preprints and items with custom resource_type_general - (may be slow for lack of database indexes) - ''', - ) provider_group = parser.add_mutually_exclusive_group() provider_group.add_argument( @@ -144,6 +121,11 @@ def add_arguments(self, parser): default=int(9e9), help='maximum number of chunks (default all/enough/lots)', ) + parser.add_argument( + '--also-decatalog', + 
action='store_true', + help='also remove private and deleted items from the catalog', + ) def handle(self, *args, **options): pls_all_types = options['all_types'] @@ -156,13 +138,7 @@ def handle(self, *args, **options): start_id = options['start_id'] chunk_size = options['chunk_size'] chunk_count = options['chunk_count'] - datacite_custom_types = options['datacite_custom_types'] - - if datacite_custom_types: # temporary arg for datacite 4.5 migration - assert not start_id, 'oh no, cannot resume with `--datacite-custom-types`' - assert not provider_ids, 'oh no, cannot filter providers with `--datacite-custom-types`' - _recatalog_datacite_custom_types(chunk_size) - return # end + also_decatalog = options['also_decatalog'] if pls_all_types: assert not start_id, 'choose a specific type to resume with --start-id' @@ -185,4 +161,16 @@ def handle(self, *args, **options): _queryset = _queryset.filter( provider__in=AbstractProvider.objects.filter(_id__in=provider_ids), ) + if not also_decatalog: + if provided_model is OsfStorageFile: + _queryset = _queryset.filter(deleted__isnull=True) + elif provided_model is OSFUser: + _queryset = _queryset.filter( + deleted__isnull=True, + is_active=True, + ).exclude(allow_indexing=False) + elif provided_model is Preprint: + _queryset = _queryset.filter(is_public=True, is_published=True, deleted__isnull=True) + else: + _queryset = _queryset.filter(is_public=True, deleted__isnull=True) recatalog(_queryset, start_id, chunk_count, chunk_size) diff --git a/osf/metadata/gather/basket.py b/osf/metadata/gather/basket.py index f28a4dee6d6..eb28a087ad3 100644 --- a/osf/metadata/gather/basket.py +++ b/osf/metadata/gather/basket.py @@ -19,15 +19,14 @@ class Basket: def __init__(self, focus: Focus): assert isinstance(focus, Focus) self.focus = focus - self.reset() # start with an empty basket (except the focus itself) + self.reset() # start with an empty basket def reset(self): self._gathertasks_done = set() - self._known_focus_dict = {} + self._known_focus_dict = {self.focus.iri: {self.focus}} self.gathered_metadata = rdfutils.contextualized_graph() - self._add_focus_reference(self.focus) - def pls_gather(self, predicate_map): # TODO: async + def pls_gather(self, predicate_map, *, include_defaults=True): # TODO: async '''go gatherers, go! 
@predicate_map: dict with rdflib.URIRef keys @@ -48,7 +47,7 @@ def pls_gather(self, predicate_map): # TODO: async }, }) ''' - self._do_gather(self.focus, predicate_map) + self._do_gather(self.focus, predicate_map, include_defaults=include_defaults) def __getitem__(self, slice_or_arg) -> typing.Iterable[rdflib.term.Node]: '''convenience for getting values from the basket @@ -98,14 +97,20 @@ def _maybe_gather_for_predicate_map(self, iri_or_focus, predicate_map): else: raise ValueError(f'expected `iri_or_focus` to be Focus or URIRef (got {iri_or_focus})') - def _do_gather(self, focus, predicate_map): + def _do_gather(self, focus, predicate_map, *, include_defaults=True): + if include_defaults: + self._add_focus_reference(focus) if not isinstance(predicate_map, dict): # allow iterable of predicates with no deeper paths predicate_map = { predicate_iri: None for predicate_iri in predicate_map } - for gatherer in get_gatherers(focus.rdftype, predicate_map.keys()): + for gatherer in get_gatherers( + focus.rdftype, + predicate_map.keys(), + include_focustype_defaults=include_defaults, + ): for (subj, pred, obj) in self._do_a_gathertask(gatherer, focus): if isinstance(obj, Focus): self._add_focus_reference(obj) diff --git a/osf/metadata/gather/gatherer.py b/osf/metadata/gather/gatherer.py index 2a8822c9d2a..0630e6d61ae 100644 --- a/osf/metadata/gather/gatherer.py +++ b/osf/metadata/gather/gatherer.py @@ -61,11 +61,16 @@ def add_gatherer(gatherer, predicate_iris, focustype_iris): ) -def get_gatherers(focustype_iri, predicate_iris): +def get_gatherers(focustype_iri, predicate_iris, *, include_focustype_defaults=True): gatherer_set = set() for focustype in (None, focustype_iri): for_focustype = __gatherer_registry.get(focustype, {}) - for predicate in (None, *predicate_iris): + _predicates = ( + (None, *predicate_iris) + if include_focustype_defaults + else predicate_iris + ) + for predicate in _predicates: gatherer_set.update(for_focustype.get(predicate, ())) return gatherer_set diff --git a/osf/metadata/osf_gathering.py b/osf/metadata/osf_gathering.py index 6e5e25c6d0b..9783f7b0879 100644 --- a/osf/metadata/osf_gathering.py +++ b/osf/metadata/osf_gathering.py @@ -1,11 +1,14 @@ '''gatherers of metadata from the osf database, in particular ''' +import datetime +import enum import logging from django.contrib.contenttypes.models import ContentType from django import db import rdflib +from api.caching.tasks import get_storage_usage_total from osf import models as osfdb from osf.metadata import gather from osf.metadata.rdfutils import ( @@ -19,6 +22,7 @@ OSF, OSFIO, OWL, + PROV, RDF, ROR, SKOS, @@ -27,7 +31,12 @@ without_namespace, smells_like_iri, ) -from osf.utils import workflows as osfworkflows +from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.utils import YearMonth +from osf.utils import ( + workflows as osfworkflows, + permissions as osfpermissions, +) from osf.utils.outcomes import ArtifactTypes from website import settings as website_settings @@ -47,13 +56,6 @@ def pls_get_magic_metadata_basket(osf_item) -> gather.Basket: return gather.Basket(focus) -def osfmap_for_type(rdftype_iri: str): - try: - return OSFMAP[rdftype_iri] - except KeyError: - raise ValueError(f'invalid OSFMAP type! 
expected one of {set(OSFMAP.keys())}, got {rdftype_iri}') - - ##### END "public" api ##### @@ -88,6 +90,7 @@ def osfmap_for_type(rdftype_iri: str): OSF.isContainedBy: OSF_OBJECT_REFERENCE, OSF.fileName: None, OSF.filePath: None, + OSF.hasFileVersion: None, } OSF_OBJECT = { @@ -131,16 +134,7 @@ def osfmap_for_type(rdftype_iri: str): DCTERMS.creator: OSF_AGENT_REFERENCE, }, OWL.sameAs: None, -} - -OSF_FILEVERSION = { - DCTERMS.created: None, - DCTERMS.creator: OSF_AGENT_REFERENCE, - DCTERMS.extent: None, - DCTERMS.modified: None, - DCTERMS.requires: None, - DCTERMS['format']: None, - OSF.versionNumber: None, + PROV.qualifiedAttribution: None, } OSFMAP = { @@ -193,7 +187,7 @@ def osfmap_for_type(rdftype_iri: str): DCTERMS.modified: None, DCTERMS.title: None, DCTERMS.type: None, - OSF.hasFileVersion: OSF_FILEVERSION, + OSF.hasFileVersion: None, OSF.isContainedBy: OSF_OBJECT_REFERENCE, OSF.fileName: None, OSF.filePath: None, @@ -211,6 +205,57 @@ def osfmap_for_type(rdftype_iri: str): }, } +# metadata not included in the core record +OSFMAP_SUPPLEMENT = { + OSF.Project: { + OSF.hasOsfAddon: None, + OSF.storageByteCount: None, + OSF.storageRegion: None, + }, + OSF.ProjectComponent: { + OSF.hasOsfAddon: None, + OSF.storageByteCount: None, + OSF.storageRegion: None, + }, + OSF.Registration: { + OSF.storageByteCount: None, + OSF.storageRegion: None, + }, + OSF.RegistrationComponent: { + OSF.storageByteCount: None, + OSF.storageRegion: None, + }, + OSF.Preprint: { + OSF.storageByteCount: None, + OSF.storageRegion: None, + }, + OSF.File: { + }, +} + +# metadata not included in the core record that expires after a month +OSFMAP_MONTHLY_SUPPLEMENT = { + OSF.Project: { + OSF.usage: None, + }, + OSF.ProjectComponent: { + OSF.usage: None, + }, + OSF.Registration: { + OSF.usage: None, + }, + OSF.RegistrationComponent: { + OSF.usage: None, + }, + OSF.Preprint: { + OSF.usage: None, + }, + OSF.File: { + OSF.usage: None, + }, +} + + OSF_ARTIFACT_PREDICATES = { ArtifactTypes.ANALYTIC_CODE: OSF.hasAnalyticCodeResource, ArtifactTypes.DATA: OSF.hasDataResource, @@ -218,6 +263,11 @@ def osfmap_for_type(rdftype_iri: str): ArtifactTypes.PAPERS: OSF.hasPapersResource, ArtifactTypes.SUPPLEMENTS: OSF.hasSupplementalResource, } +OSF_CONTRIBUTOR_ROLES = { + osfpermissions.READ: OSF['readonly-contributor'], + osfpermissions.WRITE: OSF['write-contributor'], + osfpermissions.ADMIN: OSF['admin-contributor'], +} BEPRESS_SUBJECT_SCHEME_URI = 'https://bepress.com/reference_guide_dc/disciplines/' BEPRESS_SUBJECT_SCHEME_TITLE = 'bepress Digital Commons Three-Tiered Taxonomy' @@ -259,6 +309,37 @@ def osfmap_for_type(rdftype_iri: str): OSF.Registration: 'StudyRegistration', } + +class OsfmapPartition(enum.Enum): + MAIN = OSFMAP + SUPPLEMENT = OSFMAP_SUPPLEMENT + MONTHLY_SUPPLEMENT = OSFMAP_MONTHLY_SUPPLEMENT + + @property + def is_supplementary(self) -> bool: + return self is not OsfmapPartition.MAIN + + def osfmap_for_type(self, rdftype_iri: str): + try: + return self.value[rdftype_iri] + except KeyError: + if self.is_supplementary: + return {} # allow missing types for non-main partitions + raise ValueError(f'invalid OSFMAP type! 
expected one of {set(self.value.keys())}, got {rdftype_iri}') + + def get_expiration_date(self, basket: gather.Basket) -> datetime.date | None: + if self is not OsfmapPartition.MONTHLY_SUPPLEMENT: + return None + # let a monthly report expire two months after its reporting period ends + # (this allows the *next* monthly report up to a month to compute, which + # aligns with COUNTER https://www.countermetrics.org/code-of-practice/ ) + # (HACK: entangled with `gather_last_month_usage` implementation, below) + _report_yearmonth_str = next(basket[OSF.usage / DCTERMS.temporal], None) + if _report_yearmonth_str is None: + return None + _report_yearmonth = YearMonth.from_str(_report_yearmonth_str) + return _report_yearmonth.next().next().month_end().date() + ##### END osfmap ##### @@ -619,6 +700,8 @@ def _gather_fileversion(fileversion, fileversion_iri): version_sha256 = (fileversion.metadata or {}).get('sha256') if version_sha256: yield (fileversion_iri, DCTERMS.requires, checksum_iri('sha-256', version_sha256)) + if fileversion.region is not None: + yield from _storage_region_triples(fileversion.region, subject_ref=fileversion_iri) @gather.er(OSF.contains) @@ -819,11 +902,24 @@ def gather_agents(focus): # TODO: preserve order via rdflib.Seq +@gather.er(PROV.qualifiedAttribution) +def gather_qualified_attributions(focus): + _contributor_set = getattr(focus.dbmodel, 'contributor_set', None) + if _contributor_set is not None: + for _contributor in _contributor_set.filter(visible=True).select_related('user'): + _osfrole_ref = OSF_CONTRIBUTOR_ROLES.get(_contributor.permission) + if _osfrole_ref is not None: + _attribution_ref = rdflib.BNode() + yield (PROV.qualifiedAttribution, _attribution_ref) + yield (_attribution_ref, PROV.agent, OsfFocus(_contributor.user)) + yield (_attribution_ref, DCAT.hadRole, _osfrole_ref) + + @gather.er(OSF.affiliation) def gather_affiliated_institutions(focus): if hasattr(focus.dbmodel, 'get_affiliated_institutions'): # like OSFUser institution_qs = focus.dbmodel.get_affiliated_institutions() - elif hasattr(focus.dbmodel, 'affiliated_institutions'): # like AbstractNode + elif hasattr(focus.dbmodel, 'affiliated_institutions'): # like AbstractNode or Preprint institution_qs = focus.dbmodel.affiliated_institutions.all() else: institution_qs = () @@ -1029,3 +1125,63 @@ def gather_cedar_templates(focus): template_iri = rdflib.URIRef(record.get_template_semantic_iri()) yield (OSF.hasCedarTemplate, template_iri) yield (template_iri, DCTERMS.title, record.get_template_name()) + + +@gather.er(OSF.usage) +def gather_last_month_usage(focus): + _usage_report = PublicItemUsageReport.for_last_month( + item_osfid=osfguid_from_iri(focus.iri), + ) + if _usage_report is not None: + _usage_report_ref = rdflib.BNode() + yield (OSF.usage, _usage_report_ref) + yield (_usage_report_ref, DCAT.accessService, rdflib.URIRef(website_settings.DOMAIN.rstrip('/'))) + yield (_usage_report_ref, FOAF.primaryTopic, focus.iri) + yield (_usage_report_ref, DCTERMS.temporal, rdflib.Literal( + str(_usage_report.report_yearmonth), + datatype=rdflib.XSD.gYearMonth, + )) + yield (_usage_report_ref, OSF.viewCount, _usage_report.view_count) + yield (_usage_report_ref, OSF.viewSessionCount, _usage_report.view_session_count) + yield (_usage_report_ref, OSF.downloadCount, _usage_report.download_count) + yield (_usage_report_ref, OSF.downloadSessionCount, _usage_report.download_session_count) + + +@gather.er(OSF.hasOsfAddon) +def gather_addons(focus): + # note: when gravyvalet exists, use `iterate_addons_for_resource` 
+ # from osf.external.gravy_valet.request_helpers and get urls like + # "https://addons.osf.example/v1/addon-imps/..." instead of a urn + for _addon_settings in focus.dbmodel.get_addons(): + if not _addon_settings.config.added_default: # skip always-on addons + _addon_ref = rdflib.URIRef(f'urn:osf.io:addons:{_addon_settings.short_name}') + yield (OSF.hasOsfAddon, _addon_ref) + yield (_addon_ref, RDF.type, OSF.AddonImplementation) + yield (_addon_ref, DCTERMS.identifier, _addon_settings.short_name) + yield (_addon_ref, SKOS.prefLabel, _addon_settings.config.full_name) + + +@gather.er(OSF.storageRegion) +def gather_storage_region(focus): + _region = getattr(focus.dbmodel, 'osfstorage_region', None) + if _region is not None: + yield from _storage_region_triples(_region) + + +def _storage_region_triples(region, *, subject_ref=None): + _region_ref = rdflib.URIRef(region.absolute_api_v2_url) + if subject_ref is None: + yield (OSF.storageRegion, _region_ref) + else: + yield (subject_ref, OSF.storageRegion, _region_ref) + yield (_region_ref, SKOS.prefLabel, rdflib.Literal(region.name, lang='en')) + + +@gather.er( + OSF.storageByteCount, + focustype_iris=[OSF.Project, OSF.ProjectComponent, OSF.Registration, OSF.RegistrationComponent, OSF.Preprint] +) +def gather_storage_byte_count(focus): + _storage_usage_total = get_storage_usage_total(focus.dbmodel) + if _storage_usage_total is not None: + yield (OSF.storageByteCount, _storage_usage_total) diff --git a/osf/metadata/rdfutils.py b/osf/metadata/rdfutils.py index cd944169e20..d2596ad344e 100644 --- a/osf/metadata/rdfutils.py +++ b/osf/metadata/rdfutils.py @@ -23,6 +23,7 @@ RDF = rdflib.Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#') # "resource description framework" SKOS = rdflib.Namespace('http://www.w3.org/2004/02/skos/core#') # "simple knowledge organization system" DCAT = rdflib.Namespace('http://www.w3.org/ns/dcat#') # "data catalog (vocabulary)" +PROV = rdflib.Namespace('http://www.w3.org/ns/prov#') # "provenance" # non-standard namespace for datacite terms (resolves to datacite docs) DATACITE = rdflib.Namespace('https://schema.datacite.org/meta/kernel-4/#') @@ -38,6 +39,7 @@ 'skos': SKOS, 'dcmitype': DCMITYPE, 'dcat': DCAT, + 'prov': PROV, } diff --git a/osf/metadata/serializers/turtle.py b/osf/metadata/serializers/turtle.py index 649614b0bfa..e90db45f2f6 100644 --- a/osf/metadata/serializers/turtle.py +++ b/osf/metadata/serializers/turtle.py @@ -1,4 +1,4 @@ -from osf.metadata.osf_gathering import osfmap_for_type +from osf.metadata.osf_gathering import OsfmapPartition from osf.metadata.serializers import _base @@ -9,5 +9,9 @@ def filename_for_itemid(self, itemid: str): return f'{itemid}-metadata.ttl' def serialize(self) -> str: - self.basket.pls_gather(osfmap_for_type(self.basket.focus.rdftype)) + _partition = self.serializer_config.get('osfmap_partition', OsfmapPartition.MAIN) + self.basket.pls_gather( + _partition.osfmap_for_type(self.basket.focus.rdftype), + include_defaults=(_partition is OsfmapPartition.MAIN), + ) return self.basket.gathered_metadata.serialize(format='turtle') diff --git a/osf/metrics/counted_usage.py b/osf/metrics/counted_usage.py index e6a3abf9cd5..39b3b74129b 100644 --- a/osf/metrics/counted_usage.py +++ b/osf/metrics/counted_usage.py @@ -10,7 +10,6 @@ import pytz from osf.metrics.utils import stable_key -from osf.models import Guid logger = logging.getLogger(__name__) @@ -87,6 +86,7 @@ def _autofill_fields(sender, instance, **kwargs): _fill_pageview_info(instance) item_guid = getattr(instance, 'item_guid', 
None) if item_guid: + from osf.models import Guid guid_instance = Guid.load(item_guid) if guid_instance and guid_instance.referent: _fill_osfguid_info(instance, guid_instance.referent) @@ -104,10 +104,10 @@ def _fill_pageview_info(counted_usage): def _fill_osfguid_info(counted_usage, guid_referent): counted_usage.item_public = _get_ispublic(guid_referent) - counted_usage.item_type = type(guid_referent).__name__.lower() + counted_usage.item_type = get_item_type(guid_referent) counted_usage.surrounding_guids = _get_surrounding_guids(guid_referent) if not counted_usage.provider_id: - counted_usage.provider_id = _get_provider_id(guid_referent) + counted_usage.provider_id = get_provider_id(guid_referent) def _fill_document_id(counted_usage): @@ -142,6 +142,7 @@ def _fill_document_id(counted_usage): counted_usage.session_id, counted_usage.timestamp.date(), time_window, + ','.join(sorted(counted_usage.action_labels)), ) @@ -153,7 +154,7 @@ def _get_ispublic(guid_referent): return getattr(maybe_public, 'is_public', None) # quacks like AbstractNode -def _get_provider_id(guid_referent): +def get_provider_id(guid_referent): provider = getattr(guid_referent, 'provider', None) if isinstance(provider, str): return provider # quacks like BaseFileNode @@ -162,6 +163,10 @@ def _get_provider_id(guid_referent): return 'osf' # quacks like Node, Comment, WikiPage +def get_item_type(guid_referent): + return type(guid_referent).__name__.lower() + + def _get_immediate_wrapper(guid_referent): if hasattr(guid_referent, 'verified_publishable'): return None # quacks like Preprint diff --git a/osf/metrics/preprint_metrics.py b/osf/metrics/preprint_metrics.py index 472cd01f698..4b64398a5c6 100644 --- a/osf/metrics/preprint_metrics.py +++ b/osf/metrics/preprint_metrics.py @@ -37,8 +37,8 @@ def record_for_preprint(cls, preprint, user=None, **kwargs): ) @classmethod - def get_count_for_preprint(cls, preprint, after=None, before=None, index=None): - search = cls.search(after=after, before=before, index=index).filter('match', preprint_id=preprint._id) + def get_count_for_preprint(cls, preprint, after=None, before=None, index=None) -> int: + search = cls.search(index=index).filter('term', preprint_id=preprint._id) timestamp = {} if after: timestamp['gte'] = after diff --git a/osf/metrics/reporters/__init__.py b/osf/metrics/reporters/__init__.py index 1f8e0fba862..e6966414c3c 100644 --- a/osf/metrics/reporters/__init__.py +++ b/osf/metrics/reporters/__init__.py @@ -4,12 +4,16 @@ from .storage_addon_usage import StorageAddonUsageReporter from .download_count import DownloadCountReporter from .institution_summary import InstitutionSummaryReporter +from .institutional_users import InstitutionalUsersReporter +from .institution_summary_monthly import InstitutionalSummaryMonthlyReporter from .new_user_domain import NewUserDomainReporter from .node_count import NodeCountReporter from .osfstorage_file_count import OsfstorageFileCountReporter from .preprint_count import PreprintCountReporter +from .public_item_usage import PublicItemUsageReporter from .user_count import UserCountReporter from .spam_count import SpamCountReporter +from .private_spam_metrics import PrivateSpamMetricsReporter class AllDailyReporters(enum.Enum): @@ -26,3 +30,7 @@ class AllDailyReporters(enum.Enum): class AllMonthlyReporters(enum.Enum): SPAM_COUNT = SpamCountReporter + INSTITUTIONAL_USERS = InstitutionalUsersReporter + INSTITUTIONAL_SUMMARY = InstitutionalSummaryMonthlyReporter + ITEM_USAGE = PublicItemUsageReporter + PRIVATE_SPAM_METRICS = 
PrivateSpamMetricsReporter diff --git a/osf/metrics/reporters/_base.py b/osf/metrics/reporters/_base.py index d3bf1722523..707e869522b 100644 --- a/osf/metrics/reporters/_base.py +++ b/osf/metrics/reporters/_base.py @@ -1,22 +1,32 @@ +from collections import abc +import dataclasses import logging +import celery + +from osf.metrics.reports import MonthlyReport from osf.metrics.utils import YearMonth logger = logging.getLogger(__name__) +@dataclasses.dataclass class MonthlyReporter: - def report(self, report_yearmonth: YearMonth): + yearmonth: YearMonth + + def iter_report_kwargs(self, continue_after: dict | None = None) -> abc.Iterator[dict]: + # override for multiple reports per month + if continue_after is None: + yield {} # by default, calls `.report()` once with no kwargs + + def report(self, **report_kwargs) -> MonthlyReport | None: """build a report for the given month """ - raise NotImplementedError(f'{self.__name__} must implement `report`') + raise NotImplementedError(f'{self.__class__.__name__} must implement `report`') - def run_and_record_for_month(self, report_yearmonth: YearMonth): - reports = self.report(report_yearmonth) - for report in reports: - assert report.report_yearmonth == str(report_yearmonth) - report.save() + def followup_task(self, report) -> celery.Signature | None: + return None class DailyReporter: @@ -25,7 +35,7 @@ def report(self, report_date): return an iterable of DailyReport (unsaved) """ - raise NotImplementedError(f'{self.__name__} must implement `report`') + raise NotImplementedError(f'{self.__class__.__name__} must implement `report`') def run_and_record_for_date(self, report_date): reports = self.report(report_date) diff --git a/osf/metrics/reporters/institution_summary_monthly.py b/osf/metrics/reporters/institution_summary_monthly.py new file mode 100644 index 00000000000..4748860db32 --- /dev/null +++ b/osf/metrics/reporters/institution_summary_monthly.py @@ -0,0 +1,112 @@ +from django.contrib.contenttypes.models import ContentType +from django.db.models import Q, F, Sum, OuterRef, Exists + +from osf.models import Institution, Preprint, AbstractNode, FileVersion, NodeLog, PreprintLog +from osf.models.spam import SpamStatus +from addons.osfstorage.models import OsfStorageFile +from osf.metrics.reports import InstitutionMonthlySummaryReport +from ._base import MonthlyReporter + + +class InstitutionalSummaryMonthlyReporter(MonthlyReporter): + """Generate an InstitutionMonthlySummaryReport for each institution.""" + + def iter_report_kwargs(self, continue_after: dict | None = None): + _inst_qs = Institution.objects.order_by('pk') + if continue_after: + _inst_qs = _inst_qs.filter(pk__gt=continue_after['institution_pk']) + for _pk in _inst_qs.values_list('pk', flat=True): + yield {'institution_pk': _pk} + + def report(self, **report_kwargs): + _institution = Institution.objects.get(pk=report_kwargs['institution_pk']) + return self.generate_report(_institution) + + def generate_report(self, institution): + node_queryset = institution.nodes.filter( + deleted__isnull=True, + created__lt=self.yearmonth.month_end() + ).exclude( + spam_status=SpamStatus.SPAM, + ) + + preprint_queryset = self.get_published_preprints(institution, self.yearmonth) + + return InstitutionMonthlySummaryReport( + institution_id=institution._id, + user_count=institution.get_institution_users().count(), + private_project_count=self._get_count(node_queryset, 'osf.node', is_public=False), + public_project_count=self._get_count(node_queryset, 'osf.node', is_public=True), + 
public_registration_count=self._get_count(node_queryset, 'osf.registration', is_public=True), + embargoed_registration_count=self._get_count(node_queryset, 'osf.registration', is_public=False), + published_preprint_count=preprint_queryset.count(), + storage_byte_count=self.get_storage_size(node_queryset, preprint_queryset), + public_file_count=self.get_files(node_queryset, preprint_queryset, is_public=True).count(), + monthly_logged_in_user_count=self.get_monthly_logged_in_user_count(institution, self.yearmonth), + monthly_active_user_count=self.get_monthly_active_user_count(institution, self.yearmonth), + ) + + def _get_count(self, node_queryset, node_type, is_public): + return node_queryset.filter(type=node_type, is_public=is_public, root_id=F('pk')).count() + + def get_published_preprints(self, institution, yearmonth): + queryset = Preprint.objects.can_view().filter( + affiliated_institutions=institution, + created__lte=yearmonth.month_end() + ).exclude( + spam_status=SpamStatus.SPAM + ) + + return queryset + + def get_files(self, node_queryset, preprint_queryset, is_public=None): + public_kwargs = {} + if is_public: + public_kwargs = {'is_public': is_public} + + target_node_q = Q( + target_object_id__in=node_queryset.filter(**public_kwargs).values('pk'), + target_content_type=ContentType.objects.get_for_model(AbstractNode), + ) + target_preprint_q = Q( + target_object_id__in=preprint_queryset.values('pk'), + target_content_type=ContentType.objects.get_for_model(Preprint), + ) + return OsfStorageFile.objects.filter( + deleted__isnull=True, purged__isnull=True + ).filter(target_node_q | target_preprint_q) + + def get_storage_size(self, node_queryset, preprint_queryset): + files = self.get_files(node_queryset, preprint_queryset) + return FileVersion.objects.filter( + size__gt=0, + purged__isnull=True, + basefilenode__in=files + ).aggregate(storage_bytes=Sum('size', default=0))['storage_bytes'] + + def get_monthly_logged_in_user_count(self, institution, yearmonth): + return institution.get_institution_users().filter( + date_last_login__gte=yearmonth.month_start(), + date_last_login__lt=yearmonth.month_end() + ).count() + + def get_monthly_active_user_count(self, institution, yearmonth): + start_date = yearmonth.month_start() + end_date = yearmonth.month_end() + + nodelogs = NodeLog.objects.filter( + user=OuterRef('pk'), + created__gte=start_date, + created__lt=end_date + ) + preprintlogs = PreprintLog.objects.filter( + user=OuterRef('pk'), + created__gte=start_date, + created__lt=end_date + ) + + return institution.get_institution_users().filter( + date_disabled__isnull=True + ).annotate( + has_logs=Exists(nodelogs) | Exists(preprintlogs) + ).filter(has_logs=True).count() diff --git a/osf/metrics/reporters/institutional_users.py b/osf/metrics/reporters/institutional_users.py new file mode 100644 index 00000000000..512472a3d96 --- /dev/null +++ b/osf/metrics/reporters/institutional_users.py @@ -0,0 +1,191 @@ +import dataclasses + +from django.contrib.contenttypes.models import ContentType +from django.db.models import Q, F, Sum + +from osf import models as osfdb +from osf.models.spam import SpamStatus +from addons.osfstorage.models import OsfStorageFile +from osf.metrics.reports import InstitutionalUserReport +from osf.metrics.utils import YearMonth +from ._base import MonthlyReporter + + +class InstitutionalUsersReporter(MonthlyReporter): + '''build an InstitutionalUserReport for each institution-user affiliation + + built for the institution dashboard at 
://osf.example/institutions//dashboard/, + which offers institutional admins insight into how people at their institution are + using osf, based on their explicitly-affiliated osf objects + ''' + def iter_report_kwargs(self, continue_after: dict | None = None): + _before_datetime = self.yearmonth.month_end() + _inst_qs = ( + osfdb.Institution.objects + .filter(created__lt=_before_datetime) + .order_by('pk') + ) + if continue_after: + _inst_qs = _inst_qs.filter(pk__gte=continue_after['institution_pk']) + for _institution in _inst_qs: + _user_qs = _institution.get_institution_users().filter(created__lt=_before_datetime) + if continue_after and (_institution.pk == continue_after['institution_pk']): + _user_qs = _user_qs.filter(pk__gt=continue_after['user_pk']) + for _user_pk in _user_qs.values_list('pk', flat=True): + yield {'institution_pk': _institution.pk, 'user_pk': _user_pk} + + def report(self, **report_kwargs): + _institution = osfdb.Institution.objects.get(pk=report_kwargs['institution_pk']) + _user = osfdb.OSFUser.objects.get(pk=report_kwargs['user_pk']) + _helper = _InstiUserReportHelper(_institution, _user, self.yearmonth) + return _helper.report + + +# helper +@dataclasses.dataclass +class _InstiUserReportHelper: + institution: osfdb.Institution + user: osfdb.OSFUser + yearmonth: YearMonth + report: InstitutionalUserReport = dataclasses.field(init=False) + + def __post_init__(self): + _affiliation = self.user.get_institution_affiliation(self.institution._id) + self.report = InstitutionalUserReport( + institution_id=self.institution._id, + user_id=self.user._id, + user_name=self.user.fullname, + department_name=(_affiliation.sso_department or None), + month_last_login=( + YearMonth.from_date(self.user.date_last_login) + if self.user.date_last_login is not None + else None + ), + month_last_active=self._get_last_active(), + account_creation_date=YearMonth.from_date(self.user.created), + orcid_id=self.user.get_verified_external_id('ORCID', verified_only=True), + public_project_count=self._public_project_queryset().count(), + private_project_count=self._private_project_queryset().count(), + public_registration_count=self._public_registration_queryset().count(), + embargoed_registration_count=self._embargoed_registration_queryset().count(), + public_file_count=self._public_osfstorage_file_count(), + published_preprint_count=self._published_preprint_queryset().count(), + storage_byte_count=self._storage_byte_count(), + ) + + @property + def before_datetime(self): + return self.yearmonth.month_end() + + def _node_queryset(self): + _institution_node_qs = self.institution.nodes.filter( + created__lt=self.before_datetime, + is_deleted=False, + ).exclude(spam_status=SpamStatus.SPAM) + return osfdb.Node.objects.get_nodes_for_user( + user=self.user, + base_queryset=_institution_node_qs, + ) + + def _public_project_queryset(self): + return self._node_queryset().filter( + type='osf.node', # `type` field from TypedModel + is_public=True, + root_id=F('pk'), # only root nodes + ) + + def _private_project_queryset(self): + return self._node_queryset().filter( + type='osf.node', # `type` field from TypedModel + is_public=False, + root_id=F('pk'), # only root nodes + ) + + def _public_registration_queryset(self): + return self._node_queryset().filter( + type='osf.registration', # `type` field from TypedModel + is_public=True, + root_id=F('pk'), # only root nodes + ) + + def _embargoed_registration_queryset(self): + return self._node_queryset().filter( + type='osf.registration', # `type` field from 
TypedModel + is_public=False, + root_id=F('pk'), # only root nodes + embargo__end_date__gte=self.before_datetime, + ) + + def _published_preprint_queryset(self): + return ( + osfdb.Preprint.objects.can_view() # published/publicly-viewable + .filter( + affiliated_institutions=self.institution, + _contributors=self.user, + date_published__lt=self.before_datetime, + ) + .exclude(spam_status=SpamStatus.SPAM) + ) + + def _public_osfstorage_file_querysets(self): + _target_node_q = Q( + # any public project, registration, project component, or registration component + target_object_id__in=self._node_queryset().filter(is_public=True).values('pk'), + target_content_type=ContentType.objects.get_for_model(osfdb.AbstractNode), + ) + _target_preprint_q = Q( + target_object_id__in=self._published_preprint_queryset().values('pk'), + target_content_type=ContentType.objects.get_for_model(osfdb.Preprint), + ) + return ( # split into two queries to avoid a parallel sequence scan on BFN + OsfStorageFile.objects + .filter( + created__lt=self.before_datetime, + deleted__isnull=True, + purged__isnull=True, + ) + .filter(_target_node_q), + OsfStorageFile.objects + .filter( + created__lt=self.before_datetime, + deleted__isnull=True, + purged__isnull=True, + ) + .filter(_target_preprint_q) + ) + + def _public_osfstorage_file_count(self): + return sum( + _target_queryset.count() for _target_queryset + in self._public_osfstorage_file_querysets() + ) + + def _storage_byte_count(self): + return sum( + osfdb.FileVersion.objects.filter( + size__gt=0, + created__lt=self.before_datetime, + purged__isnull=True, + basefilenode__in=_target_queryset, + ).aggregate(storage_bytes=Sum('size', default=0))['storage_bytes'] + for _target_queryset + in self._public_osfstorage_file_querysets() + ) + + def _get_last_active(self): + end_date = self.yearmonth.month_end() + + node_logs = self.user.logs.filter(created__lt=end_date).order_by('-created') + preprint_logs = self.user.preprint_logs.filter(created__lt=end_date).order_by('-created') + + dates = filter(bool, [ + node_logs.values_list('created', flat=True).first(), + preprint_logs.values_list('created', flat=True).first(), + ]) + + latest_activity_date = max(dates, default=None) + + if latest_activity_date: + return YearMonth.from_date(latest_activity_date) + else: + return None diff --git a/osf/metrics/reporters/private_spam_metrics.py b/osf/metrics/reporters/private_spam_metrics.py new file mode 100644 index 00000000000..40f259af325 --- /dev/null +++ b/osf/metrics/reporters/private_spam_metrics.py @@ -0,0 +1,28 @@ +from osf.metrics.reports import PrivateSpamMetricsReport +from osf.external.oopspam.client import OOPSpamClient +from osf.external.askismet.client import AkismetClient +from ._base import MonthlyReporter + +class PrivateSpamMetricsReporter(MonthlyReporter): + report_name = 'Private Spam Metrics' + + def report(self): + target_month = self.yearmonth.month_start() + next_month = self.yearmonth.month_end() + + oopspam_client = OOPSpamClient() + akismet_client = AkismetClient() + + report = PrivateSpamMetricsReport( + report_yearmonth=str(self.yearmonth), + node_oopspam_flagged=oopspam_client.get_flagged_count(target_month, next_month, category='node'), + node_oopspam_hammed=oopspam_client.get_hammed_count(target_month, next_month, category='node'), + node_akismet_flagged=akismet_client.get_flagged_count(target_month, next_month, category='node'), + node_akismet_hammed=akismet_client.get_hammed_count(target_month, next_month, category='node'), + 
preprint_oopspam_flagged=oopspam_client.get_flagged_count(target_month, next_month, category='preprint'), + preprint_oopspam_hammed=oopspam_client.get_hammed_count(target_month, next_month, category='preprint'), + preprint_akismet_flagged=akismet_client.get_flagged_count(target_month, next_month, category='preprint'), + preprint_akismet_hammed=akismet_client.get_hammed_count(target_month, next_month, category='preprint') + ) + + return report diff --git a/osf/metrics/reporters/public_item_usage.py b/osf/metrics/reporters/public_item_usage.py new file mode 100644 index 00000000000..cc401d50bd7 --- /dev/null +++ b/osf/metrics/reporters/public_item_usage.py @@ -0,0 +1,310 @@ +from __future__ import annotations +import datetime +import typing + +import waffle +if typing.TYPE_CHECKING: + import elasticsearch_dsl as edsl + +import osf.features +from osf.metadata.osf_gathering import OsfmapPartition +from osf.metrics.counted_usage import ( + CountedAuthUsage, + get_item_type, + get_provider_id, +) +from osf.metrics.preprint_metrics import ( + PreprintDownload, + PreprintView, +) +from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.utils import YearMonth +from osf import models as osfdb +from website import settings as website_settings +from ._base import MonthlyReporter + + +_CHUNK_SIZE = 500 + +_MAX_CARDINALITY_PRECISION = 40000 # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html#_precision_control + + +class _SkipItem(Exception): + pass + + +class PublicItemUsageReporter(MonthlyReporter): + '''build a PublicItemUsageReport for each public item + + includes projects, project components, registrations, registration components, and preprints + ''' + def iter_report_kwargs(self, continue_after: dict | None = None): + _after_osfid = continue_after['osfid'] if continue_after else None + for _osfid in _zip_sorted( + self._countedusage_osfids(_after_osfid), + self._preprintview_osfids(_after_osfid), + self._preprintdownload_osfids(_after_osfid), + ): + yield {'osfid': _osfid} + + def report(self, **report_kwargs): + _osfid = report_kwargs['osfid'] + # get usage metrics from several sources: + # - osf.metrics.counted_usage: + # - views and downloads for each item (using `CountedAuthUsage.item_guid`) + # - views for each item's components and files (using `CountedAuthUsage.surrounding_guids`) + # - osf.metrics.preprint_metrics: + # - preprint views and downloads + # - PageCounter? 
(no) + try: + _guid = osfdb.Guid.load(_osfid) + if _guid is None or _guid.referent is None: + raise _SkipItem + _obj = _guid.referent + _report = self._init_report(_obj) + self._fill_report_counts(_report, _obj) + if not any(( + _report.view_count, + _report.view_session_count, + _report.download_count, + _report.download_session_count, + )): + raise _SkipItem + return _report + except _SkipItem: + return None + + def followup_task(self, report): + _is_last_month = report.report_yearmonth.next() == YearMonth.from_date(datetime.date.today()) + if _is_last_month: + from api.share.utils import task__update_share + return task__update_share.signature( + args=(report.item_osfid,), + kwargs={ + 'is_backfill': True, + 'osfmap_partition_name': OsfmapPartition.MONTHLY_SUPPLEMENT.name, + }, + countdown=30, # give index time to settle + ) + + def _countedusage_osfids(self, after_osfid: str | None) -> typing.Iterator[str]: + _search = self._base_usage_search() + _search.aggs.bucket( + 'agg_osfid', + 'composite', + sources=[{'osfid': {'terms': {'field': 'item_guid'}}}], + size=_CHUNK_SIZE, + ) + return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) + + def _preprintview_osfids(self, after_osfid: str | None) -> typing.Iterator[str]: + _search = ( + PreprintView.search() + .filter('range', timestamp={ + 'gte': self.yearmonth.month_start(), + 'lt': self.yearmonth.month_end(), + }) + .extra(size=0) # only aggregations, no hits + ) + _search.aggs.bucket( + 'agg_osfid', + 'composite', + sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}], + size=_CHUNK_SIZE, + ) + return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) + + def _preprintdownload_osfids(self, after_osfid: str | None) -> typing.Iterator[str]: + _search = ( + PreprintDownload.search() + .filter('range', timestamp={ + 'gte': self.yearmonth.month_start(), + 'lt': self.yearmonth.month_end(), + }) + .extra(size=0) # only aggregations, no hits + ) + _search.aggs.bucket( + 'agg_osfid', + 'composite', + sources=[{'osfid': {'terms': {'field': 'preprint_id'}}}], + size=_CHUNK_SIZE, + ) + return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) + + def _init_report(self, osf_obj) -> PublicItemUsageReport: + if not _is_item_public(osf_obj): + raise _SkipItem + return PublicItemUsageReport( + item_osfid=osf_obj._id, + item_type=[get_item_type(osf_obj)], + provider_id=[get_provider_id(osf_obj)], + platform_iri=[website_settings.DOMAIN], + # leave counts null; will be set if there's data + ) + + def _fill_report_counts(self, report, osf_obj): + if ( + isinstance(osf_obj, osfdb.Preprint) + and not waffle.switch_is_active(osf.features.COUNTEDUSAGE_UNIFIED_METRICS_2024) # type: ignore[attr-defined] + ): + # note: no session-count info in preprint metrics + report.view_count = self._preprint_views(osf_obj) + report.download_count = self._preprint_downloads(osf_obj) + else: + ( + report.view_count, + report.view_session_count, + ) = self._countedusage_view_counts(osf_obj) + ( + report.download_count, + report.download_session_count, + ) = self._countedusage_download_counts(osf_obj) + + def _base_usage_search(self): + return ( + CountedAuthUsage.search() + .filter('term', item_public=True) + .filter('range', timestamp={ + 'gte': self.yearmonth.month_start(), + 'lt': self.yearmonth.month_end(), + }) + .extra(size=0) # only aggregations, no hits + ) + + def _countedusage_view_counts(self, osf_obj) -> tuple[int, int]: + '''compute view_session_count separately to avoid 
double-counting + + (the same session may be represented in both the composite agg on `item_guid` + and that on `surrounding_guids`) + ''' + _search = ( + self._base_usage_search() + .query( + 'bool', + filter=[ + {'term': {'action_labels': CountedAuthUsage.ActionLabel.VIEW.value}}, + ], + should=[ + {'term': {'item_guid': osf_obj._id}}, + {'term': {'surrounding_guids': osf_obj._id}}, + ], + minimum_should_match=1, + ) + ) + _search.aggs.metric( + 'agg_session_count', + 'cardinality', + field='session_id', + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _view_count = _response.hits.total + _view_session_count = ( + _response.aggregations.agg_session_count.value + if 'agg_session_count' in _response.aggregations + else 0 + ) + return (_view_count, _view_session_count) + + def _countedusage_download_counts(self, osf_obj) -> tuple[int, int]: + '''aggregate downloads on each osfid (not including components/files)''' + _search = ( + self._base_usage_search() + .filter('term', item_guid=osf_obj._id) + .filter('term', action_labels=CountedAuthUsage.ActionLabel.DOWNLOAD.value) + ) + # agg: get download session count + _search.aggs.metric( + 'agg_session_count', + 'cardinality', + field='session_id', + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _download_count = _response.hits.total + _download_session_count = ( + _response.aggregations.agg_session_count.value + if 'agg_session_count' in _response.aggregations + else 0 + ) + return (_download_count, _download_session_count) + + def _preprint_views(self, preprint: osfdb.Preprint) -> int: + '''aggregate views on each preprint''' + return PreprintView.get_count_for_preprint( + preprint=preprint, + after=self.yearmonth.month_start(), + before=self.yearmonth.month_end(), + ) + + def _preprint_downloads(self, preprint: osfdb.Preprint) -> int: + '''aggregate downloads on each preprint''' + return PreprintDownload.get_count_for_preprint( + preprint=preprint, + after=self.yearmonth.month_start(), + before=self.yearmonth.month_end(), + ) + + +def _is_item_public(osfid_referent) -> bool: + if isinstance(osfid_referent, osfdb.Preprint): + return bool(osfid_referent.verified_publishable) # quacks like Preprint + return getattr(osfid_referent, 'is_public', False) # quacks like AbstractNode + + +def _zip_sorted( + *iterators: typing.Iterator[str], +) -> typing.Iterator[str]: + '''loop thru multiple iterators on sorted (ascending) sequences of strings + ''' + _nexts = { # holds the next value from each iterator, or None + _i: next(_iter, None) + for _i, _iter in enumerate(iterators) + } + while True: + _nonnull_nexts = [ + _next + for _next in _nexts.values() + if _next is not None + ] + if not _nonnull_nexts: + return # all done + _value = min(_nonnull_nexts) + yield _value + for _i, _iter in enumerate(iterators): + if _nexts[_i] == _value: + _nexts[_i] = next(_iter, None) + + +def _iter_composite_bucket_keys( + search: edsl.Search, + composite_agg_name: str, + composite_source_name: str, + after: str | None = None, +) -> typing.Iterator[str]: + '''iterate thru *all* buckets of a composite aggregation, requesting new pages as needed + + assumes the given search has a composite aggregation of the given name + with a single value source of the given name + + updates the search in-place for subsequent pages + ''' + if after is not None: + search.aggs[composite_agg_name].after = {composite_source_name: after} + while True: + _page_response = search.execute(ignore_cache=True) # reused 
search object has the previous page cached + try: + _agg_result = _page_response.aggregations[composite_agg_name] + except KeyError: + return # no data; all done + for _bucket in _agg_result.buckets: + _key = _bucket.key.to_dict() + assert set(_key.keys()) == {composite_source_name}, f'expected only one key ("{composite_source_name}") in {_bucket.key}' + yield _key[composite_source_name] + # update the search for the next page + try: + _next_after = _agg_result.after_key + except AttributeError: + return # all done + else: + search.aggs[composite_agg_name].after = _next_after diff --git a/osf/metrics/reporters/spam_count.py b/osf/metrics/reporters/spam_count.py index 54feae8bee5..319381fe899 100644 --- a/osf/metrics/reporters/spam_count.py +++ b/osf/metrics/reporters/spam_count.py @@ -5,15 +5,14 @@ from osf.models import PreprintLog, NodeLog from osf.models.spam import SpamStatus - class SpamCountReporter(MonthlyReporter): - def report(self, report_yearmonth): - target_month = report_yearmonth.target_month() - next_month = report_yearmonth.next_month() + def report(self, **report_kwargs): + assert not report_kwargs + target_month = self.yearmonth.month_start() + next_month = self.yearmonth.month_end() - report = SpamSummaryReport( - report_yearmonth=str(report_yearmonth), + return SpamSummaryReport( # Node Log entries node_confirmed_spam=NodeLog.objects.filter( action=NodeLog.CONFIRM_SPAM, @@ -80,5 +79,3 @@ def report(self, report_yearmonth): created__lt=next_month, ).count() ) - - return [report] diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py index 609e79fc324..28ca6cdb964 100644 --- a/osf/metrics/reports.py +++ b/osf/metrics/reports.py @@ -1,3 +1,5 @@ +from __future__ import annotations +from collections import abc import datetime from django.dispatch import receiver @@ -20,10 +22,24 @@ class DailyReport(metrics.Metric): There's something we'd like to know about every so often, so let's regularly run a report and stash the results here. """ - DAILY_UNIQUE_FIELD = None # set in subclasses that expect multiple reports per day + UNIQUE_TOGETHER_FIELDS: tuple[str, ...] 
= ('report_date',) # override in subclasses for multiple reports per day report_date = metrics.Date(format='strict_date', required=True) + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + assert 'report_date' in cls.UNIQUE_TOGETHER_FIELDS, f'DailyReport subclasses must have "report_date" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' + + def save(self, *args, **kwargs): + if self.timestamp is None: + self.timestamp = datetime.datetime( + self.report_date.year, + self.report_date.month, + self.report_date.day, + tzinfo=datetime.UTC, + ) + super().save(*args, **kwargs) + class Meta: abstract = True dynamic = metrics.MetaField('strict') @@ -32,17 +48,19 @@ class Meta: class YearmonthField(metrics.Date): def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs, format='strict_year_month', required=True) + super().__init__(*args, **kwargs, format='strict_year_month') def deserialize(self, data): - if isinstance(data, YearMonth): - return data - elif isinstance(data, str): - return YearMonth.from_str(data) - elif isinstance(data, (datetime.datetime, datetime.date)): - return YearMonth.from_date(data) - else: - raise ValueError('unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth') + if isinstance(data, int): + # elasticsearch stores dates in milliseconds since the unix epoch + _as_datetime = datetime.datetime.fromtimestamp(data // 1000) + return YearMonth.from_date(_as_datetime) + elif data is None: + return None + try: + return YearMonth.from_any(data) + except ValueError: + raise ValueError(f'unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth') def serialize(self, data): if isinstance(data, str): @@ -51,6 +69,8 @@ def serialize(self, data): return str(data) elif isinstance(data, (datetime.datetime, datetime.date)): return str(YearMonth.from_date(data)) + elif data is None: + return None else: raise ValueError(f'unsure how to serialize "{data}" (of type {type(data)}) as YYYY-MM') @@ -58,34 +78,62 @@ def serialize(self, data): class MonthlyReport(metrics.Metric): """MonthlyReport (abstract base for report-based metrics that run monthly) """ + UNIQUE_TOGETHER_FIELDS: tuple[str, ...] 
= ('report_yearmonth',) # override in subclasses for multiple reports per month - report_yearmonth = YearmonthField() + report_yearmonth = YearmonthField(required=True) class Meta: abstract = True dynamic = metrics.MetaField('strict') source = metrics.MetaField(enabled=True) + @classmethod + def most_recent_yearmonth(cls, base_search=None) -> YearMonth | None: + _search = base_search or cls.search() + _search = _search.update_from_dict({'size': 0}) # omit hits + _search.aggs.bucket( + 'agg_most_recent_yearmonth', + 'terms', + field='report_yearmonth', + order={'_key': 'desc'}, + size=1, + ) + _response = _search.execute() + if not _response.aggregations: + return None + (_bucket,) = _response.aggregations.agg_most_recent_yearmonth.buckets + return _bucket.key + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + assert 'report_yearmonth' in cls.UNIQUE_TOGETHER_FIELDS, f'MonthlyReport subclasses must have "report_yearmonth" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' + + def save(self, *args, **kwargs): + if self.timestamp is None: + self.timestamp = YearMonth.from_any(self.report_yearmonth).month_start() + super().save(*args, **kwargs) + @receiver(metrics_pre_save) def set_report_id(sender, instance, **kwargs): - # Set the document id to a hash of "unique together" - # values (just `report_date` by default) to get - # "ON CONFLICT UPDATE" behavior -- if the document - # already exists, it will be updated rather than duplicated. - # Cannot detect/avoid conflicts this way, but that's ok. - - if issubclass(sender, DailyReport): - duf_name = instance.DAILY_UNIQUE_FIELD - if duf_name is None: - instance.meta.id = stable_key(instance.report_date) - else: - duf_value = getattr(instance, duf_name) - if not duf_value or not isinstance(duf_value, str): - raise ReportInvalid(f'{sender.__name__}.{duf_name} MUST have a non-empty string value (got {duf_value})') - instance.meta.id = stable_key(instance.report_date, duf_value) - elif issubclass(sender, MonthlyReport): - instance.meta.id = stable_key(instance.report_yearmonth) + try: + _unique_together_fields = instance.UNIQUE_TOGETHER_FIELDS + except AttributeError: + pass + else: + # Set the document id to a hash of "unique together" fields + # for "ON CONFLICT UPDATE" behavior -- if the document + # already exists, it will be updated rather than duplicated. + # Cannot detect/avoid conflicts this way, but that's ok. 
+ _key_values = [] + for _field_name in _unique_together_fields: + _field_value = getattr(instance, _field_name) + if not _field_value or ( + isinstance(_field_value, abc.Iterable) and not isinstance(_field_value, str) + ): + raise ReportInvalid(f'because "{_field_name}" is in {sender.__name__}.UNIQUE_TOGETHER_FIELDS, {sender.__name__}.{_field_name} MUST have a non-empty scalar value (got {_field_value} of type {type(_field_value)})') + _key_values.append(_field_value) + instance.meta.id = stable_key(*_key_values) #### BEGIN reusable inner objects ##### @@ -157,7 +205,7 @@ class DownloadCountReport(DailyReport): class InstitutionSummaryReport(DailyReport): - DAILY_UNIQUE_FIELD = 'institution_id' + UNIQUE_TOGETHER_FIELDS = ('report_date', 'institution_id',) institution_id = metrics.Keyword() institution_name = metrics.Keyword() @@ -169,7 +217,7 @@ class InstitutionSummaryReport(DailyReport): class NewUserDomainReport(DailyReport): - DAILY_UNIQUE_FIELD = 'domain_name' + UNIQUE_TOGETHER_FIELDS = ('report_date', 'domain_name',) domain_name = metrics.Keyword() new_user_count = metrics.Integer() @@ -187,7 +235,7 @@ class OsfstorageFileCountReport(DailyReport): class PreprintSummaryReport(DailyReport): - DAILY_UNIQUE_FIELD = 'provider_key' + UNIQUE_TOGETHER_FIELDS = ('report_date', 'provider_key',) provider_key = metrics.Keyword() preprint_count = metrics.Integer() @@ -214,3 +262,86 @@ class SpamSummaryReport(MonthlyReport): preprint_flagged = metrics.Integer() user_marked_as_spam = metrics.Integer() user_marked_as_ham = metrics.Integer() + + +class InstitutionalUserReport(MonthlyReport): + UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', 'user_id',) + institution_id = metrics.Keyword() + # user info: + user_id = metrics.Keyword() + user_name = metrics.Keyword() + department_name = metrics.Keyword() + month_last_login = YearmonthField() + month_last_active = YearmonthField() + account_creation_date = YearmonthField() + orcid_id = metrics.Keyword() + # counts: + public_project_count = metrics.Integer() + private_project_count = metrics.Integer() + public_registration_count = metrics.Integer() + embargoed_registration_count = metrics.Integer() + published_preprint_count = metrics.Integer() + public_file_count = metrics.Long() + storage_byte_count = metrics.Long() + + +class InstitutionMonthlySummaryReport(MonthlyReport): + UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', ) + institution_id = metrics.Keyword() + user_count = metrics.Integer() + public_project_count = metrics.Integer() + private_project_count = metrics.Integer() + public_registration_count = metrics.Integer() + embargoed_registration_count = metrics.Integer() + published_preprint_count = metrics.Integer() + storage_byte_count = metrics.Long() + public_file_count = metrics.Long() + monthly_logged_in_user_count = metrics.Long() + monthly_active_user_count = metrics.Long() + + +class PublicItemUsageReport(MonthlyReport): + UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'item_osfid') + + # where noted, fields are meant to correspond to defined terms from COUNTER + # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html + # https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html + item_osfid = metrics.Keyword() # counter:Item (or Dataset) + item_type = metrics.Keyword(multi=True) # counter:Data-Type + provider_id = metrics.Keyword(multi=True) # counter:Database(?) 
+ platform_iri = metrics.Keyword(multi=True) # counter:Platform + + # view counts include views on components or files contained by this item + view_count = metrics.Long() # counter:Total Investigations + view_session_count = metrics.Long() # counter:Unique Investigations + + # download counts of this item only (not including contained components or files) + download_count = metrics.Long() # counter:Total Requests + download_session_count = metrics.Long() # counter:Unique Requests + + @classmethod + def for_last_month(cls, item_osfid: str) -> PublicItemUsageReport | None: + _search = ( + PublicItemUsageReport.search() + .filter('term', item_osfid=item_osfid) + # only last month's report + .filter('range', report_yearmonth={ + 'gte': 'now-2M/M', + 'lt': 'now/M', + }) + .sort('-report_yearmonth') + [:1] + ) + _response = _search.execute() + return _response[0] if _response else None + + +class PrivateSpamMetricsReport(MonthlyReport): + node_oopspam_flagged = metrics.Integer() + node_oopspam_hammed = metrics.Integer() + node_akismet_flagged = metrics.Integer() + node_akismet_hammed = metrics.Integer() + preprint_oopspam_flagged = metrics.Integer() + preprint_oopspam_hammed = metrics.Integer() + preprint_akismet_flagged = metrics.Integer() + preprint_akismet_hammed = metrics.Integer() diff --git a/osf/metrics/utils.py b/osf/metrics/utils.py index 5ea397fef39..973b8bf1ef3 100644 --- a/osf/metrics/utils.py +++ b/osf/metrics/utils.py @@ -1,9 +1,10 @@ +from __future__ import annotations +import calendar +import dataclasses import re import datetime -import typing from hashlib import sha256 - -import pytz +from typing import ClassVar def stable_key(*key_parts): @@ -20,19 +21,22 @@ def stable_key(*key_parts): return sha256(bytes(plain_key, encoding='utf')).hexdigest() -class YearMonth(typing.NamedTuple): +@dataclasses.dataclass(frozen=True) +class YearMonth: + """YearMonth: represents a specific month in a specific year""" year: int month: int - YEARMONTH_RE = re.compile(r'(?P\d{4})-(?P\d{2})') + YEARMONTH_RE: ClassVar[re.Pattern] = re.compile(r'(?P\d{4})-(?P\d{2})') @classmethod - def from_date(cls, date): - assert isinstance(date, (datetime.datetime, datetime.date)) + def from_date(cls, date: datetime.date) -> YearMonth: + """construct a YearMonth from a `datetime.date` (or `datetime.datetime`)""" return cls(date.year, date.month) @classmethod - def from_str(cls, input_str): + def from_str(cls, input_str: str) -> YearMonth: + """construct a YearMonth from a string in "YYYY-MM" format""" match = cls.YEARMONTH_RE.fullmatch(input_str) if match: return cls( @@ -42,13 +46,40 @@ def from_str(cls, input_str): else: raise ValueError(f'expected YYYY-MM format, got "{input_str}"') + @classmethod + def from_any(cls, data) -> YearMonth: + if isinstance(data, YearMonth): + return data + elif isinstance(data, str): + return YearMonth.from_str(data) + elif isinstance(data, (datetime.datetime, datetime.date)): + return YearMonth.from_date(data) + raise ValueError(f'cannot coerce {data} into YearMonth') + def __str__(self): + """convert to string of "YYYY-MM" format""" return f'{self.year}-{self.month:0>2}' - def target_month(self): - return datetime.datetime(self.year, self.month, 1, tzinfo=pytz.utc) + def next(self) -> YearMonth: + """get a new YearMonth for the month after this one""" + return ( + YearMonth(self.year + 1, int(calendar.JANUARY)) + if self.month == calendar.DECEMBER + else YearMonth(self.year, self.month + 1) + ) + + def prior(self) -> YearMonth: + """get a new YearMonth for the month before this 
one""" + return ( + YearMonth(self.year - 1, int(calendar.DECEMBER)) + if self.month == calendar.JANUARY + else YearMonth(self.year, self.month - 1) + ) + + def month_start(self) -> datetime.datetime: + """get a datetime (in UTC timezone) when this YearMonth starts""" + return datetime.datetime(self.year, self.month, 1, tzinfo=datetime.UTC) - def next_month(self): - if self.month == 12: - return datetime.datetime(self.year + 1, 1, 1, tzinfo=pytz.utc) - return datetime.datetime(self.year, self.month + 1, 1, tzinfo=pytz.utc) + def month_end(self) -> datetime.datetime: + """get a datetime (in UTC timezone) when this YearMonth ends (the start of next month)""" + return self.next().month_start() diff --git a/osf/migrations/0024_institution_link_to_external_reports_archive.py b/osf/migrations/0024_institution_link_to_external_reports_archive.py new file mode 100644 index 00000000000..8e1a47fcffb --- /dev/null +++ b/osf/migrations/0024_institution_link_to_external_reports_archive.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.15 on 2024-08-16 15:21 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('osf', '0023_preprint_affiliated_institutions'), + ] + + operations = [ + migrations.AddField( + model_name='institution', + name='link_to_external_reports_archive', + field=models.URLField(blank=True, default='', help_text='Full URL where institutional admins can access archived metrics reports.', max_length=2048), + ), + ] diff --git a/osf/models/institution.py b/osf/models/institution.py index 0c3a9780ac2..d0ce38eacf4 100644 --- a/osf/models/institution.py +++ b/osf/models/institution.py @@ -118,6 +118,12 @@ class Institution(DirtyFieldsMixin, Loggable, ObjectIDMixin, BaseModel, Guardian blank=True, help_text='The full domain this institutions that will appear in DOI metadata.' ) + link_to_external_reports_archive = models.URLField( + max_length=2048, + blank=True, + default='', + help_text='Full URL where institutional admins can access archived metrics reports.', + ) class Meta: # custom permissions for use in the OSF Admin App diff --git a/osf/models/node.py b/osf/models/node.py index 74c02c4c555..858cefbc2c4 100644 --- a/osf/models/node.py +++ b/osf/models/node.py @@ -84,7 +84,6 @@ from api.caching.tasks import update_storage_usage from api.caching import settings as cache_settings from api.caching.utils import storage_usage_cache -from api.share.utils import update_share logger = logging.getLogger(__name__) @@ -723,6 +722,7 @@ def should_request_identifiers(self): @classmethod def bulk_update_search(cls, nodes, index=None): + from api.share.utils import update_share for _node in nodes: update_share(_node) from website import search @@ -734,6 +734,7 @@ def bulk_update_search(cls, nodes, index=None): log_exception(e) def update_search(self): + from api.share.utils import update_share update_share(self) from website import search try: diff --git a/osf/models/preprint.py b/osf/models/preprint.py index 467938e68af..40963b74e67 100644 --- a/osf/models/preprint.py +++ b/osf/models/preprint.py @@ -1000,6 +1000,9 @@ def update_has_coi(self, auth: Auth, has_coi: bool, log: bool = True, save: bool This method brought to you via a grant from the Alfred P Sloan Foundation. 
""" + if has_coi is None: + has_coi = False + if self.has_coi == has_coi: return @@ -1032,17 +1035,14 @@ def update_conflict_of_interest_statement(self, auth: Auth, coi_statement: str, if self.conflict_of_interest_statement == coi_statement: return - if not self.has_coi: - raise PreprintStateError('You do not have the ability to edit a conflict of interest while the has_coi field is ' - 'set to false or unanswered') - - self.conflict_of_interest_statement = coi_statement + self.conflict_of_interest_statement = coi_statement or '' if log: self.add_log( action=PreprintLog.UPDATE_COI_STATEMENT, params={ 'user': auth.user._id, + 'value': self.conflict_of_interest_statement }, auth=auth, ) @@ -1065,6 +1065,9 @@ def update_has_data_links(self, auth: Auth, has_data_links: bool, log: bool = Tr if self.has_data_links == has_data_links: return + if has_data_links == 'no': + self.data_links = [] + self.has_data_links = has_data_links if log: @@ -1076,7 +1079,7 @@ def update_has_data_links(self, auth: Auth, has_data_links: bool, log: bool = Tr }, auth=auth ) - if has_data_links != 'available': + if not has_data_links: self.update_data_links(auth, data_links=[], log=False) if save: self.save() @@ -1097,9 +1100,8 @@ def update_data_links(self, auth: Auth, data_links: list, log: bool = True, save if self.data_links == data_links: return - if not self.has_data_links == 'available' and data_links: - raise PreprintStateError('You cannot edit this statement while your data links availability is set to false' - ' or is unanswered.') + if not self.has_data_links and data_links: + self.data_links = [] self.data_links = data_links @@ -1130,11 +1132,10 @@ def update_why_no_data(self, auth: Auth, why_no_data: str, log: bool = True, sav if self.why_no_data == why_no_data: return - if not self.has_data_links == 'no': - raise PreprintStateError('You cannot edit this statement while your data links availability is set to true or' - ' is unanswered.') - else: - self.why_no_data = why_no_data + if self.has_data_links: + self.why_no_data = '' + + self.why_no_data = why_no_data if log: self.add_log( @@ -1163,6 +1164,10 @@ def update_has_prereg_links(self, auth: Auth, has_prereg_links: bool, log: bool if has_prereg_links == self.has_prereg_links: return + if has_prereg_links == 'no': + self.prereg_links = [] + self.prereg_link_info = None + self.has_prereg_links = has_prereg_links if log: @@ -1174,7 +1179,7 @@ def update_has_prereg_links(self, auth: Auth, has_prereg_links: bool, log: bool }, auth=auth ) - if has_prereg_links != 'available': + if not has_prereg_links: self.update_prereg_links(auth, prereg_links=[], log=False) self.update_prereg_link_info(auth, prereg_link_info=None, log=False) if save: @@ -1196,9 +1201,8 @@ def update_why_no_prereg(self, auth: Auth, why_no_prereg: str, log: bool = True, if why_no_prereg == self.why_no_prereg: return - if self.has_prereg_links == 'available' or self.has_prereg_links is None: - raise PreprintStateError('You cannot edit this statement while your prereg links ' - 'availability is set to true or is unanswered.') + if self.has_prereg_links or self.has_prereg_links is None: + self.why_no_prereg = '' self.why_no_prereg = why_no_prereg @@ -1229,9 +1233,8 @@ def update_prereg_links(self, auth: Auth, prereg_links: list, log: bool = True, if prereg_links == self.prereg_links: return - if not self.has_prereg_links == 'available' and prereg_links: - raise PreprintStateError('You cannot edit this field while your prereg links' - ' availability is set to false or is unanswered.') + if not 
self.has_prereg_links and prereg_links: + self.prereg_links = [] self.prereg_links = prereg_links @@ -1263,9 +1266,8 @@ def update_prereg_link_info(self, auth: Auth, prereg_link_info: str, log: bool = if self.prereg_link_info == prereg_link_info: return - if not self.has_prereg_links == 'available' and prereg_link_info: - raise PreprintStateError('You cannot edit this field while your prereg links' - ' availability is set to false or is unanswered.') + if not self.has_prereg_links and prereg_link_info: + self.prereg_link_info = None self.prereg_link_info = prereg_link_info diff --git a/osf/models/user.py b/osf/models/user.py index 4626beb3f60..512def3a417 100644 --- a/osf/models/user.py +++ b/osf/models/user.py @@ -34,7 +34,6 @@ MergeConflictError) from framework.exceptions import PermissionsError from framework.sessions.utils import remove_sessions_for_user -from api.share.utils import update_share from osf.external.gravy_valet import ( request_helpers as gv_requests, translations as gv_translations, @@ -1231,12 +1230,12 @@ def update_guessed_names(self): self.family_name = parsed['family'] self.suffix = parsed['suffix'] - def add_unconfirmed_email(self, email, expiration=None, external_identity=None): + def add_unconfirmed_email(self, email, expiration=None, external_identity=None, force=False): """ Add an email verification token for a given email. :param email: the email to confirm - :param email: overwrite default expiration time + :param expiration: overwrite default expiration time :param external_identity: the user's external identity :return: a token :raises: ValueError if email already confirmed, except for login through external idp. @@ -1253,7 +1252,8 @@ def add_unconfirmed_email(self, email, expiration=None, external_identity=None): validate_email(email) if not external_identity and self.emails.filter(address=email).exists(): - raise ValueError('Email already confirmed to this user.') + if not force or self.is_confirmed: + raise ValueError('Email already confirmed to this user.') # If the unconfirmed email is already present, refresh the token if email in self.unconfirmed_emails: @@ -1308,14 +1308,14 @@ def get_confirmation_token(self, email, force=False, renew=False): # assume the token is expired expiration = info.get('expiration') if renew: - new_token = self.add_unconfirmed_email(email) + new_token = self.add_unconfirmed_email(email, force=force) self.save() return new_token if not expiration or (expiration and expiration < timezone.now()): if not force: raise ExpiredTokenError(f'Token for email "{email}" is expired') else: - new_token = self.add_unconfirmed_email(email) + new_token = self.add_unconfirmed_email(email, force=force) self.save() return new_token return token @@ -1346,6 +1346,23 @@ def get_confirmation_url(self, email, destination = '?{}'.format(urlencode({'destination': destination})) if destination else '' return f'{base}confirm/{external}{self._primary_key}/{token}/{destination}' + def get_or_create_confirmation_url(self, email, force=False, renew=False): + """ + Get or create a confirmation URL for the given email. + + :param email: The email to generate a confirmation URL for. + :param force: Force generating a new confirmation link. + :param renew: Renew an expired token. + :raises ValidationError: If email is invalid or domain is banned. + :return: Confirmation URL for the email. 
+ """ + try: + self.get_confirmation_token(email, force=force, renew=renew) + except KeyError: + self.add_unconfirmed_email(email, force=force) + self.save() + return self.get_confirmation_url(email) + def register(self, username, password=None, accepted_terms_of_service=None): """Registers the user. """ @@ -1455,6 +1472,7 @@ def is_assumed_ham(self): return user_has_trusted_email def update_search(self): + from api.share.utils import update_share update_share(self) from website.search.search import update_user update_user(self) diff --git a/osf_tests/external/akismet/test_akismet.py b/osf_tests/external/akismet/test_akismet.py index db3c5d0d584..46729e485e8 100644 --- a/osf_tests/external/akismet/test_akismet.py +++ b/osf_tests/external/akismet/test_akismet.py @@ -237,3 +237,39 @@ def test_meetings_skip_spam_check(self, mock_akismet, user, node_in_conference, node.check_spam(user, {'title'}, request_headers) node.refresh_from_db() assert node.spam_status == SpamStatus.FLAGGED + + @mock.patch('osf.models.NodeLog.objects.filter') + def test_get_flagged_count(self, mock_filter, user): + from osf.external.askismet.client import AkismetClient + from datetime import datetime + + client = AkismetClient() + start_date = datetime(2024, 10, 1) + end_date = datetime(2024, 10, 31) + + client.get_flagged_count(start_date, end_date) + + mock_filter.assert_called_with( + action='flag_spam', + created__gt=start_date, + created__lt=end_date, + node__spam_data__who_flagged__in=['akismet', 'both'] + ) + + @mock.patch('osf.models.NodeLog.objects.filter') + def test_get_hammed_count(self, mock_filter, user): + from osf.external.askismet.client import AkismetClient + from datetime import datetime + + client = AkismetClient() + start_date = datetime(2024, 10, 1) + end_date = datetime(2024, 10, 31) + + client.get_hammed_count(start_date, end_date) + + mock_filter.assert_called_with( + action='confirm_ham', + created__gt=start_date, + created__lt=end_date, + node__spam_data__who_flagged__in=['akismet', 'both'] + ) diff --git a/osf_tests/external/oopspam/test_oopspam.py b/osf_tests/external/oopspam/test_oopspam.py index 36740148116..96656ecc6da 100644 --- a/osf_tests/external/oopspam/test_oopspam.py +++ b/osf_tests/external/oopspam/test_oopspam.py @@ -125,3 +125,39 @@ def test_do_spam_check_false(self, mock_oopspam, user, request_headers): ) assert user.spam_status == SpamStatus.UNKNOWN + + @mock.patch('osf.models.NodeLog.objects.filter') + def test_get_flagged_count(self, mock_filter, user): + from osf.external.oopspam.client import OOPSpamClient + from datetime import datetime + + client = OOPSpamClient() + start_date = datetime(2024, 10, 1) + end_date = datetime(2024, 10, 31) + + client.get_flagged_count(start_date, end_date) + + mock_filter.assert_called_with( + action='flag_spam', + created__gt=start_date, + created__lt=end_date, + node__spam_data__who_flagged__in=['oopspam', 'both'] + ) + + @mock.patch('osf.models.NodeLog.objects.filter') + def test_get_hammed_count(self, mock_filter, user): + from osf.external.oopspam.client import OOPSpamClient + from datetime import datetime + + client = OOPSpamClient() + start_date = datetime(2024, 10, 1) + end_date = datetime(2024, 10, 31) + + client.get_hammed_count(start_date, end_date) + + mock_filter.assert_called_with( + action='confirm_ham', + created__gt=start_date, + created__lt=end_date, + node__spam_data__who_flagged__in=['oopspam', 'both'] + ) diff --git a/osf_tests/factories.py b/osf_tests/factories.py index 860dd967e5e..0bd1664977d 100644 --- 
a/osf_tests/factories.py +++ b/osf_tests/factories.py @@ -188,7 +188,7 @@ class BaseNodeFactory(DjangoModelFactory): title = factory.Faker('catch_phrase') description = factory.Faker('sentence') created = factory.LazyFunction(timezone.now) - creator = factory.SubFactory(AuthUserFactory) + creator = factory.LazyAttribute(lambda o: AuthUserFactory()) class Meta: model = models.Node diff --git a/osf_tests/management_commands/test_email_all_users.py b/osf_tests/management_commands/test_email_all_users.py index 3392e77a470..c10c84b49d1 100644 --- a/osf_tests/management_commands/test_email_all_users.py +++ b/osf_tests/management_commands/test_email_all_users.py @@ -49,7 +49,7 @@ def test_email_all_users_dry(self, mock_email, superuser): mock_email.assert_called_with( to_addr=superuser.email, mail=mails.TOU_NOTIF, - fullname=superuser.fullname + given_name=superuser.given_name ) @pytest.mark.django_db @@ -64,10 +64,10 @@ def test_dont_email_inactive_users( @pytest.mark.django_db @mock.patch('website.mails.send_mail') def test_email_all_users_offset(self, mock_email, user, user2): - email_all_users('TOU_NOTIF', offset=1, run=0) + email_all_users('TOU_NOTIF', offset=1, start_id=0) - email_all_users('TOU_NOTIF', offset=1, run=1) + email_all_users('TOU_NOTIF', offset=1, start_id=1) - email_all_users('TOU_NOTIF', offset=1, run=2) + email_all_users('TOU_NOTIF', offset=1, start_id=2) assert mock_email.call_count == 2 diff --git a/osf_tests/management_commands/test_migrate_preprint_affiliations.py b/osf_tests/management_commands/test_migrate_preprint_affiliations.py new file mode 100644 index 00000000000..8c80737b3dd --- /dev/null +++ b/osf_tests/management_commands/test_migrate_preprint_affiliations.py @@ -0,0 +1,151 @@ +import pytest +from datetime import timedelta +from osf.management.commands.migrate_preprint_affiliation import AFFILIATION_TARGET_DATE, assign_affiliations_to_preprints +from osf_tests.factories import ( + PreprintFactory, + InstitutionFactory, + AuthUserFactory, +) + + +@pytest.mark.django_db +class TestAssignAffiliationsToPreprints: + + @pytest.fixture() + def institution(self): + return InstitutionFactory() + + @pytest.fixture() + def user_with_affiliation(self, institution): + user = AuthUserFactory() + user.add_or_update_affiliated_institution(institution) + user.save() + return user + + @pytest.fixture() + def user_without_affiliation(self): + return AuthUserFactory() + + @pytest.fixture() + def preprint_with_affiliated_contributor(self, user_with_affiliation): + preprint = PreprintFactory() + preprint.add_contributor( + user_with_affiliation, + permissions='admin', + visible=True + ) + preprint.created = AFFILIATION_TARGET_DATE - timedelta(days=1) + preprint.save() + return preprint + + @pytest.fixture() + def preprint_with_non_affiliated_contributor(self, user_without_affiliation): + preprint = PreprintFactory() + preprint.add_contributor( + user_without_affiliation, + permissions='admin', + visible=True + ) + preprint.created = AFFILIATION_TARGET_DATE - timedelta(days=1) + preprint.save() + return preprint + + @pytest.fixture() + def preprint_past_target_date_with_affiliated_contributor(self, user_with_affiliation): + preprint = PreprintFactory() + preprint.add_contributor( + user_with_affiliation, + permissions='admin', + visible=True + ) + preprint.created = AFFILIATION_TARGET_DATE + timedelta(days=1) + preprint.save() + return preprint + + @pytest.mark.parametrize('dry_run', [True, False]) + def test_assign_affiliations_with_affiliated_contributor(self, 
preprint_with_affiliated_contributor, institution, dry_run): + preprint = preprint_with_affiliated_contributor + preprint.affiliated_institutions.clear() + preprint.save() + + assign_affiliations_to_preprints(dry_run=dry_run) + + if dry_run: + assert not preprint.affiliated_institutions.exists() + else: + assert institution in preprint.affiliated_institutions.all() + + @pytest.mark.parametrize('dry_run', [True, False]) + def test_no_affiliations_for_non_affiliated_contributor(self, preprint_with_non_affiliated_contributor, dry_run): + preprint = preprint_with_non_affiliated_contributor + preprint.affiliated_institutions.clear() + preprint.save() + + assign_affiliations_to_preprints(dry_run=dry_run) + + assert not preprint.affiliated_institutions.exists() + + @pytest.mark.parametrize('dry_run', [True, False]) + def test_exclude_contributor_by_guid(self, preprint_with_affiliated_contributor, user_with_affiliation, institution, dry_run): + preprint = preprint_with_affiliated_contributor + preprint.affiliated_institutions.clear() + preprint.save() + + assert user_with_affiliation.get_affiliated_institutions() + assert user_with_affiliation in preprint.contributors.all() + exclude_guids = {user._id for user in preprint.contributors.all()} + + assign_affiliations_to_preprints(exclude_guids=exclude_guids, dry_run=dry_run) + + assert not preprint.affiliated_institutions.exists() + + @pytest.mark.parametrize('dry_run', [True, False]) + def test_affiliations_from_multiple_contributors(self, institution, dry_run): + institution_not_include = InstitutionFactory() + read_contrib = AuthUserFactory() + read_contrib.add_or_update_affiliated_institution(institution_not_include) + read_contrib.save() + + write_contrib = AuthUserFactory() + write_contrib.add_or_update_affiliated_institution(institution) + write_contrib.save() + + admin_contrib = AuthUserFactory() + institution2 = InstitutionFactory() + admin_contrib.add_or_update_affiliated_institution(institution2) + admin_contrib.save() + + preprint = PreprintFactory() + preprint.affiliated_institutions.clear() + preprint.created = AFFILIATION_TARGET_DATE - timedelta(days=1) + preprint.add_contributor(read_contrib, permissions='read', visible=True) + preprint.add_contributor(write_contrib, permissions='write', visible=True) + preprint.add_contributor(admin_contrib, permissions='admin', visible=True) + preprint.save() + + assign_affiliations_to_preprints(dry_run=dry_run) + + if dry_run: + assert not preprint.affiliated_institutions.exists() + else: + affiliations = set(preprint.affiliated_institutions.all()) + assert affiliations == {institution, institution2} + assert institution_not_include not in affiliations + + @pytest.mark.parametrize('dry_run', [True, False]) + def test_exclude_recent_preprints(self, preprint_past_target_date_with_affiliated_contributor, preprint_with_affiliated_contributor, institution, dry_run): + new_preprint = preprint_past_target_date_with_affiliated_contributor + new_preprint.affiliated_institutions.clear() + new_preprint.save() + + old_preprint = preprint_with_affiliated_contributor + old_preprint.affiliated_institutions.clear() + old_preprint.save() + + assign_affiliations_to_preprints(dry_run=dry_run) + + assert not new_preprint.affiliated_institutions.exists() + if dry_run: + assert not old_preprint.affiliated_institutions.exists() + else: + assert institution in old_preprint.affiliated_institutions.all() diff --git a/osf_tests/management_commands/test_recatalog_metadata.py 
b/osf_tests/management_commands/test_recatalog_metadata.py index 85742b76094..550f06e4d13 100644 --- a/osf_tests/management_commands/test_recatalog_metadata.py +++ b/osf_tests/management_commands/test_recatalog_metadata.py @@ -1,17 +1,17 @@ +import datetime import pytest from unittest import mock from operator import attrgetter -import random from django.core.management import call_command -from osf.models.metadata import GuidMetadataRecord from osf_tests.factories import ( PreprintProviderFactory, PreprintFactory, ProjectFactory, RegistrationProviderFactory, RegistrationFactory, + UserFactory, ) @@ -41,18 +41,15 @@ def registration_provider(self): @pytest.fixture def registrations(self, registration_provider): return sorted_by_id([ - RegistrationFactory(provider=registration_provider) + RegistrationFactory(provider=registration_provider, is_public=True) for _ in range(7) ]) @pytest.fixture def projects(self, registrations): return sorted_by_id([ - ProjectFactory() + ProjectFactory(is_public=True) for _ in range(7) - ] + [ - registration.registered_from - for registration in registrations ]) @pytest.fixture @@ -79,19 +76,21 @@ def users(self, preprints, registrations, projects): ]))) @pytest.fixture - def items_with_custom_datacite_type(self, preprints, registrations, projects, files): - _nonpreprint_sample = [ - random.choice(_items) - for _items in (registrations, projects, files) + def decatalog_items(self, registrations): + _user = UserFactory(allow_indexing=False) + _registration = RegistrationFactory(is_public=False, creator=_user) + _implicit_projects = [ + _registration.registered_from, + *(_reg.registered_from for _reg in registrations), + ] + return [ + _user, + _registration, + *_implicit_projects, + PreprintFactory(is_published=False, creator=_user), + ProjectFactory(is_public=False, creator=_user), + ProjectFactory(deleted=datetime.datetime.now(), creator=_user), ] - for _item in _nonpreprint_sample: - _guid_record = GuidMetadataRecord.objects.for_guid(_item) - _guid_record.resource_type_general = 'BookChapter' # datacite resourceTypeGeneral value - _guid_record.save() - return { - *preprints, # every preprint has datacite type "Preprint" - *_nonpreprint_sample, - } def test_recatalog_metadata( self, @@ -103,8 +102,14 @@ def test_recatalog_metadata( projects, files, users, - items_with_custom_datacite_type, + decatalog_items, ): + def _actual_osfids() -> set[str]: + return { + _call[-1]['kwargs']['guid'] + for _call in mock_update_share_task.apply_async.mock_calls + } + # test preprints call_command( 'recatalog_metadata', @@ -183,17 +188,24 @@ def test_recatalog_metadata( mock_update_share_task.reset_mock() - # datacite custom types + # all types + _all_public_items = [*preprints, *registrations, *projects, *files, *users] + call_command( + 'recatalog_metadata', + '--all-types', + ) + _expected_osfids = set(_iter_osfids(_all_public_items)) + assert _expected_osfids == _actual_osfids() + + # also decatalog private/deleted items + _all_items = [*_all_public_items, *decatalog_items] call_command( 'recatalog_metadata', - '--datacite-custom-types', + '--all-types', + '--also-decatalog', ) - _expected_osfids = set(_iter_osfids(items_with_custom_datacite_type)) - _actual_osfids = { - _call[-1]['kwargs']['guid'] - for _call in mock_update_share_task.apply_async.mock_calls - } - assert _expected_osfids == _actual_osfids + _expected_osfids = set(_iter_osfids(_all_items)) + assert _expected_osfids == _actual_osfids() ### diff --git a/osf_tests/metadata/_utils.py 
b/osf_tests/metadata/_utils.py index df5ed2b7ac7..fb23bdb16c5 100644 --- a/osf_tests/metadata/_utils.py +++ b/osf_tests/metadata/_utils.py @@ -3,23 +3,23 @@ from osf.metadata import gather from osf.metadata.rdfutils import contextualized_graph -def assert_triples(actual_triples, expected_triples): +def assert_triples(actual_triples, expected_triples, label=''): _expected_graph, _expected_focuses = _get_graph_and_focuses(expected_triples) _actual_graph, _actual_focuses = _get_graph_and_focuses(actual_triples) - assert_graphs_equal(_actual_graph, _expected_graph) + assert_graphs_equal(_actual_graph, _expected_graph, label=label) assert _expected_focuses == _actual_focuses -def assert_graphs_equal(actual_rdflib_graph, expected_rdflib_graph): +def assert_graphs_equal(actual_rdflib_graph, expected_rdflib_graph, label=''): (_overlap, _expected_but_absent, _unexpected_but_present) = rdflib.compare.graph_diff( expected_rdflib_graph, actual_rdflib_graph, ) assert not _expected_but_absent and not _unexpected_but_present, '\n\t'.join(( - 'unequal triple-sets!', + (f'unequal triplesets for "{label}"!' if label else 'unequal triple-sets!'), f'overlap size: {len(_overlap)}', - f'expected (but absent): {_friendly_graph(_expected_but_absent)}', - f'unexpected (but present): {_friendly_graph(_unexpected_but_present)}', + f'expected (but absent): {_indented_graph(_expected_but_absent)}', + f'unexpected (but present): {_indented_graph(_unexpected_but_present)}', )) @@ -35,10 +35,9 @@ def _get_graph_and_focuses(triples): return _graph, _focuses -def _friendly_graph(rdfgraph) -> str: +def _indented_graph(rdfgraph) -> str: _graph_to_print = contextualized_graph(rdfgraph) _delim = '\n\t\t' return _delim + _delim.join( - ' '.join(_term.n3() for _term in triple) - for triple in _graph_to_print + _graph_to_print.serialize(format='turtle').strip().split('\n') ) diff --git a/osf_tests/metadata/expected_metadata_files/file_basic.turtle b/osf_tests/metadata/expected_metadata_files/file_basic.turtle index 14a78c46c88..3f430b22521 100644 --- a/osf_tests/metadata/expected_metadata_files/file_basic.turtle +++ b/osf_tests/metadata/expected_metadata_files/file_basic.turtle @@ -3,6 +3,7 @@ @prefix foaf: . @prefix osf: . @prefix owl: . +@prefix skos: . a osf:File ; dcat:accessService ; @@ -33,7 +34,8 @@ dcterms:extent "0.000007 MB" ; dcterms:format "img/png" ; dcterms:modified "2123-05-04" ; - dcterms:requires ; + dcterms:requires ; + osf:storageRegion ; osf:versionNumber "1" . a dcterms:Agent, @@ -45,3 +47,5 @@ foaf:Organization ; dcterms:identifier "http://localhost:5000" ; foaf:name "OSF" . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/file_full.turtle b/osf_tests/metadata/expected_metadata_files/file_full.turtle index 37dd3c537f0..175ccfb042f 100644 --- a/osf_tests/metadata/expected_metadata_files/file_full.turtle +++ b/osf_tests/metadata/expected_metadata_files/file_full.turtle @@ -4,6 +4,7 @@ @prefix osf: . @prefix owl: . @prefix rdfs: . +@prefix skos: . a osf:File ; dcat:accessService ; @@ -39,7 +40,8 @@ dcterms:extent "0.000007 MB" ; dcterms:format "img/png" ; dcterms:modified "2123-05-04" ; - dcterms:requires ; + dcterms:requires ; + osf:storageRegion ; osf:versionNumber "1" . a osf:FundingAward ; @@ -76,3 +78,5 @@ foaf:name "OSF" . rdfs:label "Dataset"@en . + + skos:prefLabel "United States"@en . 
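Both file fixtures above now pin the file version to the default storage region and expect its skos:prefLabel. For reference, a minimal sketch (illustrative, not part of the patch) of how a gatherer test asserts that pair of triples, reusing assert_triples from osf_tests/metadata/_utils.py and the same default-region IRI that the gathering tests below build from website_settings.API_DOMAIN; check_storage_region is a placeholder helper name:

    # illustrative sketch: storage-region triples expected by the fixtures above
    from rdflib import Literal, URIRef

    from osf.metadata import osf_gathering
    from osf.metadata.rdfutils import OSF, SKOS
    from osf_tests.metadata._utils import assert_triples
    from website import settings as website_settings

    def check_storage_region(focus):
        # default region under the v2 API, e.g. .../v2/regions/us/
        region_iri = URIRef(f'{website_settings.API_DOMAIN}v2/regions/us/')
        assert_triples(osf_gathering.gather_storage_region(focus), {
            (focus.iri, OSF.storageRegion, region_iri),
            (region_iri, SKOS.prefLabel, Literal('United States', lang='en')),
        }, label='storage-region sketch')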
diff --git a/osf_tests/metadata/expected_metadata_files/file_monthly_supplement.turtle b/osf_tests/metadata/expected_metadata_files/file_monthly_supplement.turtle new file mode 100644 index 00000000000..845bd149f37 --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/file_monthly_supplement.turtle @@ -0,0 +1,13 @@ +@prefix dcat: . +@prefix dcterms: . +@prefix foaf: . +@prefix osf: . +@prefix xsd: . + + osf:usage [ dcterms:temporal "2123-05"^^xsd:gYearMonth ; + dcat:accessService ; + foaf:primaryTopic ; + osf:downloadCount 3 ; + osf:downloadSessionCount 2 ; + osf:viewCount 7 ; + osf:viewSessionCount 5 ] . diff --git a/osf_tests/metadata/expected_metadata_files/file_supplement.turtle b/osf_tests/metadata/expected_metadata_files/file_supplement.turtle new file mode 100644 index 00000000000..662c197699d --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/file_supplement.turtle @@ -0,0 +1 @@ +# correctly empty (for now) diff --git a/osf_tests/metadata/expected_metadata_files/preprint_basic.turtle b/osf_tests/metadata/expected_metadata_files/preprint_basic.turtle index f6db59e6e24..ee7e866827b 100644 --- a/osf_tests/metadata/expected_metadata_files/preprint_basic.turtle +++ b/osf_tests/metadata/expected_metadata_files/preprint_basic.turtle @@ -3,6 +3,7 @@ @prefix foaf: . @prefix osf: . @prefix owl: . +@prefix prov: . @prefix rdfs: . @prefix skos: . @@ -25,7 +26,9 @@ dcat:accessService ; osf:hostingInstitution ; osf:isSupplementedBy ; - osf:statedConflictOfInterest osf:no-conflict-of-interest . + osf:statedConflictOfInterest osf:no-conflict-of-interest ; + prov:qualifiedAttribution [ dcat:hadRole osf:admin-contributor ; + prov:agent ] . a dcterms:Agent, foaf:Organization ; diff --git a/osf_tests/metadata/expected_metadata_files/preprint_full.turtle b/osf_tests/metadata/expected_metadata_files/preprint_full.turtle index 93c69fa4e8c..cdf665fd5fe 100644 --- a/osf_tests/metadata/expected_metadata_files/preprint_full.turtle +++ b/osf_tests/metadata/expected_metadata_files/preprint_full.turtle @@ -3,6 +3,7 @@ @prefix foaf: . @prefix osf: . @prefix owl: . +@prefix prov: . @prefix rdfs: . @prefix skos: . @@ -25,7 +26,9 @@ dcat:accessService ; osf:hostingInstitution ; osf:isSupplementedBy ; - osf:statedConflictOfInterest osf:no-conflict-of-interest . + osf:statedConflictOfInterest osf:no-conflict-of-interest ; + prov:qualifiedAttribution [ dcat:hadRole osf:admin-contributor ; + prov:agent ] . a dcterms:Agent, foaf:Organization ; diff --git a/osf_tests/metadata/expected_metadata_files/preprint_monthly_supplement.turtle b/osf_tests/metadata/expected_metadata_files/preprint_monthly_supplement.turtle new file mode 100644 index 00000000000..8e6d6fb9331 --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/preprint_monthly_supplement.turtle @@ -0,0 +1,13 @@ +@prefix dcat: . +@prefix dcterms: . +@prefix foaf: . +@prefix osf: . +@prefix xsd: . + + osf:usage [ dcterms:temporal "2123-05"^^xsd:gYearMonth ; + dcat:accessService ; + foaf:primaryTopic ; + osf:downloadCount 3 ; + osf:downloadSessionCount 2 ; + osf:viewCount 7 ; + osf:viewSessionCount 5 ] . diff --git a/osf_tests/metadata/expected_metadata_files/preprint_supplement.turtle b/osf_tests/metadata/expected_metadata_files/preprint_supplement.turtle new file mode 100644 index 00000000000..9ff0732a509 --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/preprint_supplement.turtle @@ -0,0 +1,7 @@ +@prefix osf: . +@prefix skos: . + + osf:storageByteCount 1337 ; + osf:storageRegion . + + skos:prefLabel "United States"@en . 
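The preprint fixtures above add a prov:qualifiedAttribution blank node per visible contributor, pairing prov:agent with a dcat:hadRole term such as osf:admin-contributor. A minimal sketch (illustrative, not part of the patch) of asserting that shape for a focus with a single admin contributor, mirroring test_gather_qualified_attributions further down; blank-node identity only needs to be consistent within the expected set, since assert_triples compares graphs via rdflib.compare.graph_diff:

    # illustrative sketch: qualified-attribution shape for a single admin contributor
    import rdflib

    from osf.metadata import osf_gathering
    from osf.metadata.rdfutils import DCAT, OSF, PROV
    from osf_tests.metadata._utils import assert_triples

    def check_admin_attribution(focus, admin_agent_iri):
        # assumes the focus has exactly one (admin) contributor; real foci need
        # one blank node per contributor, as in test_gather_qualified_attributions
        attribution = rdflib.BNode()
        assert_triples(osf_gathering.gather_qualified_attributions(focus), {
            (focus.iri, PROV.qualifiedAttribution, attribution),
            (attribution, PROV.agent, admin_agent_iri),
            (attribution, DCAT.hadRole, OSF['admin-contributor']),
        }, label='qualified-attribution sketch')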
diff --git a/osf_tests/metadata/expected_metadata_files/project_basic.turtle b/osf_tests/metadata/expected_metadata_files/project_basic.turtle index c3846782273..aa8244da1fd 100644 --- a/osf_tests/metadata/expected_metadata_files/project_basic.turtle +++ b/osf_tests/metadata/expected_metadata_files/project_basic.turtle @@ -3,7 +3,9 @@ @prefix foaf: . @prefix osf: . @prefix owl: . +@prefix prov: . @prefix rdfs: . +@prefix skos: . a osf:Project ; dcterms:created "2123-05-04" ; @@ -23,7 +25,9 @@ dcat:accessService ; osf:contains ; osf:hostingInstitution ; - osf:supplements . + osf:supplements ; + prov:qualifiedAttribution [ dcat:hadRole osf:admin-contributor ; + prov:agent ] . a osf:Preprint ; dcterms:created "2123-05-04" ; @@ -53,8 +57,19 @@ dcterms:modified "2123-05-04" ; osf:fileName "my-file.blarg" ; osf:filePath "/my-file.blarg" ; + osf:hasFileVersion ; osf:isContainedBy . + a osf:FileVersion ; + dcterms:created "2123-05-04" ; + dcterms:creator ; + dcterms:extent "0.000007 MB" ; + dcterms:format "img/png" ; + dcterms:modified "2123-05-04" ; + dcterms:requires ; + osf:storageRegion ; + osf:versionNumber "1" . + a dcterms:Agent, foaf:Organization ; dcterms:identifier "https://cos.io/", @@ -85,3 +100,5 @@ rdfs:label "Preprint"@en . rdfs:label "StudyRegistration"@en . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/project_full.turtle b/osf_tests/metadata/expected_metadata_files/project_full.turtle index 6a84d141440..63946b2f80b 100644 --- a/osf_tests/metadata/expected_metadata_files/project_full.turtle +++ b/osf_tests/metadata/expected_metadata_files/project_full.turtle @@ -3,7 +3,9 @@ @prefix foaf: . @prefix osf: . @prefix owl: . +@prefix prov: . @prefix rdfs: . +@prefix skos: . a osf:Project ; dcterms:created "2123-05-04" ; @@ -29,7 +31,9 @@ osf:hasFunding , ; osf:hostingInstitution ; - osf:supplements . + osf:supplements ; + prov:qualifiedAttribution [ dcat:hadRole osf:admin-contributor ; + prov:agent ] . a osf:Preprint ; dcterms:created "2123-05-04" ; @@ -59,8 +63,19 @@ dcterms:modified "2123-05-04" ; osf:fileName "my-file.blarg" ; osf:filePath "/my-file.blarg" ; + osf:hasFileVersion ; osf:isContainedBy . + a osf:FileVersion ; + dcterms:created "2123-05-04" ; + dcterms:creator ; + dcterms:extent "0.000007 MB" ; + dcterms:format "img/png" ; + dcterms:modified "2123-05-04" ; + dcterms:requires ; + osf:storageRegion ; + osf:versionNumber "1" . + a osf:FundingAward ; dcterms:contributor ; dcterms:identifier "https://moneypockets.example/millions" ; @@ -116,3 +131,5 @@ rdfs:label "Dataset"@en . rdfs:label "StudyRegistration"@en . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/project_monthly_supplement.turtle b/osf_tests/metadata/expected_metadata_files/project_monthly_supplement.turtle new file mode 100644 index 00000000000..dd9c54b1f93 --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/project_monthly_supplement.turtle @@ -0,0 +1,13 @@ +@prefix dcat: . +@prefix dcterms: . +@prefix foaf: . +@prefix osf: . +@prefix xsd: . + + osf:usage [ dcterms:temporal "2123-05"^^xsd:gYearMonth ; + dcat:accessService ; + foaf:primaryTopic ; + osf:downloadCount 3 ; + osf:downloadSessionCount 2 ; + osf:viewCount 7 ; + osf:viewSessionCount 5 ] . 
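The monthly-supplement fixtures above report view count 7, view sessions 5, downloads 3, and download sessions 2; those numbers come from a patched PublicItemUsageReport.for_last_month rather than live metrics. A condensed sketch (illustrative, not part of the patch) of exercising that usage graph, following test_gather_last_month_usage and the setUp mock in test_serialized_metadata.py; check_last_month_usage and its arguments are placeholder names:

    # illustrative sketch: last-month usage gathered from a stubbed metrics report
    import datetime
    from unittest import mock

    import rdflib
    from rdflib import Literal

    from osf.metadata import osf_gathering
    from osf.metadata.rdfutils import DCAT, DCTERMS, FOAF, OSF
    from osf.metrics.reports import PublicItemUsageReport
    from osf.metrics.utils import YearMonth
    from osf_tests.metadata._utils import assert_triples
    from website import settings as website_settings

    def check_last_month_usage(focus, item_osfid):
        ym = YearMonth.from_date(datetime.datetime.now(tz=datetime.UTC))
        stub_report = PublicItemUsageReport(
            item_osfid=item_osfid,
            report_yearmonth=ym,
            view_count=7,
            view_session_count=5,
            download_count=3,
            download_session_count=2,
        )
        with mock.patch(
            'osf.metrics.reports.PublicItemUsageReport.for_last_month',
            return_value=stub_report,
        ):
            usage = rdflib.BNode()
            assert_triples(osf_gathering.gather_last_month_usage(focus), {
                (focus.iri, OSF.usage, usage),
                (usage, DCTERMS.temporal, Literal(str(ym), datatype=rdflib.XSD.gYearMonth)),
                (usage, DCAT.accessService, rdflib.URIRef(website_settings.DOMAIN.rstrip('/'))),
                (usage, FOAF.primaryTopic, focus.iri),
                (usage, OSF.viewCount, Literal(7)),
                (usage, OSF.viewSessionCount, Literal(5)),
                (usage, OSF.downloadCount, Literal(3)),
                (usage, OSF.downloadSessionCount, Literal(2)),
            }, label='last-month-usage sketch')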
diff --git a/osf_tests/metadata/expected_metadata_files/project_supplement.turtle b/osf_tests/metadata/expected_metadata_files/project_supplement.turtle new file mode 100644 index 00000000000..d055e97554f --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/project_supplement.turtle @@ -0,0 +1,13 @@ +@prefix dcterms: . +@prefix osf: . +@prefix skos: . + + osf:hasOsfAddon ; + osf:storageByteCount 7 ; + osf:storageRegion . + + a osf:AddonImplementation ; + dcterms:identifier "gitlab" ; + skos:prefLabel "GitLab" . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/registration_basic.turtle b/osf_tests/metadata/expected_metadata_files/registration_basic.turtle index eae4a92336c..9601477944f 100644 --- a/osf_tests/metadata/expected_metadata_files/registration_basic.turtle +++ b/osf_tests/metadata/expected_metadata_files/registration_basic.turtle @@ -3,7 +3,9 @@ @prefix foaf: . @prefix osf: . @prefix owl: . +@prefix prov: . @prefix rdfs: . +@prefix skos: . a osf:Registration ; dcterms:conformsTo ; @@ -21,7 +23,10 @@ dcterms:title "this is a project title!" ; dcterms:type ; dcat:accessService ; - osf:hostingInstitution . + osf:contains ; + osf:hostingInstitution ; + prov:qualifiedAttribution [ dcat:hadRole osf:admin-contributor ; + prov:agent ] . a osf:Project ; dcterms:created "2123-05-04" ; @@ -36,6 +41,25 @@ dcterms:title "this is a project title!" ; owl:sameAs . + a osf:File ; + dcterms:created "2123-05-04" ; + dcterms:identifier "http://localhost:5000/w6ibb" ; + dcterms:modified "2123-05-04" ; + osf:fileName "my-reg-file.blarg" ; + osf:filePath "/my-reg-file.blarg" ; + osf:hasFileVersion ; + osf:isContainedBy . + + a osf:FileVersion ; + dcterms:created "2123-05-04" ; + dcterms:creator ; + dcterms:extent "0.000016 MB" ; + dcterms:format "img/png" ; + dcterms:modified "2123-05-04" ; + dcterms:requires ; + osf:storageRegion ; + osf:versionNumber "1" . + a dcterms:Agent, foaf:Organization ; dcterms:identifier "https://cos.io/", @@ -61,3 +85,5 @@ dcterms:title "Open-Ended Registration" . rdfs:label "StudyRegistration"@en . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/registration_full.turtle b/osf_tests/metadata/expected_metadata_files/registration_full.turtle index d30c4594bbe..4ab508c2f17 100644 --- a/osf_tests/metadata/expected_metadata_files/registration_full.turtle +++ b/osf_tests/metadata/expected_metadata_files/registration_full.turtle @@ -3,7 +3,9 @@ @prefix foaf: . @prefix osf: . @prefix owl: . +@prefix prov: . @prefix rdfs: . +@prefix skos: . a osf:Registration ; dcterms:conformsTo ; @@ -21,7 +23,10 @@ dcterms:title "this is a project title!" ; dcterms:type ; dcat:accessService ; - osf:hostingInstitution . + osf:contains ; + osf:hostingInstitution ; + prov:qualifiedAttribution [ dcat:hadRole osf:admin-contributor ; + prov:agent ] . a osf:Project ; dcterms:created "2123-05-04" ; @@ -41,6 +46,25 @@ osf:hasFunding , . + a osf:File ; + dcterms:created "2123-05-04" ; + dcterms:identifier "http://localhost:5000/w6ibb" ; + dcterms:modified "2123-05-04" ; + osf:fileName "my-reg-file.blarg" ; + osf:filePath "/my-reg-file.blarg" ; + osf:hasFileVersion ; + osf:isContainedBy . + + a osf:FileVersion ; + dcterms:created "2123-05-04" ; + dcterms:creator ; + dcterms:extent "0.000016 MB" ; + dcterms:format "img/png" ; + dcterms:modified "2123-05-04" ; + dcterms:requires ; + osf:storageRegion ; + osf:versionNumber "1" . 
+ a osf:FundingAward ; dcterms:contributor ; dcterms:identifier "https://moneypockets.example/millions" ; @@ -91,3 +115,5 @@ dcterms:title "Open-Ended Registration" . rdfs:label "StudyRegistration"@en . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/registration_monthly_supplement.turtle b/osf_tests/metadata/expected_metadata_files/registration_monthly_supplement.turtle new file mode 100644 index 00000000000..435f7f4f921 --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/registration_monthly_supplement.turtle @@ -0,0 +1,13 @@ +@prefix dcat: . +@prefix dcterms: . +@prefix foaf: . +@prefix osf: . +@prefix xsd: . + + osf:usage [ dcterms:temporal "2123-05"^^xsd:gYearMonth ; + dcat:accessService ; + foaf:primaryTopic ; + osf:downloadCount 3 ; + osf:downloadSessionCount 2 ; + osf:viewCount 7 ; + osf:viewSessionCount 5 ] . diff --git a/osf_tests/metadata/expected_metadata_files/registration_supplement.turtle b/osf_tests/metadata/expected_metadata_files/registration_supplement.turtle new file mode 100644 index 00000000000..9e8201b7915 --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/registration_supplement.turtle @@ -0,0 +1,7 @@ +@prefix osf: . +@prefix skos: . + + osf:storageByteCount 17 ; + osf:storageRegion . + + skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/user_monthly_supplement.turtle b/osf_tests/metadata/expected_metadata_files/user_monthly_supplement.turtle new file mode 100644 index 00000000000..662c197699d --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/user_monthly_supplement.turtle @@ -0,0 +1 @@ +# correctly empty (for now) diff --git a/osf_tests/metadata/expected_metadata_files/user_supplement.turtle b/osf_tests/metadata/expected_metadata_files/user_supplement.turtle new file mode 100644 index 00000000000..662c197699d --- /dev/null +++ b/osf_tests/metadata/expected_metadata_files/user_supplement.turtle @@ -0,0 +1 @@ +# correctly empty (for now) diff --git a/osf_tests/metadata/test_basket.py b/osf_tests/metadata/test_basket.py index 1fa8381cf08..c34ded3e2c5 100644 --- a/osf_tests/metadata/test_basket.py +++ b/osf_tests/metadata/test_basket.py @@ -34,7 +34,7 @@ def test_goodbasket(): basket = gather.Basket(focus) assert basket.focus == focus assert isinstance(basket.gathered_metadata, rdflib.Graph) - assert len(basket.gathered_metadata) == 1 + assert len(basket.gathered_metadata) == 0 assert len(basket._gathertasks_done) == 0 assert len(basket._known_focus_dict) == 1 # no repeat gathertasks: @@ -78,5 +78,6 @@ def test_goodbasket(): # reset basket.reset() - assert len(basket.gathered_metadata) == 1 + assert len(basket.gathered_metadata) == 0 assert len(basket._gathertasks_done) == 0 + assert len(basket._known_focus_dict) == 1 diff --git a/osf_tests/metadata/test_gatherer_registry.py b/osf_tests/metadata/test_gatherer_registry.py index fda28eaf680..c139946ab80 100644 --- a/osf_tests/metadata/test_gatherer_registry.py +++ b/osf_tests/metadata/test_gatherer_registry.py @@ -74,6 +74,10 @@ def gather_agent_name(focus): gather_preprint_or_project_creator, gather_special_preprint_creator, } + assert get_gatherers(BAZ.Preprint, [BAZ.creator], include_focustype_defaults=False) == { + gather_preprint_or_project_creator, + gather_special_preprint_creator, + } assert get_gatherers(BAZ.Agent, [FOO.name, FOO.identifier, FOO.unknown]) == { gather_agent_name, gather_identifiers, diff --git a/osf_tests/metadata/test_osf_gathering.py 
b/osf_tests/metadata/test_osf_gathering.py index 7bd72770aba..4c064c8a690 100644 --- a/osf_tests/metadata/test_osf_gathering.py +++ b/osf_tests/metadata/test_osf_gathering.py @@ -1,4 +1,5 @@ import datetime +from unittest import mock from django.test import TestCase import rdflib @@ -11,15 +12,19 @@ FOAF, OSF, OSFIO, + DCAT, DCTERMS, DCMITYPE, DOI, OWL, + PROV, RDF, SKOS, checksum_iri, ) from osf import models as osfdb +from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.utils import YearMonth from osf.utils import permissions, workflows from osf_tests import factories from website import settings as website_settings @@ -36,12 +41,13 @@ def setUpTestData(cls): external_identity={'ORCID': {'1234-4321-5678-8765': 'VERIFIED'}}, ) cls.user__readonly = factories.UserFactory( - external_identity={'ORCID': {'1234-4321-6789-9876': 'CREATE'}}, + external_identity={'ORCID': {'1234-4321-6789-9876': 'CREATE'}}, # unverified orcid social={ 'profileWebsites': ['http://mysite.example', 'http://myothersite.example/foo'], 'baiduScholar': 'blarg', }, ) + cls.user__invisible = factories.UserFactory() # cedar metadata template cls.cedar_template = factories.CedarMetadataTemplateFactory( cedar_id='https://repo.metadatacenter.org/templates/this-is-a-cedar-id', @@ -51,8 +57,11 @@ def setUpTestData(cls): ) # project (with components): cls.project = factories.ProjectFactory(creator=cls.user__admin, is_public=True) + cls.project.add_addon('box', auth=None) + cls.project.add_addon('gitlab', auth=None) cls.project.add_contributor(cls.user__readwrite, permissions=permissions.WRITE) - cls.project.add_contributor(cls.user__readonly, permissions=permissions.READ, visible=False) + cls.project.add_contributor(cls.user__readonly, permissions=permissions.READ) + cls.project.add_contributor(cls.user__invisible, permissions=permissions.WRITE, visible=False) cls.component = factories.ProjectFactory(parent=cls.project, creator=cls.user__admin, is_public=True) cls.sibcomponent = factories.ProjectFactory(parent=cls.project, creator=cls.user__admin, is_public=True) cls.subcomponent = factories.ProjectFactory(parent=cls.component, creator=cls.user__admin, is_public=True) @@ -89,7 +98,8 @@ def setUpTestData(cls): is_public=True, ) cls.preprint.add_contributor(cls.user__readwrite, permissions=permissions.WRITE) - cls.preprint.add_contributor(cls.user__readonly, permissions=permissions.READ, visible=False) + cls.preprint.add_contributor(cls.user__readonly, permissions=permissions.READ) + cls.preprint.add_contributor(cls.user__invisible, permissions=permissions.WRITE, visible=False) cls.registration_cedar_record = factories.CedarMetadataRecordFactory( template=cls.cedar_template, is_published=True, @@ -453,6 +463,7 @@ def test_gather_versions(self): # focus: file fileversion = self.file.versions.first() fileversion_iri = URIRef(f'{self.filefocus.iri}?revision={fileversion.identifier}') + storageregion_iri = URIRef(f'{website_settings.API_DOMAIN}v2/regions/us/') assert_triples(osf_gathering.gather_versions(self.filefocus), { (self.filefocus.iri, OSF.hasFileVersion, fileversion_iri), (fileversion_iri, RDF.type, OSF.FileVersion), @@ -462,7 +473,9 @@ def test_gather_versions(self): (fileversion_iri, DCTERMS['format'], Literal(fileversion.content_type)), (fileversion_iri, DCTERMS.extent, Literal('0.118 MB')), (fileversion_iri, OSF.versionNumber, Literal(fileversion.identifier)), - (fileversion_iri, DCTERMS.requires, checksum_iri('sha-256', self.file_sha256)) + (fileversion_iri, DCTERMS.requires, checksum_iri('sha-256', 
self.file_sha256)), + (fileversion_iri, OSF.storageRegion, storageregion_iri), + (storageregion_iri, SKOS.prefLabel, Literal('United States', lang='en')), }) def test_gather_files(self): @@ -521,11 +534,19 @@ def test_gather_agents(self): assert_triples(osf_gathering.gather_agents(self.projectfocus), { (self.projectfocus.iri, DCTERMS.creator, self.userfocus__admin), (self.projectfocus.iri, DCTERMS.creator, self.userfocus__readwrite), + (self.projectfocus.iri, DCTERMS.creator, self.userfocus__readonly), }) # focus: registration assert_triples(osf_gathering.gather_agents(self.registrationfocus), { (self.registrationfocus.iri, DCTERMS.creator, self.userfocus__admin), (self.registrationfocus.iri, DCTERMS.creator, self.userfocus__readwrite), + (self.registrationfocus.iri, DCTERMS.creator, self.userfocus__readonly), + }) + # focus: preprint + assert_triples(osf_gathering.gather_agents(self.preprintfocus), { + (self.preprintfocus.iri, DCTERMS.creator, self.userfocus__admin), + (self.preprintfocus.iri, DCTERMS.creator, self.userfocus__readwrite), + (self.preprintfocus.iri, DCTERMS.creator, self.userfocus__readonly), }) # focus: file assert_triples(osf_gathering.gather_agents(self.filefocus), set()) @@ -750,3 +771,116 @@ def test_gather_cedar_templates(self): (self.filefocus.iri, OSF.hasCedarTemplate, cedar_template_iri), (cedar_template_iri, DCTERMS.title, Literal(self.cedar_template.schema_name)) }) + + def test_gather_last_month_usage(self): + # no usage report: + with mock.patch( + 'osf.metrics.reports.PublicItemUsageReport.for_last_month', + return_value=None, + ): + assert_triples(osf_gathering.gather_last_month_usage(self.projectfocus), set()) + # yes usage report: + _ym = YearMonth.from_date(datetime.datetime.now(tz=datetime.UTC)) + with mock.patch( + 'osf.metrics.reports.PublicItemUsageReport.for_last_month', + return_value=PublicItemUsageReport( + item_osfid=self.project._id, + report_yearmonth=_ym, + view_count=71, + view_session_count=13, + download_count=43, + download_session_count=11, + ), + ): + _usage_bnode = rdflib.BNode() + assert_triples(osf_gathering.gather_last_month_usage(self.projectfocus), { + (self.projectfocus.iri, OSF.usage, _usage_bnode), + (_usage_bnode, DCTERMS.temporal, Literal(str(_ym), datatype=rdflib.XSD.gYearMonth)), + (_usage_bnode, DCAT.accessService, rdflib.URIRef(website_settings.DOMAIN.rstrip('/'))), + (_usage_bnode, FOAF.primaryTopic, self.projectfocus.iri), + (_usage_bnode, OSF.viewCount, Literal(71)), + (_usage_bnode, OSF.viewSessionCount, Literal(13)), + (_usage_bnode, OSF.downloadCount, Literal(43)), + (_usage_bnode, OSF.downloadSessionCount, Literal(11)), + }) + + def test_gather_addons(self): + # registration (without non-default addon) + assert_triples(osf_gathering.gather_addons(self.registrationfocus), set()) + # project (with non-default addons) + _box_ref = rdflib.URIRef('urn:osf.io:addons:box') + _gitlab_ref = rdflib.URIRef('urn:osf.io:addons:gitlab') + assert_triples(osf_gathering.gather_addons(self.projectfocus), { + (self.projectfocus.iri, OSF.hasOsfAddon, _box_ref), + (_box_ref, RDF.type, OSF.AddonImplementation), + (_box_ref, DCTERMS.identifier, Literal('box')), + (_box_ref, SKOS.prefLabel, Literal('Box')), + (self.projectfocus.iri, OSF.hasOsfAddon, _gitlab_ref), + (_gitlab_ref, RDF.type, OSF.AddonImplementation), + (_gitlab_ref, DCTERMS.identifier, Literal('gitlab')), + (_gitlab_ref, SKOS.prefLabel, Literal('GitLab')), + }) + + def test_gather_storage_region(self): + _default_region_ref = 
rdflib.URIRef(f'{website_settings.API_DOMAIN}v2/regions/us/') + assert_triples(osf_gathering.gather_storage_region(self.projectfocus), { + (self.projectfocus.iri, OSF.storageRegion, _default_region_ref), + (_default_region_ref, SKOS.prefLabel, Literal('United States', lang='en')), + }) + assert_triples(osf_gathering.gather_storage_region(self.registrationfocus), { + (self.registrationfocus.iri, OSF.storageRegion, _default_region_ref), + (_default_region_ref, SKOS.prefLabel, Literal('United States', lang='en')), + }) + assert_triples(osf_gathering.gather_storage_region(self.preprintfocus), { + (self.preprintfocus.iri, OSF.storageRegion, _default_region_ref), + (_default_region_ref, SKOS.prefLabel, Literal('United States', lang='en')), + }) + + def test_gather_qualified_attributions(self): + _attribution_admin = rdflib.BNode() + _attribution_readwrite = rdflib.BNode() + _attribution_readonly = rdflib.BNode() + assert_triples(osf_gathering.gather_qualified_attributions(self.projectfocus), { + (self.projectfocus.iri, PROV.qualifiedAttribution, _attribution_admin), + (_attribution_admin, PROV.agent, self.userfocus__admin), + (_attribution_admin, DCAT.hadRole, OSF['admin-contributor']), + (self.projectfocus.iri, PROV.qualifiedAttribution, _attribution_readwrite), + (_attribution_readwrite, PROV.agent, self.userfocus__readwrite), + (_attribution_readwrite, DCAT.hadRole, OSF['write-contributor']), + (self.projectfocus.iri, PROV.qualifiedAttribution, _attribution_readonly), + (_attribution_readonly, PROV.agent, self.userfocus__readonly), + (_attribution_readonly, DCAT.hadRole, OSF['readonly-contributor']), + }) + assert_triples(osf_gathering.gather_qualified_attributions(self.registrationfocus), { + (self.registrationfocus.iri, PROV.qualifiedAttribution, _attribution_admin), + (_attribution_admin, PROV.agent, self.userfocus__admin), + (_attribution_admin, DCAT.hadRole, OSF['admin-contributor']), + (self.registrationfocus.iri, PROV.qualifiedAttribution, _attribution_readwrite), + (_attribution_readwrite, PROV.agent, self.userfocus__readwrite), + (_attribution_readwrite, DCAT.hadRole, OSF['write-contributor']), + (self.registrationfocus.iri, PROV.qualifiedAttribution, _attribution_readonly), + (_attribution_readonly, PROV.agent, self.userfocus__readonly), + (_attribution_readonly, DCAT.hadRole, OSF['readonly-contributor']), + }) + assert_triples(osf_gathering.gather_qualified_attributions(self.preprintfocus), { + (self.preprintfocus.iri, PROV.qualifiedAttribution, _attribution_admin), + (_attribution_admin, PROV.agent, self.userfocus__admin), + (_attribution_admin, DCAT.hadRole, OSF['admin-contributor']), + (self.preprintfocus.iri, PROV.qualifiedAttribution, _attribution_readwrite), + (_attribution_readwrite, PROV.agent, self.userfocus__readwrite), + (_attribution_readwrite, DCAT.hadRole, OSF['write-contributor']), + (self.preprintfocus.iri, PROV.qualifiedAttribution, _attribution_readonly), + (_attribution_readonly, PROV.agent, self.userfocus__readonly), + (_attribution_readonly, DCAT.hadRole, OSF['readonly-contributor']), + }) + + def test_gather_storage_byte_count(self): + assert_triples(osf_gathering.gather_storage_byte_count(self.projectfocus), { + (self.projectfocus.iri, OSF.storageByteCount, Literal(123456)), + }) + assert_triples(osf_gathering.gather_storage_byte_count(self.registrationfocus), { + (self.registrationfocus.iri, OSF.storageByteCount, Literal(0)), + }) + assert_triples(osf_gathering.gather_storage_byte_count(self.preprintfocus), { + (self.preprintfocus.iri, OSF.storageByteCount, 
Literal(1337)), + }) diff --git a/osf_tests/metadata/test_serialized_metadata.py b/osf_tests/metadata/test_serialized_metadata.py index 0c74961778a..c8a0eee95ac 100644 --- a/osf_tests/metadata/test_serialized_metadata.py +++ b/osf_tests/metadata/test_serialized_metadata.py @@ -5,8 +5,11 @@ import rdflib from osf import models as osfdb +from osf.metadata.osf_gathering import OsfmapPartition from osf.metadata.rdfutils import OSF, DCTERMS from osf.metadata.tools import pls_gather_metadata_file +from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.utils import YearMonth from osf.models.licenses import NodeLicense from api_tests.utils import create_test_file from osf_tests import factories @@ -22,53 +25,103 @@ BASIC_METADATA_SCENARIO = { OSF.Project: { - 'turtle': 'project_basic.turtle', - 'datacite-xml': 'project_basic.datacite.xml', - 'datacite-json': 'project_basic.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 'project_basic.turtle', + 'datacite-xml': 'project_basic.datacite.xml', + 'datacite-json': 'project_basic.datacite.json', + }, }, OSF.Preprint: { - 'turtle': 'preprint_basic.turtle', - 'datacite-xml': 'preprint_basic.datacite.xml', - 'datacite-json': 'preprint_basic.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 'preprint_basic.turtle', + 'datacite-xml': 'preprint_basic.datacite.xml', + 'datacite-json': 'preprint_basic.datacite.json', + }, }, OSF.Registration: { - 'turtle': 'registration_basic.turtle', - 'datacite-xml': 'registration_basic.datacite.xml', - 'datacite-json': 'registration_basic.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 'registration_basic.turtle', + 'datacite-xml': 'registration_basic.datacite.xml', + 'datacite-json': 'registration_basic.datacite.json', + }, }, OSF.File: { - 'turtle': 'file_basic.turtle', - 'datacite-xml': 'file_basic.datacite.xml', - 'datacite-json': 'file_basic.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 'file_basic.turtle', + 'datacite-xml': 'file_basic.datacite.xml', + 'datacite-json': 'file_basic.datacite.json', + }, }, DCTERMS.Agent: { - 'turtle': 'user_basic.turtle', + OsfmapPartition.MAIN: { + 'turtle': 'user_basic.turtle', + }, }, } FULL_METADATA_SCENARIO = { OSF.Project: { - 'turtle': 'project_full.turtle', - 'datacite-xml': 'project_full.datacite.xml', - 'datacite-json': 'project_full.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 'project_full.turtle', + 'datacite-xml': 'project_full.datacite.xml', + 'datacite-json': 'project_full.datacite.json', + }, + OsfmapPartition.SUPPLEMENT: { + 'turtle': 'project_supplement.turtle', + }, + OsfmapPartition.MONTHLY_SUPPLEMENT: { + 'turtle': 'project_monthly_supplement.turtle', + }, }, OSF.Preprint: { - 'turtle': 'preprint_full.turtle', - 'datacite-xml': 'preprint_full.datacite.xml', - 'datacite-json': 'preprint_full.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 'preprint_full.turtle', + 'datacite-xml': 'preprint_full.datacite.xml', + 'datacite-json': 'preprint_full.datacite.json', + }, + OsfmapPartition.SUPPLEMENT: { + 'turtle': 'preprint_supplement.turtle', + }, + OsfmapPartition.MONTHLY_SUPPLEMENT: { + 'turtle': 'preprint_monthly_supplement.turtle', + }, }, OSF.Registration: { - 'turtle': 'registration_full.turtle', - 'datacite-xml': 'registration_full.datacite.xml', - 'datacite-json': 'registration_full.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 'registration_full.turtle', + 'datacite-xml': 'registration_full.datacite.xml', + 'datacite-json': 'registration_full.datacite.json', + }, + OsfmapPartition.SUPPLEMENT: { + 
'turtle': 'registration_supplement.turtle', + }, + OsfmapPartition.MONTHLY_SUPPLEMENT: { + 'turtle': 'registration_monthly_supplement.turtle', + }, }, OSF.File: { - 'turtle': 'file_full.turtle', - 'datacite-xml': 'file_full.datacite.xml', - 'datacite-json': 'file_full.datacite.json', + OsfmapPartition.MAIN: { + 'turtle': 'file_full.turtle', + 'datacite-xml': 'file_full.datacite.xml', + 'datacite-json': 'file_full.datacite.json', + }, + OsfmapPartition.SUPPLEMENT: { + 'turtle': 'file_supplement.turtle', + }, + OsfmapPartition.MONTHLY_SUPPLEMENT: { + 'turtle': 'file_monthly_supplement.turtle', + }, }, DCTERMS.Agent: { - 'turtle': 'user_full.turtle', + OsfmapPartition.MAIN: { + 'turtle': 'user_full.turtle', + }, + OsfmapPartition.SUPPLEMENT: { + 'turtle': 'user_supplement.turtle', + }, + OsfmapPartition.MONTHLY_SUPPLEMENT: { + 'turtle': 'user_monthly_supplement.turtle', + }, }, } @@ -124,8 +177,7 @@ def setUp(self): mock.patch('django.utils.timezone.now', new=forever_now), mock.patch('osf.models.metaschema.RegistrationSchema.absolute_api_v2_url', new='http://fake.example/schema/for/test'), ): - patcher.start() - self.addCleanup(patcher.stop) + self.enterContext(patcher) # build test objects self.user = factories.AuthUserFactory( fullname='Person McNamington', @@ -147,12 +199,13 @@ def setUp(self): category='doi', value=f'10.70102/FK2osf.io/{self.project._id}', ) + self.project.add_addon('gitlab', auth=None) self.file = create_test_file( self.project, self.user, filename='my-file.blarg', size=7, - sha256='6ac3c336e4094835293a3fed8a4b5fedde1b5e2626d9838fed50693bba00af0e', + sha256='shashasha', ) osf_preprint_provider = factories.PreprintProviderFactory(_id='osf') another_provider = factories.PreprintProviderFactory( @@ -208,9 +261,26 @@ def setUp(self): doi_prefix='11.rp', ), ) + self.reg_file = create_test_file( + self.registration, + self.user, + filename='my-reg-file.blarg', + size=17, + sha256='shashasha', + ) osfdb.GuidMetadataRecord.objects.for_guid(self.registration._id).update({ 'resource_type_general': 'StudyRegistration', }, auth=self.user) + self.enterContext(mock.patch( + 'osf.metrics.reports.PublicItemUsageReport.for_last_month', + return_value=PublicItemUsageReport( + report_yearmonth=YearMonth.from_date(forever_now()), + view_count=7, + view_session_count=5, + download_count=3, + download_session_count=2, + ), + )) self.guid_dict = { OSF.Project: self.project._id, OSF.Preprint: self.preprint._id, @@ -261,27 +331,37 @@ def test_serialized_metadata(self): self._assert_scenario(FULL_METADATA_SCENARIO) def _assert_scenario(self, scenario_dict): - for focus_type, expected_files in scenario_dict.items(): - for format_key, filename in expected_files.items(): - osfguid = self.guid_dict[focus_type] - gathered_file = pls_gather_metadata_file(osfguid, format_key) - with self.subTest(focus_type=focus_type, format_key=format_key, testpath='pls_gather_metadata_file'): - self.assertEqual(gathered_file.mediatype, EXPECTED_MEDIATYPE[format_key]) - # to update expected metadata, uncomment `_write_expected_file` and this - # next line (being careful not to leave it uncommented...) 
and run tests - # self._write_expected_file(filename, gathered_file.serialized_metadata) - self._assert_expected_file(filename, gathered_file.serialized_metadata) + for focus_type, by_partition in scenario_dict.items(): + for osfmap_partition, expected_files in by_partition.items(): + for format_key, filename in expected_files.items(): + self._assert_scenario_file(focus_type, osfmap_partition, format_key, filename) - with self.subTest(focus_type=focus_type, format_key=format_key, testpath='metadata download'): - resp = self.app.get(f'/{osfguid}/metadata/?format={format_key}') - assert resp.status_code == 200 - self.assertEqual(resp.status_code, 200) - self.assertEqual(resp.headers['Content-Type'], EXPECTED_MEDIATYPE[format_key]) - self.assertEqual( - resp.headers['Content-Disposition'], - f'attachment; filename={gathered_file.filename}', - ) - self._assert_expected_file(filename, resp.text) + def _assert_scenario_file( + self, + focus_type: str, + osfmap_partition: OsfmapPartition, + format_key: str, + filename: str, + ): + osfguid = self.guid_dict[focus_type] + gathered_file = pls_gather_metadata_file(osfguid, format_key, {'osfmap_partition': osfmap_partition}) + with self.subTest(focus_type=focus_type, format_key=format_key, testpath='pls_gather_metadata_file'): + self.assertEqual(gathered_file.mediatype, EXPECTED_MEDIATYPE[format_key]) + # to update expected metadata, uncomment `_write_expected_file` and this + # next line (being careful not to leave it uncommented...) and run tests + # self._write_expected_file(filename, gathered_file.serialized_metadata) + self._assert_expected_file(filename, gathered_file.serialized_metadata) + if not osfmap_partition.is_supplementary: + with self.subTest(focus_type=focus_type, format_key=format_key, testpath='metadata download'): + resp = self.app.get(f'/{osfguid}/metadata/?format={format_key}') + assert resp.status_code == 200 + self.assertEqual(resp.status_code, 200) + self.assertEqual(resp.headers['Content-Type'], EXPECTED_MEDIATYPE[format_key]) + self.assertEqual( + resp.headers['Content-Disposition'], + f'attachment; filename={gathered_file.filename}', + ) + self._assert_expected_file(filename, resp.text) def _assert_expected_file(self, filename, actual_metadata): _open_mode = ('rb' if isinstance(actual_metadata, bytes) else 'r') @@ -290,16 +370,16 @@ def _assert_expected_file(self, filename, actual_metadata): if filename.endswith('.turtle'): # HACK: because the turtle serializer may output things in different order # TODO: stable turtle serializer (or another primitive rdf serialization) - self._assert_equivalent_turtle(actual_metadata, _expected_metadata) + self._assert_equivalent_turtle(actual_metadata, _expected_metadata, filename) else: self.assertEqual(actual_metadata, _expected_metadata) - def _assert_equivalent_turtle(self, actual_turtle, expected_turtle): + def _assert_equivalent_turtle(self, actual_turtle, expected_turtle, filename): _actual = rdflib.Graph() _actual.parse(data=actual_turtle, format='turtle') _expected = rdflib.Graph() _expected.parse(data=expected_turtle, format='turtle') - assert_graphs_equal(_actual, _expected) + assert_graphs_equal(_actual, _expected, label=filename) # def _write_expected_file(self, filename, expected_metadata): # '''for updating expected metadata files from current serializers diff --git a/osf_tests/metrics/reporters/__init__.py b/osf_tests/metrics/reporters/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/osf_tests/metrics/reporters/_testutils.py 
b/osf_tests/metrics/reporters/_testutils.py new file mode 100644 index 00000000000..0d18f3bcac9 --- /dev/null +++ b/osf_tests/metrics/reporters/_testutils.py @@ -0,0 +1,10 @@ +from osf.metrics.reporters._base import MonthlyReporter +from osf.metrics.reports import MonthlyReport + + +def list_monthly_reports(reporter: MonthlyReporter) -> list[MonthlyReport]: + _reports = ( + reporter.report(**_kwargs) + for _kwargs in reporter.iter_report_kwargs() + ) + return [_report for _report in _reports if (_report is not None)] diff --git a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py new file mode 100644 index 00000000000..05baa4d38e7 --- /dev/null +++ b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py @@ -0,0 +1,287 @@ +import time +import datetime +import logging +from django.test import TestCase +from osf.metrics.reporters import InstitutionalSummaryMonthlyReporter +from osf.metrics.utils import YearMonth +from osf_tests.factories import ( + InstitutionFactory, + ProjectFactory, + RegistrationFactory, + PreprintFactory, + AuthUserFactory, +) +from ._testutils import list_monthly_reports + + +class TestInstiSummaryMonthlyReporter(TestCase): + + @classmethod + def setUpTestData(cls): + cls._yearmonth = YearMonth(2018, 2) # February 2018 + cls._institution = InstitutionFactory() + cls._now = datetime.datetime(2018, 2, 4, tzinfo=datetime.UTC) + + # Existing data for the primary institution + cls._public_project = cls._create_affiliated_project(cls._institution, is_public=True, created=cls._now) + cls._private_project = cls._create_affiliated_project(cls._institution, is_public=False, created=cls._now) + cls._public_registration = cls._create_affiliated_registration(cls._institution, is_public=True, created=cls._now) + cls._embargoed_registration = cls._create_affiliated_registration(cls._institution, is_public=False, created=cls._now) + + cls._published_preprint = cls._create_affiliated_preprint(cls._institution, is_public=True, created=cls._now) + + cls._logged_in_user = cls._create_logged_in_user(cls._institution, date_last_login=cls._now) + cls._active_user = cls._create_active_user(cls._institution, date_confirmed=cls._now - datetime.timedelta(days=1)) + + @classmethod + def _create_affiliated_preprint(cls, institution, is_public, created): + published_preprint = PreprintFactory(is_public=is_public) + published_preprint.affiliated_institutions.add(institution) + published_preprint.created = created + published_preprint.save() + return published_preprint + + @classmethod + def _create_affiliated_project(cls, institution, is_public, created): + project = ProjectFactory(is_public=is_public) + project.affiliated_institutions.add(institution) + project.created = created + project.save() + return project + + @classmethod + def _create_affiliated_registration(cls, institution, is_public, created): + registration = RegistrationFactory(is_public=is_public) + registration.affiliated_institutions.add(institution) + registration.created = created + registration.save() + return registration + + @classmethod + def _create_logged_in_user(cls, institution, date_last_login): + user = AuthUserFactory() + user.add_or_update_affiliated_institution(institution) + user.date_last_login = date_last_login + user.save() + return user + + @classmethod + def _create_active_user(cls, institution, date_confirmed): + user = AuthUserFactory() + user.add_or_update_affiliated_institution(institution) + user.date_confirmed 
= date_confirmed + ProjectFactory(creator=user) # adds log to make active + log = user.logs.get() + log.created = date_confirmed + log.save() + user.save() + return user + + def test_report_generation(self): + reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) + reports = list_monthly_reports(reporter) + self.assertEqual(len(reports), 1) + + report = reports[0] + self.assertEqual(report.institution_id, self._institution._id) + self.assertEqual(report.user_count, 2) # _logged_in_user and _active_user + self.assertEqual(report.public_project_count, 1) + self.assertEqual(report.private_project_count, 1) + self.assertEqual(report.public_registration_count, 1) + self.assertEqual(report.embargoed_registration_count, 1) + self.assertEqual(report.published_preprint_count, 1) + self.assertEqual(report.storage_byte_count, 1337) # test value for one file + self.assertEqual(report.public_file_count, 1) + self.assertEqual(report.monthly_logged_in_user_count, 1) + self.assertEqual(report.monthly_active_user_count, 1) + + def test_report_generation_multiple_institutions(self): + institution2 = InstitutionFactory() + institution3 = InstitutionFactory() + + # Set up dates for different months + last_month = datetime.datetime(2018, 1, 15, tzinfo=datetime.UTC) + next_month = datetime.datetime(2018, 3, 10, tzinfo=datetime.UTC) + + self._create_affiliated_project(institution2, is_public=True, created=self._now) + self._create_affiliated_project(institution3, is_public=True, created=last_month) + + # Create future projects for self._institution (should not be counted) + self._create_affiliated_project(self._institution, is_public=True, created=next_month) + + # Create users affiliated with different institutions + self._create_active_user(institution2, date_confirmed=self._now) + self._create_active_user(institution3, date_confirmed=last_month) + + # Run the reporter for the current month (February 2018) + reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) + reports = list_monthly_reports(reporter) + self.assertEqual(len(reports), 3) # Reports for self._institution, institution2, institution3 + + # Extract reports by institution + report_institution = next(r for r in reports if r.institution_id == self._institution._id) + report_institution2 = next(r for r in reports if r.institution_id == institution2._id) + + # Validate report for self._institution + self.assertEqual(report_institution.public_project_count, 1) + self.assertEqual(report_institution.private_project_count, 1) + self.assertEqual(report_institution.user_count, 2) + self.assertEqual(report_institution.monthly_active_user_count, 1) + self.assertEqual(report_institution.monthly_logged_in_user_count, 1) + + # Validate report for institution2 + self.assertEqual(report_institution2.public_project_count, 1) + self.assertEqual(report_institution2.private_project_count, 0) + self.assertEqual(report_institution2.user_count, 1) + self.assertEqual(report_institution2.monthly_active_user_count, 1) + self.assertEqual(report_institution2.monthly_logged_in_user_count, 0) # No logged-in users + + +class TestSummaryMonthlyReporterBenchmarker(TestCase): + + @classmethod + def setUpTestData(cls): + cls.logger = logging.getLogger(__name__) + logging.basicConfig(level=logging.INFO) + cls._yearmonth = YearMonth(2018, 2) # February 2018 + cls._institution = InstitutionFactory() + cls._now = datetime.datetime(2018, 2, 4, tzinfo=datetime.UTC) + cls.enable_benchmarking = True + + @classmethod + def _create_affiliated_preprint(cls, institution, 
is_public, created, creator=None): + published_preprint = PreprintFactory(is_public=is_public, creator=creator) + published_preprint.affiliated_institutions.add(institution) + published_preprint.created = created + published_preprint.save() + return published_preprint + + @classmethod + def _create_affiliated_project(cls, institution, is_public, created, creator=None): + project = ProjectFactory(is_public=is_public, creator=creator) + project.affiliated_institutions.add(institution) + project.created = created + project.save() + return project + + @classmethod + def _create_affiliated_registration(cls, institution, is_public, created, creator=None): + registration = RegistrationFactory(is_public=is_public, creator=creator) + registration.affiliated_institutions.add(institution) + registration.created = created + registration.save() + return registration + + @classmethod + def _create_logged_in_user(cls, institution, date_last_login): + user = AuthUserFactory() + user.add_or_update_affiliated_institution(institution) + user.date_last_login = date_last_login + user.save() + return user + + @classmethod + def _create_active_user(cls, institution, date_confirmed): + user = AuthUserFactory() + user.add_or_update_affiliated_institution(institution) + user.date_confirmed = date_confirmed + ProjectFactory(creator=user) # adds log to make active + log = user.logs.get() + log.created = date_confirmed + log.save() + user.save() + return user + + def test_high_counts_multiple_institutions(self): + """ + Test the report generation with configurable high counts for institutions, users, and their objects. + Benchmarking can be enabled by setting the 'enable_benchmarking' attribute to True. + """ + # Check if benchmarking is enabled + enable_benchmarking = self.enable_benchmarking + + # Configure counts (adjust these numbers as needed) + additional_institution_count = 1 # Number of institutions (adjust as needed) + users_per_institution = 3 # Number of users per institution (adjust as needed) + objects_per_user = 3 # Number of objects per user (adjust as needed) + + # Timing variables + if enable_benchmarking: + total_start_time = time.time() + data_creation_start_time = time.time() + + # Create institutions + institutions = [self._institution] + institutions += [InstitutionFactory() for _ in range(additional_institution_count)] + + if enable_benchmarking: + institutions_creation_time = time.time() + self.logger.info( + f"Time taken to create {additional_institution_count + 1} institutions: {institutions_creation_time - data_creation_start_time:.2f} seconds") + + # Generate data for each institution + if enable_benchmarking: + users_creation_start_time = time.time() + institution_users = {} + for institution in institutions: + # Create users for the institution + users = [] + for _ in range(users_per_institution): + user = AuthUserFactory() + user.add_or_update_affiliated_institution(institution) + user.date_last_login = self._now + user.date_confirmed = self._now - datetime.timedelta(days=1) + user.save() + users.append(user) + institution_users[institution] = users + + if enable_benchmarking: + users_creation_time = time.time() + self.logger.info(f"Time taken to create users: {users_creation_time - users_creation_start_time:.2f} seconds") + + # Create projects, registrations, and preprints for each user + if enable_benchmarking: + objects_creation_start_time = time.time() + for institution in institutions: + users = institution_users[institution] + for user in users: + for _ in range(objects_per_user): 
+ self._create_affiliated_project(institution, is_public=True, created=self._now, creator=user) + self._create_affiliated_project(institution, is_public=False, created=self._now, creator=user) + self._create_affiliated_registration(institution, is_public=True, created=self._now, creator=user) + self._create_affiliated_registration(institution, is_public=False, created=self._now, creator=user) + self._create_affiliated_preprint(institution, is_public=True, created=self._now, creator=user) + + if enable_benchmarking: + objects_creation_time = time.time() + self.logger.info( + f"Time taken to create objects: {objects_creation_time - objects_creation_start_time:.2f} seconds") + data_creation_end_time = time.time() + self.logger.info( + f"Total time taken to create data: {data_creation_end_time - data_creation_start_time:.2f} seconds") + + # Run the reporter + if enable_benchmarking: + reporter_start_time = time.time() + reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) + reports = list_monthly_reports(reporter) + assert len(reports) == additional_institution_count + 1 + + if enable_benchmarking: + reporter_end_time = time.time() + self.logger.info(f"Time taken to run the reporter: {reporter_end_time - reporter_start_time:.2f} seconds") + total_end_time = time.time() + self.logger.info(f"Total test execution time: {total_end_time - total_start_time:.2f} seconds") + + self.assertEqual(len(reports), additional_institution_count + 1) + + # Validate counts for each institution + expected_count = users_per_institution * objects_per_user + for report in reports: + self.assertEqual(report.public_project_count, expected_count) + self.assertEqual(report.private_project_count, expected_count) + self.assertEqual(report.public_registration_count, expected_count) + self.assertEqual(report.embargoed_registration_count, expected_count) + self.assertEqual(report.published_preprint_count, expected_count) + self.assertEqual(report.user_count, users_per_institution) + self.assertEqual(report.monthly_logged_in_user_count, users_per_institution) diff --git a/osf_tests/metrics/reporters/test_institutional_users_reporter.py b/osf_tests/metrics/reporters/test_institutional_users_reporter.py new file mode 100644 index 00000000000..275fcb1e8a1 --- /dev/null +++ b/osf_tests/metrics/reporters/test_institutional_users_reporter.py @@ -0,0 +1,263 @@ +from __future__ import annotations +import dataclasses +import datetime +import unittest + +from django.test import TestCase + +from api_tests.utils import create_test_file +from osf import models as osfdb +from osf.metrics.reports import InstitutionalUserReport +from osf.metrics.reporters import InstitutionalUsersReporter +from osf.metrics.utils import YearMonth +from osf_tests.factories import ( + InstitutionFactory, + PreprintFactory, + ProjectFactory, + RegistrationFactory, + UserFactory, + EmbargoFactory, +) +from ._testutils import list_monthly_reports + + +def _patch_now(fakenow: datetime.datetime): + return unittest.mock.patch('django.utils.timezone.now', return_value=fakenow) + + +class TestInstiUsersReporter(TestCase): + @classmethod + def setUpTestData(cls): + cls._yearmonth = YearMonth(2012, 7) + cls._now = datetime.datetime( + cls._yearmonth.year, + cls._yearmonth.month, + 13, # just some day in the month + tzinfo=datetime.UTC, + ) + with _patch_now(cls._now): + cls._institution = InstitutionFactory() + cls._user_setup_with_nothing = _InstiUserSetup(0, 0, 0, 0, 0, cls._institution, cls._now) + cls._user_setup_with_ones = _InstiUserSetup(1, 1, 1, 1, 1, 
cls._institution, cls._now) + cls._user_setup_with_stuff = _InstiUserSetup( + 2, 3, 5, 3, 2, cls._institution, cls._now, + orcid_id='1111-2222-3333-4444', + department_name='blargl studies', + ) + cls._user_setup_with_stuff.fill_uncounted_objects() + + def _assert_report_matches_setup(self, report: InstitutionalUserReport, setup: _InstiUserSetup): + self.assertEqual(report.institution_id, setup.institution._id) + # user info: + self.assertEqual(report.user_id, setup.user._id) + self.assertEqual(report.user_name, setup.user.fullname) + self.assertEqual(report.department_name, setup.department_name) + self.assertEqual(report.month_last_login, YearMonth.from_date(setup.user.date_last_login)) + if setup.month_last_active: + self.assertEqual(report.month_last_active, YearMonth.from_date(setup.month_last_active)) + else: + self.assertEqual(report.month_last_active, setup.month_last_active) + + self.assertEqual(report.account_creation_date, YearMonth.from_date(setup.user.created)) + self.assertEqual(report.orcid_id, setup.orcid_id) + # counts (NOTE: report.public_file_count and report.storage_byte_count tested separately) + self.assertEqual(report.public_project_count, setup.public_project_count) + self.assertEqual(report.private_project_count, setup.private_project_count) + self.assertEqual(report.public_registration_count, setup.public_registration_count) + self.assertEqual(report.embargoed_registration_count, setup.embargoed_registration_count) + self.assertEqual(report.published_preprint_count, setup.published_preprint_count) + + def test_no_users(self): + _actual_reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) + self.assertEqual(_actual_reports, []) + + def test_one_user_with_nothing(self): + self._user_setup_with_nothing.affiliate_user() + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) + self.assertEqual(len(_reports), 1) + self._assert_report_matches_setup(_reports[0], self._user_setup_with_nothing) + + def test_one_user_with_ones(self): + self._user_setup_with_ones.affiliate_user() + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) + self.assertEqual(len(_reports), 1) + self._assert_report_matches_setup(_reports[0], self._user_setup_with_ones) + + def test_one_user_with_stuff_and_no_files(self): + self._user_setup_with_stuff.affiliate_user() + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) + self.assertEqual(len(_reports), 1) + self._assert_report_matches_setup(_reports[0], self._user_setup_with_stuff) + self.assertEqual(_reports[0].public_file_count, 2) # preprint 2 files + self.assertEqual(_reports[0].storage_byte_count, 2674) # preprint bytes + + def test_one_user_with_stuff_and_a_file(self): + self._user_setup_with_stuff.affiliate_user() + _user = self._user_setup_with_stuff.user + _project = _user.nodes.first() + with _patch_now(self._now): + create_test_file(target=_project, user=_user, size=37) + (_report,) = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) + self._assert_report_matches_setup(_report, self._user_setup_with_stuff) + self.assertEqual(_report.public_file_count, 3) # 2 preprint files + self.assertEqual(_report.storage_byte_count, 2711) # 2 preprint files + + def test_one_user_with_stuff_and_multiple_files(self): + self._user_setup_with_stuff.affiliate_user() + _user = self._user_setup_with_stuff.user + _project = _user.nodes.first() + with _patch_now(self._now): + create_test_file(target=_project, user=_user, size=37, filename='b') + 
create_test_file(target=_project, user=_user, size=73, filename='bl') + _component = ProjectFactory(parent=_project, creator=_user, is_public=True) + _component.affiliated_institutions.add(self._institution) + create_test_file(target=_component, user=_user, size=53, filename='bla') + create_test_file(target=_component, user=_user, size=51, filename='blar') + create_test_file(target=_component, user=_user, size=47, filename='blarg') + (_report,) = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) + self._assert_report_matches_setup(_report, self._user_setup_with_stuff) + self.assertEqual(_report.public_file_count, 7) # 2 preprint files + self.assertEqual(_report.storage_byte_count, 2935) # 2 preprint files + 37 + 73 + 53 + 51 + 47 + + def test_several_users(self): + _setups = [ + self._user_setup_with_nothing, + self._user_setup_with_ones, + self._user_setup_with_stuff, + ] + for _setup in _setups: + _setup.affiliate_user() + _setup_by_userid = { + _setup.user._id: _setup + for _setup in _setups + } + _reports = list_monthly_reports(InstitutionalUsersReporter(self._yearmonth)) + self.assertEqual(len(_reports), len(_setup_by_userid)) + for _actual_report in _reports: + _setup = _setup_by_userid[_actual_report.user_id] + self._assert_report_matches_setup(_actual_report, _setup) + + +@dataclasses.dataclass +class _InstiUserSetup: + '''helper class to simplify database setup for a test-case + + (note: public_file_count and storage_byte_count set up separately) + ''' + public_project_count: int + private_project_count: int + public_registration_count: int + embargoed_registration_count: int + published_preprint_count: int + institution: osfdb.Institution + now: datetime.datetime + department_name: str | None = None + orcid_id: str | None = None + user: osfdb.OSFUser = dataclasses.field(init=False) + month_last_active: datetime.datetime | None = dataclasses.field(init=False) + + def __post_init__(self): + self.user = UserFactory( + date_last_login=self.now, + external_identity=( + {'ORCID': {self.orcid_id: 'VERIFIED'}} + if self.orcid_id + else {} + ), + ) + self._add_affiliations(self._generate_counted_objects()) + node_logs = self.user.logs.order_by('-created') + preprint_logs = self.user.preprint_logs.order_by('-created') + + dates = filter(bool, [ + node_logs.values_list('created', flat=True).first(), + preprint_logs.values_list('created', flat=True).first(), + ]) + + self.month_last_active = max(dates, default=None) + + def affiliate_user(self): + self.user.add_or_update_affiliated_institution( + self.institution, + sso_department=self.department_name, + ) + + @property + def future_timestamp(self): + return self.now + datetime.timedelta(days=123) + + def fill_uncounted_objects(self): + # uncounted because not affiliated: + self._add_public_project() + self._add_private_project() + self._add_public_registration() + self._add_embargoed_registration() + self._add_published_preprint() + # uncounted because affiliated with another institution: + self._add_affiliations(( + self._add_public_project(), + self._add_private_project(), + self._add_public_registration(), + self._add_embargoed_registration(), + self._add_published_preprint(), + ), institution=InstitutionFactory()) + # uncounted because created after the report's time range: + with _patch_now(self.future_timestamp): + self._add_affiliations(( + self._add_public_project(), + self._add_private_project(), + self._add_public_registration(), + self._add_embargoed_registration(), + self._add_published_preprint(), + )) + + def 
_add_affiliations(self, objs, institution=None): + for _obj in objs: + if _obj is not None: + _obj.affiliated_institutions.add(institution or self.institution) + + def _generate_counted_objects(self): + for _ in range(self.public_project_count): + yield self._add_public_project() + for _ in range(self.private_project_count): + yield self._add_private_project() + for _ in range(self.public_registration_count): + yield self._add_public_registration() + for _ in range(self.embargoed_registration_count): + yield self._add_embargoed_registration() + for _ in range(self.published_preprint_count): + yield self._add_published_preprint() + + def _add_public_project(self) -> osfdb.Node: + return ProjectFactory( + creator=self.user, + is_public=True, + ) + + def _add_private_project(self) -> osfdb.Node: + return ProjectFactory( + creator=self.user, + is_public=False, + ) + + def _add_public_registration(self) -> osfdb.Registration: + return RegistrationFactory( + creator=self.user, + is_public=True, + ) + + def _add_embargoed_registration(self) -> osfdb.Registration: + return RegistrationFactory( + creator=self.user, + is_public=False, + embargo=EmbargoFactory( + user=self.user, + end_date=self.future_timestamp, + ), + ) + + def _add_published_preprint(self) -> osfdb.Preprint | None: + return PreprintFactory( + creator=self.user, + is_public=True, + ) diff --git a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py new file mode 100644 index 00000000000..b75c420b1a2 --- /dev/null +++ b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py @@ -0,0 +1,284 @@ +from datetime import datetime, timedelta +from operator import attrgetter +from unittest import mock + +import pytest + +from osf.metrics.counted_usage import CountedAuthUsage +from osf.metrics.preprint_metrics import ( + PreprintDownload, + PreprintView, +) +from osf.metrics.reporters.public_item_usage import PublicItemUsageReporter +from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.utils import YearMonth +from osf import models as osfdb +from osf_tests import factories +from ._testutils import list_monthly_reports + + +@pytest.mark.es_metrics +@pytest.mark.django_db +class TestPublicItemUsageReporter: + @pytest.fixture(autouse=True) + def _patch_settings(self): + with mock.patch('website.settings.DOMAIN', 'http://osf.example'): + yield + + @pytest.fixture + def item0(self): + _item0 = factories.PreprintFactory(is_public=True) + _item0._id = 'item0' + return _item0 + + @pytest.fixture + def item1(self): + _item1 = factories.ProjectFactory(is_public=True) + _item1._id = 'item1' + return _item1 + + @pytest.fixture + def item2(self, item1): + _item2 = factories.ProjectFactory(is_public=True, parent=item1) + _item2._id = 'item2' + return _item2 + + @pytest.fixture + def ym_empty(self) -> YearMonth: + return YearMonth(2012, 7) + + @pytest.fixture + def ym_sparse(self) -> YearMonth: + return YearMonth(2017, 7) + + @pytest.fixture + def ym_busy(self) -> YearMonth: + return YearMonth(2023, 7) + + @pytest.fixture + def sparse_month_usage(self, ym_sparse, item0, item1, item2): + # "sparse" month: + # item0: 3 views, 0 downloads, 2 sessions + # item1: 1 view, 1 download, 1 session (plus 1 view from child item2) + # item2: 1 view, 1 download, 1 session + _month_start = ym_sparse.month_start() + _save_usage( + item0, + timestamp=_month_start, + session_id='sesh0', + action_labels=['view'], + ) + _save_usage( + item0, + timestamp=_month_start +
timedelta(minutes=2), + session_id='sesh0', + action_labels=['view'], + ) + _save_usage( + item1, + timestamp=_month_start + timedelta(minutes=3), + session_id='sesh0', + action_labels=['download'], + ) + _save_usage( + item0, + timestamp=_month_start + timedelta(days=17), + session_id='sesh1', + action_labels=['view'], + ) + _save_usage( + item1, + timestamp=_month_start + timedelta(days=17, minutes=3), + session_id='sesh1', + action_labels=['view'], + ) + _save_usage( + item2, + timestamp=_month_start + timedelta(days=17, minutes=5), + session_id='sesh1', + action_labels=['view'], + ) + _save_usage( + item2, + timestamp=_month_start + timedelta(days=17, minutes=11), + session_id='sesh1', + action_labels=['download'], + ) + + @pytest.fixture + def busy_month_item0(self, ym_busy, item0): + # item0: 4 sessions, 4*7 views, 4*5 downloads + _month_start = ym_busy.month_start() + for _sesh in range(0, 4): + _sesh_start = _month_start + timedelta(days=_sesh) + for _minute in range(0, 7): + _save_usage( + item0, + timestamp=_sesh_start + timedelta(minutes=_minute), + session_id=f'sesh0{_sesh}', + action_labels=['view'], + ) + for _minute in range(10, 15): + _save_usage( + item0, + timestamp=_sesh_start + timedelta(minutes=_minute), + session_id=f'sesh0{_sesh}', + action_labels=['download'], + ) + + @pytest.fixture + def busy_month_item1(self, ym_busy, item1): + # item1: 10 sessions, 6*9 views, 5*7 downloads + # (plus 11 views in 11 sessions from child item2) + _month_start = ym_busy.month_start() + for _sesh in range(0, 6): + _sesh_start = _month_start + timedelta(days=_sesh) + for _minute in range(0, 9): + _save_usage( + item1, + timestamp=_sesh_start + timedelta(minutes=_minute), + session_id=f'sesh1{_sesh}', + action_labels=['view'], + ) + for _sesh in range(5, 10): + _sesh_start = _month_start + timedelta(days=_sesh) + for _minute in range(10, 17): + _save_usage( + item1, + timestamp=_sesh_start + timedelta(minutes=_minute), + session_id=f'sesh1{_sesh}', + action_labels=['download'], + ) + + @pytest.fixture + def busy_month_item2(self, ym_busy, item2): + # item2: 11 sessions, 11 views, 11 downloads (child of item1) + _month_start = ym_busy.month_start() + for _sesh in range(1, 12): + _save_usage( + item2, + timestamp=_month_start + timedelta(days=_sesh), + session_id=f'sesh2{_sesh}', + action_labels=['view'], + ) + _save_usage( + item2, + timestamp=_month_start + timedelta(days=_sesh, hours=_sesh), + session_id=f'sesh2{_sesh}', + action_labels=['download'], + ) + + def test_no_data(self, ym_empty): + _reporter = PublicItemUsageReporter(ym_empty) + _empty = list_monthly_reports(_reporter) + assert _empty == [] + + def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_month_item0, busy_month_item1, busy_month_item2, item0): + _empty = list_monthly_reports(PublicItemUsageReporter(ym_empty)) + _sparse = list_monthly_reports(PublicItemUsageReporter(ym_sparse)) + _busy = list_monthly_reports(PublicItemUsageReporter(ym_busy)) + + # empty month: + assert _empty == [] + + # sparse month: + assert len(_sparse) == 3 + _sparse_item0, _sparse_item1, _sparse_item2 = sorted(_sparse, key=attrgetter('item_osfid')) + # sparse-month item0 + assert isinstance(_sparse_item0, PublicItemUsageReport) + assert _sparse_item0.item_osfid == 'item0' + assert _sparse_item0.provider_id == [item0.provider._id] + assert _sparse_item0.platform_iri == ['http://osf.example'] + assert _sparse_item0.view_count == 3 + assert _sparse_item0.view_session_count is None # no session count for preprints + assert 
_sparse_item0.download_count == 0 + assert _sparse_item0.download_session_count is None # no session count for preprints + # sparse-month item1 + assert isinstance(_sparse_item1, PublicItemUsageReport) + assert _sparse_item1.item_osfid == 'item1' + assert _sparse_item1.provider_id == ['osf'] + assert _sparse_item1.platform_iri == ['http://osf.example'] + assert _sparse_item1.view_count == 2 # including item2 + assert _sparse_item1.view_session_count == 1 # including item2 + assert _sparse_item1.download_count == 1 # NOT including item2 + assert _sparse_item1.download_session_count == 1 # NOT including item2 + # sparse-month item2 + assert isinstance(_sparse_item2, PublicItemUsageReport) + assert _sparse_item2.item_osfid == 'item2' + assert _sparse_item2.provider_id == ['osf'] + assert _sparse_item2.platform_iri == ['http://osf.example'] + assert _sparse_item2.view_count == 1 + assert _sparse_item2.view_session_count == 1 + assert _sparse_item2.download_count == 1 + assert _sparse_item2.download_session_count == 1 + + # busy month: + assert len(_busy) == 3 + _busy_item0, _busy_item1, _busy_item2 = sorted(_busy, key=attrgetter('item_osfid')) + # busy-month item0 + assert isinstance(_busy_item0, PublicItemUsageReport) + assert _busy_item0.item_osfid == 'item0' + assert _busy_item0.provider_id == [item0.provider._id] + assert _busy_item0.platform_iri == ['http://osf.example'] + assert _busy_item0.view_count == 4 * 7 + assert _busy_item0.view_session_count is None # no session count for preprints + assert _busy_item0.download_count == 4 * 5 + assert _busy_item0.download_session_count is None # no session count for preprints + # busy-month item1 + assert isinstance(_busy_item1, PublicItemUsageReport) + assert _busy_item1.item_osfid == 'item1' + assert _busy_item1.provider_id == ['osf'] + assert _busy_item1.platform_iri == ['http://osf.example'] + assert _busy_item1.view_count == 6 * 9 + 11 + assert _busy_item1.view_session_count == 6 + 11 + assert _busy_item1.download_count == 5 * 7 + assert _busy_item1.download_session_count == 5 + # busy-month item2 + assert isinstance(_busy_item2, PublicItemUsageReport) + assert _busy_item2.item_osfid == 'item2' + assert _busy_item2.provider_id == ['osf'] + assert _busy_item2.platform_iri == ['http://osf.example'] + assert _busy_item2.view_count == 11 + assert _busy_item2.view_session_count == 11 + assert _busy_item2.download_count == 11 + assert _busy_item2.download_session_count == 11 + + +def _save_usage( + item, + *, + timestamp: datetime, + action_labels: list[str], + **kwargs, +): + _countedusage_kwargs = { + 'timestamp': timestamp, + 'item_guid': item._id, + 'action_labels': action_labels, + 'platform_iri': 'http://osf.example', + **kwargs, + } + CountedAuthUsage(**_countedusage_kwargs).save(refresh=True) + if isinstance(item, osfdb.Preprint): + if 'view' in action_labels: + _save_preprint_view(item, timestamp) + if 'download' in action_labels: + _save_preprint_download(item, timestamp) + + +def _save_preprint_view(preprint, timestamp): + PreprintView( + timestamp=timestamp, + count=1, + preprint_id=preprint._id, + provider_id=preprint.provider._id, + ).save(refresh=True) + + +def _save_preprint_download(preprint, timestamp): + PreprintDownload( + timestamp=timestamp, + count=1, + preprint_id=preprint._id, + provider_id=preprint.provider._id, + ).save(refresh=True) diff --git a/osf_tests/metrics/test_daily_report.py b/osf_tests/metrics/test_daily_report.py index 2089e7279c9..46375184f95 100644 --- a/osf_tests/metrics/test_daily_report.py +++
b/osf_tests/metrics/test_daily_report.py @@ -1,4 +1,4 @@ -from datetime import date +import datetime from unittest import mock import pytest @@ -21,7 +21,13 @@ class UniqueByDate(DailyReport): class Meta: app_label = 'osf' - today = date(2022, 5, 18) + today = datetime.date(2022, 5, 18) + expected_timestamp = datetime.datetime( + today.year, + today.month, + today.day, + tzinfo=datetime.UTC, + ) reports = [ UniqueByDate(report_date=today), @@ -35,35 +41,47 @@ class Meta: assert mock_save.call_count == 1 assert mock_save.call_args[0][0] is report assert report.meta.id == expected_key + assert report.timestamp == expected_timestamp mock_save.reset_mock() - def test_with_duf(self, mock_save): + def test_with_unique_together(self, mock_save): # multiple reports of this type per day, unique by given field class UniqueByDateAndField(DailyReport): - DAILY_UNIQUE_FIELD = 'duf' - duf = metrics.Keyword() + UNIQUE_TOGETHER_FIELDS = ('report_date', 'uniquefield',) + uniquefield = metrics.Keyword() class Meta: app_label = 'osf' - today = date(2022, 5, 18) + today = datetime.date(2022, 5, 18) + expected_timestamp = datetime.datetime( + today.year, + today.month, + today.day, + tzinfo=datetime.UTC, + ) expected_blah = 'dca57e6cde89b19274ea24bc713971dab137a896b8e06d43a11a3f437cd1d151' - blah_report = UniqueByDateAndField(report_date=today, duf='blah') + blah_report = UniqueByDateAndField(report_date=today, uniquefield='blah') blah_report.save() assert mock_save.call_count == 1 assert mock_save.call_args[0][0] is blah_report assert blah_report.meta.id == expected_blah + assert blah_report.timestamp == expected_timestamp mock_save.reset_mock() expected_fleh = 'e7dd5ff6b087807efcfa958077dc713878f21c65af79b3ccdb5dc2409bf5ad99' - fleh_report = UniqueByDateAndField(report_date=today, duf='fleh') + fleh_report = UniqueByDateAndField(report_date=today, uniquefield='fleh') fleh_report.save() assert mock_save.call_count == 1 assert mock_save.call_args[0][0] is fleh_report assert fleh_report.meta.id == expected_fleh + assert fleh_report.timestamp == expected_timestamp mock_save.reset_mock() - bad_report = UniqueByDateAndField(report_date=today) - with pytest.raises(ReportInvalid): - bad_report.save() + for _bad_report in ( + UniqueByDateAndField(report_date=today), + UniqueByDateAndField(report_date=today, uniquefield=['list', 'of', 'things']), + ): + with pytest.raises(ReportInvalid): + _bad_report.save() diff --git a/osf_tests/metrics/test_monthly_report.py b/osf_tests/metrics/test_monthly_report.py new file mode 100644 index 00000000000..3c841e6555c --- /dev/null +++ b/osf_tests/metrics/test_monthly_report.py @@ -0,0 +1,151 @@ +import datetime +from unittest import mock + +import pytest +from elasticsearch_metrics import metrics + +from osf.metrics.reports import MonthlyReport, ReportInvalid, PublicItemUsageReport +from osf.metrics.utils import YearMonth + + +class TestMonthlyReportKey: + @pytest.fixture + def mock_save(self): + with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: + yield mock_save + + def test_default(self, mock_save): + # only one of this type of report per month + class UniqueByMonth(MonthlyReport): + blah = metrics.Keyword() + + class Meta: + app_label = 'osf' + + yearmonth = YearMonth(2022, 5) + expected_timestamp = datetime.datetime(yearmonth.year, yearmonth.month, 1, tzinfo=datetime.UTC) + + reports = [ + UniqueByMonth(report_yearmonth=yearmonth), + UniqueByMonth(report_yearmonth=yearmonth, blah='blah'), + UniqueByMonth(report_yearmonth=yearmonth, blah='fleh'), 
+ ] + expected_key = '8463aac67c1e5a038049196781d8f100f069225352d1829651892cf3fbfc50e2' + + for report in reports: + report.save() + assert mock_save.call_count == 1 + assert mock_save.call_args[0][0] is report + assert report.meta.id == expected_key + assert report.timestamp == expected_timestamp + mock_save.reset_mock() + + def test_with_unique_together(self, mock_save): + # multiple reports of this type per month, unique by given field + class UniqueByMonthAndField(MonthlyReport): + UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'uniquefield',) + uniquefield = metrics.Keyword() + + class Meta: + app_label = 'osf' + + yearmonth = YearMonth(2022, 5) + expected_timestamp = datetime.datetime(yearmonth.year, yearmonth.month, 1, tzinfo=datetime.UTC) + + expected_blah = '62ebf38317cd8402e27a50ce99f836d1734b3f545adf7d144d0e1cf37a0d9d08' + blah_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='blah') + blah_report.save() + assert mock_save.call_count == 1 + assert mock_save.call_args[0][0] is blah_report + assert blah_report.meta.id == expected_blah + assert blah_report.timestamp == expected_timestamp + mock_save.reset_mock() + + expected_fleh = '385700db282f6d6089a0d21836db5ee8423f548615e515b6e034bcc90a14500f' + fleh_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='fleh') + fleh_report.save() + assert mock_save.call_count == 1 + assert mock_save.call_args[0][0] is fleh_report + assert fleh_report.meta.id == expected_fleh + assert fleh_report.timestamp == expected_timestamp + mock_save.reset_mock() + + for _bad_report in ( + UniqueByMonthAndField(report_yearmonth=yearmonth), + UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield=['list']), + ): + with pytest.raises(ReportInvalid): + _bad_report.save() + + +@pytest.mark.es_metrics +class TestLastMonthReport: + @pytest.fixture + def osfid(self): + return 'abced' + + @pytest.fixture + def this_month(self): + return YearMonth.from_date(datetime.date.today()) + + @pytest.fixture + def last_month(self, this_month): + return _prior_yearmonth(this_month) + + @pytest.fixture + def two_months_back(self, last_month): + return _prior_yearmonth(last_month) + + @pytest.fixture + def three_months_back(self, two_months_back): + return _prior_yearmonth(two_months_back) + + @pytest.fixture + def this_month_report(self, osfid, this_month): + return _item_usage_report(this_month, osfid, view_count=77) + + @pytest.fixture + def last_month_report(self, osfid, last_month): + return _item_usage_report(last_month, osfid, view_count=57) + + @pytest.fixture + def diff_last_month_report(self, last_month): + return _item_usage_report(last_month, 'zyxvt', view_count=17) + + @pytest.fixture + def two_months_back_report(self, osfid, two_months_back): + return _item_usage_report(two_months_back, osfid, view_count=27) + + @pytest.fixture + def three_months_back_report(self, osfid, three_months_back): + return _item_usage_report(three_months_back, osfid, view_count=37) + + def test_with_none(self, osfid): + assert PublicItemUsageReport.for_last_month(osfid) is None + + def test_with_others(self, osfid, this_month_report, three_months_back_report, diff_last_month_report): + assert PublicItemUsageReport.for_last_month(osfid) is None + + def test_with_prior_month(self, osfid, this_month_report, two_months_back_report, three_months_back_report, diff_last_month_report): + assert PublicItemUsageReport.for_last_month(osfid) == two_months_back_report + + def test_with_last_month(self, osfid, this_month_report, last_month_report,
two_months_back_report, three_months_back_report, diff_last_month_report): + assert PublicItemUsageReport.for_last_month(osfid) == last_month_report + + +def _prior_yearmonth(ym: YearMonth) -> YearMonth: + return ( + YearMonth(ym.year - 1, 12) + if ym.month == 1 + else YearMonth(ym.year, ym.month - 1) + ) + + +def _item_usage_report(ym: YearMonth, osfid: str, **kwargs): + _report = PublicItemUsageReport( + report_yearmonth=ym, + item_osfid=osfid, + **kwargs + ) + _report.save(refresh=True) + return _report diff --git a/osf_tests/metrics/test_spam_count_reporter.py b/osf_tests/metrics/test_spam_count_reporter.py new file mode 100644 index 00000000000..0e7ba6956bf --- /dev/null +++ b/osf_tests/metrics/test_spam_count_reporter.py @@ -0,0 +1,38 @@ +import pytest +from datetime import datetime +from osf.metrics.reporters.private_spam_metrics import PrivateSpamMetricsReporter +from osf.metrics.utils import YearMonth +from osf_tests.factories import NodeLogFactory, NodeFactory +from unittest.mock import patch + +@pytest.mark.django_db +def test_private_spam_metrics_reporter(): + start_date = datetime(2024, 10, 1) + + oopspam_node = NodeFactory(spam_data={'who_flagged': 'oopspam'}) + akismet_node = NodeFactory(spam_data={'who_flagged': 'akismet'}) + + NodeLogFactory.create_batch(10, action='flag_spam', created=start_date, node=oopspam_node) + NodeLogFactory.create_batch(5, action='confirm_ham', created=start_date, node=oopspam_node) + NodeLogFactory.create_batch(20, action='flag_spam', created=start_date, node=akismet_node) + NodeLogFactory.create_batch(10, action='confirm_ham', created=start_date, node=akismet_node) + + report_yearmonth = YearMonth(2024, 10) + + with patch('osf.external.oopspam.client.OOPSpamClient.get_flagged_count') as mock_oopspam_get_flagged_count, \ + patch('osf.external.oopspam.client.OOPSpamClient.get_hammed_count') as mock_oopspam_get_hammed_count, \ + patch('osf.external.askismet.client.AkismetClient.get_flagged_count') as mock_akismet_get_flagged_count, \ + patch('osf.external.askismet.client.AkismetClient.get_hammed_count') as mock_akismet_get_hammed_count: + + mock_oopspam_get_flagged_count.return_value = 10 + mock_oopspam_get_hammed_count.return_value = 5 + mock_akismet_get_flagged_count.return_value = 20 + mock_akismet_get_hammed_count.return_value = 10 + + reporter = PrivateSpamMetricsReporter(report_yearmonth) + report = reporter.report() + + assert report.node_oopspam_flagged == 10, f"Expected 10, got {report.node_oopspam_flagged}" + assert report.node_oopspam_hammed == 5, f"Expected 5, got {report.node_oopspam_hammed}" + assert report.node_akismet_flagged == 20, f"Expected 20, got {report.node_akismet_flagged}" + assert report.node_akismet_hammed == 10, f"Expected 10, got {report.node_akismet_hammed}" diff --git a/osf_tests/metrics/test_yearmonth.txt b/osf_tests/metrics/test_yearmonth.txt new file mode 100644 index 00000000000..e078b709b6a --- /dev/null +++ b/osf_tests/metrics/test_yearmonth.txt @@ -0,0 +1,73 @@ +YearMonth tests +(doctest-style, in a way pytest will run; see https://docs.pytest.org/en/stable/how-to/doctest.html ) +>>> from osf.metrics.utils import YearMonth + +basic dataclass behavior: +>>> YearMonth(2000, 2) +YearMonth(year=2000, month=2) +>>> YearMonth(1999, 9) +YearMonth(year=1999, month=9) +>>> ym = YearMonth(2050, 2) +>>> ym.year +2050 +>>> ym.month +2 + +`from_date` constructor, accepts either `datetime.date` or `datetime.datetime`: +>>> import datetime +>>> YearMonth.from_date(datetime.date(1973, 1, 1)) +YearMonth(year=1973, month=1) +>>> 
YearMonth.from_date(datetime.datetime(1974, 3, 2)) +YearMonth(year=1974, month=3) + +`from_str` constructor, accepts "YYYY-MM" format: +>>> YearMonth.from_str('2000-12') +YearMonth(year=2000, month=12) + +`from_any` constructor, accepts YearMonth, "YYYY-MM", or date/datetime +>>> YearMonth.from_any('2000-12') +YearMonth(year=2000, month=12) +>>> YearMonth.from_any(_) is _ +True +>>> YearMonth.from_any(datetime.date(1973, 1, 1)) +YearMonth(year=1973, month=1) +>>> YearMonth.from_any(datetime.datetime(1974, 3, 2)) +YearMonth(year=1974, month=3) +>>> YearMonth.from_any(None) +Traceback (most recent call last): + ... +ValueError: cannot coerce None into YearMonth +>>> YearMonth.from_any(7) +Traceback (most recent call last): + ... +ValueError: cannot coerce 7 into YearMonth + +`__str__` method gives "YYYY-MM" format: +>>> str(YearMonth(1491, 7)) +'1491-07' + +`next` method gives the next year-month: +>>> ym = YearMonth(1491, 11) +>>> ym.next() +YearMonth(year=1491, month=12) +>>> ym.next().next() +YearMonth(year=1492, month=1) + +`prior` method gives the prior year-month: +>>> ym = YearMonth(1492, 2) +>>> ym.prior() +YearMonth(year=1492, month=1) +>>> ym.prior().prior() +YearMonth(year=1491, month=12) + +`month_start` method: +>>> YearMonth(3333, 3).month_start() +datetime.datetime(3333, 3, 1, 0, 0, tzinfo=datetime.timezone.utc) +>>> YearMonth(1999, 12).month_start().isoformat() +'1999-12-01T00:00:00+00:00' + +`month_end` method: +>>> YearMonth(3333, 3).month_end() +datetime.datetime(3333, 4, 1, 0, 0, tzinfo=datetime.timezone.utc) +>>> YearMonth(1999, 12).month_end().isoformat() +'2000-01-01T00:00:00+00:00' diff --git a/osf_tests/test_management_commands.py b/osf_tests/test_management_commands.py index 8f29e72bc93..26e34601648 100644 --- a/osf_tests/test_management_commands.py +++ b/osf_tests/test_management_commands.py @@ -265,7 +265,7 @@ def test_data_storage_usage_command(self): assert (key, expected_summary_data[key]) == (key, actual_summary_data[key]) -@pytest.mark.es +@pytest.mark.es_metrics @pytest.mark.django_db class TestInstitutionMetricsUpdate: diff --git a/package.json b/package.json index be5c3b44a30..fba6f9fe0b7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "OSF", - "version": "24.07.0", + "version": "24.11.0", "description": "Facilitating Open Science", "repository": "https://github.com/CenterForOpenScience/osf.io", "author": "Center for Open Science", diff --git a/tests/test_campaigns.py b/tests/test_campaigns.py index 587aaaa82d8..1df6a32169a 100644 --- a/tests/test_campaigns.py +++ b/tests/test_campaigns.py @@ -46,6 +46,7 @@ def setUp(self): 'osf-registries', 'osf-registered-reports', 'agu_conference_2023', + 'agu_conference', ] self.refresh = timezone.now() campaigns.CAMPAIGNS = None # force campaign refresh now that preprint providers are populated diff --git a/tests/test_views.py b/tests/test_views.py index f1dbaa3285d..d78e7760c17 100644 --- a/tests/test_views.py +++ b/tests/test_views.py @@ -3438,8 +3438,8 @@ def test_register_after_being_invited_as_unreg_contributor(self, mock_update_sea assert new_user.check_password(password) assert new_user.fullname == real_name - @mock.patch('framework.auth.views.send_confirm_email_async') - def test_register_sends_user_registered_signal(self, mock_send_confirm_email_async): + @mock.patch('framework.auth.views.send_confirm_email') + def test_register_sends_user_registered_signal(self, mock_send_confirm_email): url = api_url_for('register_user') name, email, password = fake.name(), fake_email(), 
'underpressure' with capture_signals() as mock_signals: @@ -3453,7 +3453,7 @@ def test_register_sends_user_registered_signal(self, mock_send_confirm_email_asy } ) assert mock_signals.signals_sent() == {auth.signals.user_registered, auth.signals.unconfirmed_user_created} - assert mock_send_confirm_email_async.called + assert mock_send_confirm_email.called @mock.patch('framework.auth.views.mails.send_mail') def test_resend_confirmation(self, send_mail: MagicMock): diff --git a/website/mails/mails.py b/website/mails/mails.py index da66ad8d083..afca9e78f03 100644 --- a/website/mails/mails.py +++ b/website/mails/mails.py @@ -191,6 +191,10 @@ def get_english_article(word): 'confirm_agu_conference_2023', subject='OSF Account Verification, from the American Geophysical Union Conference' ) +CONFIRM_EMAIL_AGU_CONFERENCE = Mail( + 'confirm_agu_conference', + subject='OSF Account Verification, from the American Geophysical Union Conference' +) CONFIRM_EMAIL_PREPRINTS = lambda name, provider: Mail( f'confirm_preprints_{name}', subject=f'OSF Account Verification, {provider}' diff --git a/website/notifications/emails.py b/website/notifications/emails.py index 245baf9f0af..d26d43351d5 100644 --- a/website/notifications/emails.py +++ b/website/notifications/emails.py @@ -176,7 +176,7 @@ def get_user_subscriptions(user, event): if user_subscription: return {key: list(getattr(user_subscription, key).all().values_list('guids___id', flat=True)) for key in constants.NOTIFICATION_TYPES} else: - return {key: [] for key in constants.NOTIFICATION_TYPES} + return {key: [user._id] if (event in constants.USER_SUBSCRIPTIONS_AVAILABLE and key == 'email_transactional') else [] for key in constants.NOTIFICATION_TYPES} def get_node_lineage(node): diff --git a/website/policies/views.py b/website/policies/views.py new file mode 100644 index 00000000000..c13ad197dae --- /dev/null +++ b/website/policies/views.py @@ -0,0 +1,19 @@ +import markdown + +from website.settings import \ + PRIVACY_POLICY_PATH, PRIVACY_POLICY_GITHUB_LINK, \ + TERMS_POLICY_PATH, TERMS_POLICY_GITHUB_LINK + +def privacy_policy(): + with open(PRIVACY_POLICY_PATH, 'r') as policy_file: + return { + 'policy_content': markdown.markdown(policy_file.read(), extensions=['toc']), + 'POLICY_GITHUB_LINK': PRIVACY_POLICY_GITHUB_LINK + } + +def terms_policy(): + with open(TERMS_POLICY_PATH, 'r') as policy_file: + return { + 'policy_content': markdown.markdown(policy_file.read(), extensions=['toc']), + 'POLICY_GITHUB_LINK': TERMS_POLICY_GITHUB_LINK + } diff --git a/website/routes.py b/website/routes.py index 61d6c96c9aa..ce328c3dcd7 100644 --- a/website/routes.py +++ b/website/routes.py @@ -53,6 +53,7 @@ from addons.base import views as addon_views from website.discovery import views as discovery_views from website.conferences import views as conference_views +from website.policies import views as policy_views from website.preprints import views as preprint_views from website.registries import views as registries_views from website.reviews import views as reviews_views @@ -176,8 +177,11 @@ def get_globals(): def is_private_link_anonymous_view(): # Avoid circular import from osf.models import PrivateLink + view_only = request.args.get('view_only') + if not view_only: + return False try: - return PrivateLink.objects.filter(key=request.args.get('view_only')).values_list('anonymous', flat=True).get() + return PrivateLink.objects.filter(key=view_only).values_list('anonymous', flat=True).get() except PrivateLink.DoesNotExist: return False @@ -1142,6 +1146,18 @@ def 
make_url_map(app): Rule('/goodbye/', 'get', goodbye, notemplate), + Rule( + '/privacy_policy/', + 'get', + policy_views.privacy_policy, + OsfWebRenderer('policies/generic_policy.mako', trust=True) + ), + Rule( + '/terms_of_use/', + 'get', + policy_views.terms_policy, + OsfWebRenderer('policies/generic_policy.mako', trust=True) + ), Rule( [ '/project//', diff --git a/website/settings/defaults.py b/website/settings/defaults.py index 07ca9d89c64..1bb36e6ce3e 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -26,6 +26,9 @@ def parent_dir(path): STATIC_FOLDER = os.path.join(BASE_PATH, 'static') STATIC_URL_PATH = '/static' ASSET_HASH_PATH = os.path.join(APP_PATH, 'webpack-assets.json') +POLICY_PATH = os.path.join(APP_PATH, 'COS_POLICIES') +PRIVACY_POLICY_PATH = os.path.join(POLICY_PATH, 'PRIVACY_POLICY.md') +TERMS_POLICY_PATH = os.path.join(POLICY_PATH, 'TERMS_OF_USE.md') ROOT = os.path.join(BASE_PATH, '..') BCRYPT_LOG_ROUNDS = 12 LOG_LEVEL = logging.INFO @@ -446,6 +449,7 @@ class CeleryConfig: 'osf.management.commands.daily_reporters_go', 'osf.management.commands.monthly_reporters_go', 'osf.management.commands.ingest_cedar_metadata_templates', + 'osf.metrics.reporters', } med_pri_modules = { @@ -2047,10 +2051,12 @@ class CeleryConfig: OSF_REGISTRIES_LOGO = 'osf_registries' OSF_LOGO_LIST = [OSF_LOGO, OSF_PREPRINTS_LOGO, OSF_MEETINGS_LOGO, OSF_PREREG_LOGO, OSF_REGISTRIES_LOGO] +PRIVACY_POLICY_GITHUB_LINK = 'https://github.com/CenterForOpenScience/centerforopenscience.org/blob/master/PRIVACY_POLICY.md' +TERMS_POLICY_GITHUB_LINK = 'https://github.com/CenterForOpenScience/centerforopenscience.org/blob/master/TERMS_OF_USE.md' FOOTER_LINKS = { - 'terms': 'https://github.com/CenterForOpenScience/centerforopenscience.org/blob/master/TERMS_OF_USE.md', - 'privacyPolicy': 'https://github.com/CenterForOpenScience/centerforopenscience.org/blob/master/PRIVACY_POLICY.md', - 'cookies': 'https://github.com/CenterForOpenScience/centerforopenscience.org/blob/master/PRIVACY_POLICY.md#f-cookies', + 'terms': 'https://osf.io/terms_of_use/', + 'privacyPolicy': 'https://osf.io/privacy_policy/', + 'cookies': 'https://osf.io/privacy_policy/#f-cookies', 'cos': 'https://cos.io', 'statusPage': 'https://status.cos.io/', 'apiDocs': 'https://developer.osf.io/', diff --git a/website/templates/emails/confirm_agu_conference.html.mako b/website/templates/emails/confirm_agu_conference.html.mako new file mode 100644 index 00000000000..603e2c39e8d --- /dev/null +++ b/website/templates/emails/confirm_agu_conference.html.mako @@ -0,0 +1,26 @@ +<%inherit file="notify_base.mako" /> + +<%def name="content()"> + + + Hello ${user.fullname},
+
+ + Thank you for joining us at the AGU Open Science Pavilion, and welcome to the Open Science Framework (OSF). + We are pleased to offer AGU attendees an exclusive 1:1 consultation to continue our conversation and to help + you get oriented on the OSF. This is an opportunity for us to show you useful OSF features, talk about + open science in Earth and space sciences, and for you to ask any questions you may have. + You can sign up to participate by completing this form, and a member of our team will be in touch to + determine your availability: +
+ https://docs.google.com/forms/d/e/1FAIpQLSeJ23YPaEMdbLY1OqbcP85Tt6rhLpFoOtH0Yg4vY_wSKULRcw/viewform?usp=sf_link +

+ To confirm your OSF account, please verify your email address by visiting this link:
+
+ ${confirmation_url}
+
+ From the team at the Center for Open Science
+ + + diff --git a/website/templates/emails/tou_notif.html.mako b/website/templates/emails/tou_notif.html.mako index 1da8c0cbc07..56130626668 100644 --- a/website/templates/emails/tou_notif.html.mako +++ b/website/templates/emails/tou_notif.html.mako @@ -3,12 +3,12 @@ <%def name="content()"> - Hi ${fullname},
+ Hi ${given_name},

- On August 10, 2020 the COS Websites and Services
Terms of Use will change. The updates to the Terms are necessary to support continued use of the Websites and Services by the public.
- To better understand what has changed, go here.
+ On Friday, January 10, 2025 the COS Websites and Services Terms of Use and Privacy Policy will change. The updates to the Terms are necessary to support continued use of the Websites and Services by the public.
+ To better understand what has changed, see the Terms of Use change summary and Privacy Policy change summary.

- If you have any questions email support@osf.io.
+ You do not need to take any action to acknowledge these updates. If you have any questions, please email support@osf.io.

Regards,

diff --git a/website/templates/policies/generic_policy.mako b/website/templates/policies/generic_policy.mako new file mode 100644 index 00000000000..6ae8581d350 --- /dev/null +++ b/website/templates/policies/generic_policy.mako @@ -0,0 +1,16 @@ +<%inherit file="base.mako"/> + +<%def name="content()"> +
+
+
+
+ ${policy_content} +
+
+
+ Version history for this policy is available here +
+
+
+ diff --git a/website/util/metrics.py b/website/util/metrics.py index 7324a410138..c76adb89f5a 100644 --- a/website/util/metrics.py +++ b/website/util/metrics.py @@ -57,6 +57,7 @@ class CampaignSourceTags(Enum): OsfRegisteredReports = campaign_source_tag('osf_registered_reports') Osf4m = campaign_source_tag('osf4m') AguConference2023 = campaign_source_tag('agu_conference_2023') + AguConference = campaign_source_tag('agu_conference') class OsfClaimedTags(Enum):