From 71f678716471a351eafd738acf0aa62630e65f00 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Fri, 22 Jul 2022 10:34:10 +1000 Subject: [PATCH 01/25] [QOL-9122] general cleanup - standardise whitespace - sort imports alphabetically - refactor code to extract helper functions - add more doc comments - put binary operators at start of line instead of end - use parameterised logging --- ckanext/validation/cli.py | 36 +++++++- ckanext/validation/commands.py | 3 +- ckanext/validation/common.py | 69 ++++++++-------- ckanext/validation/controller.py | 2 +- ckanext/validation/helpers.py | 8 -- ckanext/validation/logic.py | 27 +++--- ckanext/validation/model.py | 10 +++ ckanext/validation/plugin/__init__.py | 52 ++++++------ .../templates/validation/validation_read.html | 2 +- ckanext/validation/tests/helpers.py | 6 +- ckanext/validation/tests/test_form.py | 67 +++++++-------- ckanext/validation/tests/test_helpers.py | 8 +- ckanext/validation/tests/test_interfaces.py | 4 +- ckanext/validation/tests/test_jobs.py | 20 ++--- ckanext/validation/tests/test_logic.py | 27 +++--- ckanext/validation/tests/test_plugin.py | 82 +++++-------------- ckanext/validation/tests/test_utils.py | 4 +- ckanext/validation/tests/test_validators.py | 6 +- ckanext/validation/utils.py | 2 +- ckanext/validation/views.py | 5 +- setup.py | 5 +- 21 files changed, 219 insertions(+), 226 deletions(-) diff --git a/ckanext/validation/cli.py b/ckanext/validation/cli.py index a87b36ba..290dcbf9 100644 --- a/ckanext/validation/cli.py +++ b/ckanext/validation/cli.py @@ -1,6 +1,8 @@ +# encoding: utf-8 + import click -import ckanext.validation.common as common +from ckanext.validation import common def get_commands(): @@ -16,6 +18,8 @@ def validation(): @validation.command(name='init-db') def init_db(): + """ Initialize database tables. + """ common.init_db() @@ -41,6 +45,13 @@ def init_db(): u'Note that when using this you will have to specify the resource formats to target yourself.' u' Not to be used with -r or -d.') def run_validation(yes, resource, dataset, search): + '''Start asynchronous data validation on the site resources. If no + options are provided it will run validation on all resources of + the supported formats (`ckanext.validation.formats`). You can + specify particular datasets to run the validation on their + resources. You can also pass arbitrary search parameters to filter + the selected datasets. + ''' common.run_validation(yes, resource, dataset, search) @@ -49,6 +60,17 @@ def run_validation(yes, resource, dataset, search): help=u'Location of the CSV validation report file on the relevant commands.', default=u'validation_errors_report.csv') def report(output): + '''Generate a report with all current data validation reports. This + will print an overview of the total number of tabular resources + and a breakdown of how many have a validation status of success, + failure or error. Additionally it will create a CSV report with all + failing resources, including the following fields: + * Dataset name + * Resource id + * Resource URL + * Status + * Validation report URL + ''' common.report(output) @@ -57,4 +79,16 @@ def report(output): help=u'Location of the CSV validation report file on the relevant commands.', default=u'validation_errors_report.csv') def report_full(output): + '''Generate a detailed report. This is similar to 'report' + but on the CSV report it will add a row for each error found on the + validation report (limited to ten occurrences of the same error + type per file). So the fields in the generated CSV report will be: + + * Dataset name + * Resource id + * Resource URL + * Status + * Error code + * Error message + ''' common.report(output, full=True) diff --git a/ckanext/validation/commands.py b/ckanext/validation/commands.py index e1372198..04505bb9 100644 --- a/ckanext/validation/commands.py +++ b/ckanext/validation/commands.py @@ -1,9 +1,10 @@ # encoding: utf-8 + import sys from ckantoolkit import CkanCommand -import ckanext.validation.common as common +from ckanext.validation import common class Validation(CkanCommand): diff --git a/ckanext/validation/common.py b/ckanext/validation/common.py index 2da90925..4f7bfa53 100644 --- a/ckanext/validation/common.py +++ b/ckanext/validation/common.py @@ -1,16 +1,18 @@ -from __future__ import print_function +# encoding: utf-8 -import sys -import logging import csv +import logging +import six +import sys from ckantoolkit import (c, NotAuthorized, ObjectNotFound, abort, _, render, get_action, config, check_ckan_version) -from ckanext.validation.model import create_tables, tables_exist + from ckanext.validation import settings from ckanext.validation.logic import _search_datasets +from ckanext.validation.model import create_tables, tables_exist log = logging.getLogger(__name__) @@ -43,19 +45,29 @@ def validation(resource_id): u'validation': validation, u'resource': resource, u'pkg_dict': dataset, + u'dataset': dataset, }) except NotAuthorized: - abort(403, _(u'Unauthorized to read this validation report')) + return abort(403, _(u'Unauthorized to read this validation report')) except ObjectNotFound: - - abort(404, _(u'No validation report exists for this resource')) + return abort(404, _(u'No validation report exists for this resource')) ############################################################################### # CLI # ############################################################################### + +def user_confirm(msg): + if check_ckan_version(min_version='2.9'): + import click + return click.confirm(msg) + else: + from ckan.lib.cli import query_yes_no + return query_yes_no(msg) == 'yes' + + def error(msg): ''' Print an error message to STDOUT and exit with return code 1. @@ -74,10 +86,10 @@ def init_db(): print(u'Validation tables created') -def run_validation(assume_yes, resource_id, dataset_id, search_params): +def run_validation(assume_yes, resource_ids, dataset_ids, search_params): - if resource_id: - for resource_id in resource_id: + if resource_ids: + for resource_id in resource_ids: resource = get_action('resource_show')({}, {'id': resource_id}) _run_validation_on_resource( resource['id'], resource['package_id']) @@ -89,23 +101,15 @@ def run_validation(assume_yes, resource_id, dataset_id, search_params): error('No suitable datasets, exiting...') elif not assume_yes: - - msg = ('\nYou are about to start validation for {0} datasets' + + msg = ('\nYou are about to start validation for {0} datasets' '.\n Do you want to continue?') - if check_ckan_version(min_version='2.9.0'): - import click - if not click.confirm(msg.format(query['count'])): - error('Command aborted by user') - else: - from ckan.lib.cli import query_yes_no - confirm = query_yes_no(msg.format(query['count'])) - if confirm == 'no': - error('Command aborted by user') + if not user_confirm(msg.format(query['count'])): + error('Command aborted by user') result = get_action('resource_validation_run_batch')( {'ignore_auth': True}, - {'dataset_ids': dataset_id, + {'dataset_ids': dataset_ids, 'query': search_params} ) print(result['output']) @@ -118,11 +122,8 @@ def _run_validation_on_resource(resource_id, dataset_id): {u'resource_id': resource_id, u'async': True}) - msg = ('Resource {} from dataset {} sent to ' + - 'the validation queue') - - log.debug( - msg.format(resource_id, dataset_id)) + log.debug('Resource %s from dataset %s sent to the validation queue', + resource_id, dataset_id) def _process_row(dataset, resource, writer): @@ -248,7 +249,7 @@ def report(output_csv, full=False): row_counts = _process_row_full(dataset, resource, writer) if not row_counts: continue - for code, count in row_counts.items(): + for code, count in six.iteritems(row_counts): if code not in error_counts: error_counts[code] = count else: @@ -277,24 +278,24 @@ def report(output_csv, full=False): outputs['output_csv'] = output_csv outputs['formats_success_output'] = '' - for count, code in sorted([(v, k) for k, v in outputs['formats_success'].items()], reverse=True): + for count, code in sorted([(v, k) for k, v in six.iteritems(outputs['formats_success'])], reverse=True): outputs['formats_success_output'] += '* {}: {}\n'.format(code, count) outputs['formats_failure_output'] = '' - for count, code in sorted([(v, k) for k, v in outputs['formats_failure'].items()], reverse=True): + for count, code in sorted([(v, k) for k, v in six.iteritems(outputs['formats_failure'])], reverse=True): outputs['formats_failure_output'] += '* {}: {}\n'.format(code, count) error_counts_output = '' if full: - for count, code in sorted([(v, k) for k, v in error_counts.items()], reverse=True): + for count, code in sorted([(v, k) for k, v in six.iteritems(error_counts)], reverse=True): error_counts_output += '* {}: {}\n'.format(code, count) outputs['error_counts_output'] = error_counts_output msg_errors = ''' - Errors breakdown: - {} - '''.format(outputs['error_counts_output']) + Errors breakdown: + {} + '''.format(outputs['error_counts_output']) outputs['msg_errors'] = msg_errors if full else '' diff --git a/ckanext/validation/controller.py b/ckanext/validation/controller.py index f4dd5c60..b4396a21 100644 --- a/ckanext/validation/controller.py +++ b/ckanext/validation/controller.py @@ -2,7 +2,7 @@ from ckantoolkit import BaseController -import ckanext.validation.common as common +from ckanext.validation import common class ValidationController(BaseController): diff --git a/ckanext/validation/helpers.py b/ckanext/validation/helpers.py index 2b591f19..c6381034 100644 --- a/ckanext/validation/helpers.py +++ b/ckanext/validation/helpers.py @@ -22,14 +22,6 @@ def get_validation_badge(resource, in_listing=False): 'unknown': _('unknown'), } - messages = { - 'success': _('Valid data'), - 'failure': _('Invalid data'), - 'invalid': _('Invalid data'), - 'error': _('Error during validation'), - 'unknown': _('Data validation unknown'), - } - if resource['validation_status'] in ['success', 'failure', 'error']: status = resource['validation_status'] if status == 'failure': diff --git a/ckanext/validation/logic.py b/ckanext/validation/logic.py index 57fd230b..f278d81d 100644 --- a/ckanext/validation/logic.py +++ b/ckanext/validation/logic.py @@ -83,11 +83,12 @@ def resource_validation_run(context, data_dict): t.check_access(u'resource_validation_run', context, data_dict) - if not data_dict.get(u'resource_id'): + resource_id = data_dict.get(u'resource_id') + if not resource_id: raise t.ValidationError({u'resource_id': u'Missing value'}) resource = t.get_action(u'resource_show')( - {}, {u'id': data_dict[u'resource_id']}) + {}, {u'id': resource_id}) # TODO: limit to sysadmins async_job = data_dict.get(u'async', True) @@ -95,7 +96,7 @@ def resource_validation_run(context, data_dict): # Ensure format is supported if not resource.get(u'format', u'').lower() in settings.SUPPORTED_FORMATS: raise t.ValidationError( - {u'format': u'Unsupported resource format.' + + {u'format': u'Unsupported resource format.' u'Must be one of {}'.format( u','.join(settings.SUPPORTED_FORMATS))}) @@ -110,7 +111,7 @@ def resource_validation_run(context, data_dict): try: validation = Session.query(Validation).filter( - Validation.resource_id == data_dict['resource_id']).one() + Validation.resource_id == resource_id).one() except NoResultFound: validation = None @@ -310,9 +311,8 @@ def resource_validation_run_batch(context, data_dict): except t.ValidationError as e: log.warning( - u'Could not run validation for resource {} '.format(resource['id']) + - u'from dataset {}: {}'.format( - dataset['name'], str(e))) + u'Could not run validation for resource %s from dataset %s: %s', + resource['id'], dataset['name'], e) if len(query['results']) < page_size: break @@ -388,8 +388,8 @@ def _update_search_params(search_data_dict, user_search_params=None): else: search_data_dict['fq'] = user_search_params['fq'] - if (user_search_params.get('fq_list') and - isinstance(user_search_params['fq_list'], list)): + if (user_search_params.get('fq_list') + and isinstance(user_search_params['fq_list'], list)): search_data_dict['fq_list'].extend(user_search_params['fq_list']) @@ -565,8 +565,8 @@ def resource_update(context, data_dict): raise t.ObjectNotFound(t._('Resource was not found.')) # Persist the datastore_active extra if already present and not provided - if ('datastore_active' in resource.extras and - 'datastore_active' not in data_dict): + if ('datastore_active' in resource.extras + and 'datastore_active' not in data_dict): data_dict['datastore_active'] = resource.extras['datastore_active'] for plugin in plugins.PluginImplementations(plugins.IResourceController): @@ -642,9 +642,8 @@ def _run_sync_validation(resource_id, local_upload=False, new_resource=True): {u'resource_id': resource_id, u'async': False}) except t.ValidationError as e: - log.info( - u'Could not run validation for resource {}: {}'.format( - resource_id, str(e))) + log.info(u'Could not run validation for resource %s: %s', + resource_id, e) return validation = t.get_action(u'resource_validation_show')( diff --git a/ckanext/validation/model.py b/ckanext/validation/model.py index 80435402..9e82b7f7 100644 --- a/ckanext/validation/model.py +++ b/ckanext/validation/model.py @@ -26,10 +26,20 @@ class Validation(Base): id = Column(Unicode, primary_key=True, default=make_uuid) resource_id = Column(Unicode) + # status can be one of these values: + # created: Job created and put onto queue + # running: Job picked up by worker and being processed + # success: Validation Successful and report attached + # failure: Validation Failed and report attached + # error: Validation Job could not create validation report status = Column(Unicode, default=u'created') + # created is when job was added created = Column(DateTime, default=datetime.datetime.utcnow) + # finished is when report was generated, is None when new or restarted finished = Column(DateTime) + # json object of report, can be None report = Column(JSON) + # json object of error, can be None error = Column(JSON) diff --git a/ckanext/validation/plugin/__init__.py b/ckanext/validation/plugin/__init__.py index de615c79..43ecce53 100644 --- a/ckanext/validation/plugin/__init__.py +++ b/ckanext/validation/plugin/__init__.py @@ -1,8 +1,8 @@ # encoding: utf-8 -import os -import logging -import json +import json +import logging +import os from six import string_types import ckan.plugins as p @@ -72,11 +72,15 @@ def i18n_directory(self): def update_config(self, config_): if not tables_exist(): + if t.check_ckan_version('2.9'): + init_command = 'ckan validation init-db' + else: + init_command = 'paster --plugin=ckanext-validation validation init-db' log.critical(u''' -The validation extension requires a database setup. Please run the following -to create the database tables: - paster --plugin=ckanext-validation validation init-db -''') +The validation extension requires a database setup. +Validation pages will not be enabled. +Please run the following to create the database tables: + %s''', init_command) else: log.debug(u'Validation tables exist') @@ -145,8 +149,8 @@ def _process_schema_fields(self, data_dict): and schema_upload.filename: data_dict[u'schema'] = _get_underlying_file(schema_upload).read() elif schema_url: - if (not isinstance(schema_url, string_types) or - not schema_url.lower()[:4] == u'http'): + if (not isinstance(schema_url, string_types) + or not schema_url.lower()[:4] == u'http'): raise t.ValidationError({u'schema_url': 'Must be a valid URL'}) data_dict[u'schema'] = schema_url elif schema_json: @@ -186,9 +190,9 @@ def _handle_validation_for_resource(self, context, resource): needs_validation = False if (( # File uploaded - resource.get(u'url_type') == u'upload' or + resource.get(u'url_type') == u'upload' # URL defined - resource.get(u'url') + or resource.get(u'url') ) and ( # Make sure format is supported resource.get(u'format', u'').lower() in @@ -200,7 +204,7 @@ def _handle_validation_for_resource(self, context, resource): for plugin in p.PluginImplementations(IDataValidation): if not plugin.can_validate(context, resource): - log.debug('Skipping validation for resource {}'.format(resource['id'])) + log.debug('Skipping validation for resource %s', resource['id']) return _run_async_validation(resource[u'id']) @@ -213,22 +217,21 @@ def before_update(self, context, current_resource, updated_resource): return updated_resource needs_validation = False - if (( + if ( # New file uploaded - updated_resource.get(u'upload') or + updated_resource.get(u'upload') # External URL changed - updated_resource.get(u'url') != current_resource.get(u'url') or + or updated_resource.get(u'url') != current_resource.get(u'url') # Schema changed - (updated_resource.get(u'schema') != - current_resource.get(u'schema')) or + or (updated_resource.get(u'schema') + != current_resource.get(u'schema')) # Format changed - (updated_resource.get(u'format', u'').lower() != - current_resource.get(u'format', u'').lower()) + or (updated_resource.get(u'format', u'').lower() + != current_resource.get(u'format', u'').lower()) ) and ( # Make sure format is supported updated_resource.get(u'format', u'').lower() in - settings.SUPPORTED_FORMATS - )): + settings.SUPPORTED_FORMATS): needs_validation = True if needs_validation: @@ -271,7 +274,7 @@ def after_update(self, context, data_dict): if resource_id in self.resources_to_validate: for plugin in p.PluginImplementations(IDataValidation): if not plugin.can_validate(context, data_dict): - log.debug('Skipping validation for resource {}'.format(data_dict['id'])) + log.debug('Skipping validation for resource %s', data_dict['id']) return del self.resources_to_validate[resource_id] @@ -310,6 +313,5 @@ def _run_async_validation(resource_id): {u'resource_id': resource_id, u'async': True}) except t.ValidationError as e: - log.warning( - u'Could not run validation for resource {}: {}'.format( - resource_id, str(e))) + log.warning(u'Could not run validation for resource %s: %s', + resource_id, e) diff --git a/ckanext/validation/templates/validation/validation_read.html b/ckanext/validation/templates/validation/validation_read.html index dcb0387b..c557c960 100644 --- a/ckanext/validation/templates/validation/validation_read.html +++ b/ckanext/validation/templates/validation/validation_read.html @@ -45,7 +45,7 @@

{{ h.resource_display_name(resource) | truncate(50) }} - {% include 'validation/snippets/validation_report_' ~ type ~ '.html' %} + {% include 'validation/snippets/validation_report_' ~ type ~ '.html' %} {% endblock %} diff --git a/ckanext/validation/tests/helpers.py b/ckanext/validation/tests/helpers.py index b6d40e73..817702c8 100644 --- a/ckanext/validation/tests/helpers.py +++ b/ckanext/validation/tests/helpers.py @@ -1,10 +1,12 @@ +# encoding: utf-8 + +import functools +import mock import six if six.PY3: import builtins else: import __builtin__ as builtins -import functools -import mock from werkzeug.datastructures import FileStorage as FlaskFileStorage from pyfakefs import fake_filesystem diff --git a/ckanext/validation/tests/test_form.py b/ckanext/validation/tests/test_form.py index 1b32af4c..f3af3694 100644 --- a/ckanext/validation/tests/test_form.py +++ b/ckanext/validation/tests/test_form.py @@ -1,5 +1,7 @@ -import json +# encoding: utf-8 + import io +import json import mock import pytest @@ -117,10 +119,10 @@ def test_resource_form_create(self, app): json_value = json.dumps(value) params = { - 'package_id': dataset['id'], - 'url': 'https://example.com/data.csv', - 'schema': json_value, - } + 'package_id': dataset['id'], + 'url': 'https://example.com/data.csv', + 'schema': json_value, + } _post(app, NEW_RESOURCE_URL.format(dataset['id']), params) @@ -140,10 +142,10 @@ def test_resource_form_create_json(self, app): json_value = json.dumps(value) params = { - 'package_id': dataset['id'], - 'url': 'https://example.com/data.csv', - 'schema_json': json_value, - } + 'package_id': dataset['id'], + 'url': 'https://example.com/data.csv', + 'schema_json': json_value, + } _post(app, NEW_RESOURCE_URL.format(dataset['id']), params) @@ -164,8 +166,8 @@ def test_resource_form_create_upload(self, mock_open): upload = ('schema_upload', 'schema.json', json_value) params = { - 'url': 'https://example.com/data.csv', - } + 'url': 'https://example.com/data.csv', + } _post(self.app, NEW_RESOURCE_URL.format(dataset['id']), params, upload=[upload]) @@ -180,10 +182,10 @@ def test_resource_form_create_url(self, app): value = 'https://example.com/schemas.json' params = { - 'package_id': dataset['id'], - 'url': 'https://example.com/data.csv', - 'schema_json': value, - } + 'package_id': dataset['id'], + 'url': 'https://example.com/data.csv', + 'schema_json': value, + } _post(app, NEW_RESOURCE_URL.format(dataset['id']), params) @@ -205,8 +207,10 @@ def test_resource_form_update(self, app): }] ) + resource_id = dataset['resources'][0]['id'] + response = _get_resource_update_page_as_sysadmin( - app, dataset['id'], dataset['resources'][0]['id']) + app, dataset['id'], resource_id) form = _get_form(response) assert form.find(attrs={'name': "schema"})['value'] == \ @@ -222,8 +226,6 @@ def test_resource_form_update(self, app): json_value = json.dumps(value) - resource_id = dataset['resources'][0]['id'] - params = { 'url': 'https://example.com/data.csv', 'schema': json_value @@ -250,8 +252,10 @@ def test_resource_form_update_json(self, app): }] ) + resource_id = dataset['resources'][0]['id'] + response = _get_resource_update_page_as_sysadmin( - app, dataset['id'], dataset['resources'][0]['id']) + app, dataset['id'], resource_id) form = _get_form(response) assert form.find(attrs={'name': "schema_json"}).text == \ @@ -267,8 +271,6 @@ def test_resource_form_update_json(self, app): json_value = json.dumps(value) - resource_id = dataset['resources'][0]['id'] - params = { 'url': 'https://example.com/data.csv', 'schema_json': json_value @@ -294,9 +296,10 @@ def test_resource_form_update_url(self, app): 'schema': value }] ) + resource_id = dataset['resources'][0]['id'] response = _get_resource_update_page_as_sysadmin( - app, dataset['id'], dataset['resources'][0]['id']) + app, dataset['id'], resource_id) form = _get_form(response) assert form.find(attrs={'name': "schema_json"}).text ==\ @@ -304,8 +307,6 @@ def test_resource_form_update_url(self, app): value = 'https://example.com/schema.json' - resource_id = dataset['resources'][0]['id'] - params = { 'url': 'https://example.com/data.csv', 'schema_url': value @@ -331,9 +332,10 @@ def test_resource_form_update_upload(self, app): 'schema': value }] ) + resource_id = dataset['resources'][0]['id'] response = _get_resource_update_page_as_sysadmin( - app, dataset['id'], dataset['resources'][0]['id']) + app, dataset['id'], resource_id) form = _get_form(response) assert form.find(attrs={'name': "schema_json"}).text == \ @@ -348,7 +350,6 @@ def test_resource_form_update_upload(self, app): } json_value = ensure_binary(json.dumps(value), encoding='utf-8') - resource_id = dataset['resources'][0]['id'] upload = ('schema_upload', 'schema.json', json_value) params = { @@ -384,9 +385,9 @@ def test_resource_form_create(self, app): } json_value = json.dumps(value) params = { - 'url': 'https://example.com/data.csv', - 'validation_options': json_value, - } + 'url': 'https://example.com/data.csv', + 'validation_options': json_value, + } _post(app, NEW_RESOURCE_URL.format(dataset['id']), params) @@ -407,9 +408,10 @@ def test_resource_form_update(self, app): 'validation_options': value }] ) + resource_id = dataset['resources'][0]['id'] response = _get_resource_update_page_as_sysadmin( - app, dataset['id'], dataset['resources'][0]['id']) + app, dataset['id'], resource_id) form = _get_form(response) assert form.find("textarea", @@ -429,7 +431,6 @@ def test_resource_form_update(self, app): 'url': 'https://example.com/data.csv', 'validation_options': json_value } - resource_id = dataset['resources'][0]['id'] _post(app, EDIT_RESOURCE_URL.format(dataset['id'], resource_id), params, resource_id=resource_id) @@ -538,12 +539,12 @@ def test_resource_form_update_valid(self, mock_open): # 'url': 'https://example.com/data.csv' # } # ]) + # resource_id = dataset['resources'][0]['id'] # response = _get_resource_update_page_as_sysadmin( - # app, dataset['id'], dataset['resources'][0]['id']) + # app, dataset['id'], resource_id) # upload = ('upload', 'invalid.csv', INVALID_CSV) - # resource_id = dataset['resources'][0]['id'] # params = {} # invalid_stream = io.BufferedReader(io.BytesIO(INVALID_CSV)) diff --git a/ckanext/validation/tests/test_helpers.py b/ckanext/validation/tests/test_helpers.py index 0ca7d52b..b92acae3 100644 --- a/ckanext/validation/tests/test_helpers.py +++ b/ckanext/validation/tests/test_helpers.py @@ -79,8 +79,8 @@ def test_report_extracted(self): } errors = { - 'some_field': ['Some error'], - 'validation': [report], + 'some_field': ['Some error'], + 'validation': [report], } extracted_report, errors = validation_extract_report_from_errors( @@ -96,8 +96,8 @@ def test_report_extracted(self): def test_report_not_extracted(self): errors = { - 'some_field': ['Some error'], - 'some_other_field': ['Some other error'] + 'some_field': ['Some error'], + 'some_other_field': ['Some other error'] } report, errors = validation_extract_report_from_errors(errors) diff --git a/ckanext/validation/tests/test_interfaces.py b/ckanext/validation/tests/test_interfaces.py index 1de919a5..63c58634 100644 --- a/ckanext/validation/tests/test_interfaces.py +++ b/ckanext/validation/tests/test_interfaces.py @@ -1,3 +1,5 @@ +# encoding: utf-8 + import mock import pytest @@ -35,7 +37,6 @@ class BaseTestInterfaces(helpers.FunctionalTestBase): @classmethod def setup_class(cls): - super(BaseTestInterfaces, cls).setup_class() if not p.plugin_loaded('test_validation_plugin'): @@ -43,7 +44,6 @@ def setup_class(cls): @classmethod def teardown_class(cls): - super(BaseTestInterfaces, cls).teardown_class() if p.plugin_loaded('test_validation_plugin'): diff --git a/ckanext/validation/tests/test_jobs.py b/ckanext/validation/tests/test_jobs.py index f98642a6..2d09a3fd 100644 --- a/ckanext/validation/tests/test_jobs.py +++ b/ckanext/validation/tests/test_jobs.py @@ -1,7 +1,8 @@ -import mock -import json -import io +# encoding: utf-8 +import io +import json +import mock import pytest from six import BytesIO, ensure_binary @@ -14,8 +15,8 @@ from ckanext.validation.jobs import ( run_validation_job, uploader, Session, requests) from ckanext.validation.tests.helpers import ( - VALID_REPORT, INVALID_REPORT, ERROR_REPORT, VALID_REPORT_LOCAL_FILE, - mock_uploads, MockFieldStorage + VALID_REPORT, INVALID_REPORT, ERROR_REPORT, VALID_REPORT_LOCAL_FILE, + mock_uploads, MockFieldStorage ) @@ -59,10 +60,8 @@ def test_job_run_no_schema(self, mock_requests, mock_get_action, @mock.patch.object(Session, 'commit') @mock.patch.object(ckantoolkit, 'get_action') @mock.patch.object(requests, 'Session', return_value='Some_Session') - def test_job_run_schema( - self, mock_requests, mock_get_action, - mock_commit, mock_validate, dataset): - + def test_job_run_schema(self, mock_requests, mock_get_action, + mock_commit, mock_validate, dataset): schema = { 'fields': [ {'name': 'id', 'type': 'integer'}, @@ -94,7 +93,6 @@ def test_job_run_schema( def test_job_run_uploaded_file(self, mock_requests, mock_get_action, mock_commit, mock_uploader, mock_validate, dataset): - resource = { 'id': 'test', 'url': '__upload', @@ -168,7 +166,7 @@ def test_job_run_uploaded_file_replaces_paths( self, mock_uploader, mock_validate): resource = factories.Resource( - url='__upload', url_type='upload', format='csv') + url='__upload', url_type='upload', format='csv') run_validation_job(resource) diff --git a/ckanext/validation/tests/test_logic.py b/ckanext/validation/tests/test_logic.py index 334d7e5d..63306fcf 100644 --- a/ckanext/validation/tests/test_logic.py +++ b/ckanext/validation/tests/test_logic.py @@ -1,3 +1,5 @@ +# encoding: utf-8 + import datetime import io import json @@ -13,10 +15,11 @@ from ckan.tests import factories import ckantoolkit as t + from ckanext.validation.model import Validation from ckanext.validation.tests.helpers import ( - VALID_CSV, INVALID_CSV, VALID_REPORT, - mock_uploads, MockFieldStorage + VALID_CSV, INVALID_CSV, VALID_REPORT, + mock_uploads, MockFieldStorage ) @@ -165,7 +168,6 @@ def test_resource_validation_show_validation_does_not_exists(self): @change_config('ckanext.validation.run_on_create_async', False) def test_resource_validation_show_returns_all_fields(self): - resource = {'format': 'CSV', 'url': 'https://some.url'} dataset = factories.Dataset(resources=[resource]) @@ -214,7 +216,6 @@ def test_resource_validation_delete_not_exists(self): @change_config('ckanext.validation.run_on_create_async', False) @change_config('ckanext.validation.run_on_update_async', False) def test_resource_validation_delete_removes_object(self): - resource = factories.Resource(format='csv') timestamp = datetime.datetime.utcnow() validation = Validation( @@ -375,7 +376,6 @@ def test_show_anon_public_dataset(self): resource_id=dataset['resources'][0]['id']) def test_show_anon_private_dataset(self): - user = factories.User() org = factories.Organization() dataset = factories.Dataset( @@ -415,7 +415,6 @@ def test_validation_fails_on_upload(self, mock_open): invalid_stream = io.BufferedReader(io.BytesIO(INVALID_CSV)) with mock.patch('io.open', return_value=invalid_stream): - with pytest.raises(t.ValidationError) as e: call_action( 'resource_create', @@ -618,7 +617,6 @@ def test_validation_passes_with_url(self, mock_validate): class TestSchemaFields(object): def test_schema_field(self): - dataset = factories.Dataset() resource = call_action( @@ -633,7 +631,6 @@ def test_schema_field(self): assert 'schema_url' not in resource def test_schema_field_url(self): - url = 'https://example.com/schema.json' dataset = factories.Dataset() @@ -650,7 +647,6 @@ def test_schema_field_url(self): assert 'schema_url' not in resource def test_schema_url_field(self): - url = 'https://example.com/schema.json' dataset = factories.Dataset() @@ -670,11 +666,12 @@ def test_schema_url_field_wrong_url(self): url = 'not-a-url' - pytest.raises( - t.ValidationError, call_action, 'resource_create', - url='http://example.com/file.csv', - schema_url=url - ) + with pytest.raises(t.ValidationError): + call_action( + 'resource_create', + url='http://example.com/file.csv', + schema_url=url + ) @mock_uploads def test_schema_upload_field(self, mock_open): @@ -701,7 +698,6 @@ def test_schema_upload_field(self, mock_open): class TestValidationOptionsField(object): def test_validation_options_field(self): - dataset = factories.Dataset() validation_options = { @@ -720,7 +716,6 @@ def test_validation_options_field(self): assert resource['validation_options'] == validation_options def test_validation_options_field_string(self): - dataset = factories.Dataset() validation_options = '''{ diff --git a/ckanext/validation/tests/test_plugin.py b/ckanext/validation/tests/test_plugin.py index 4cd1cee4..6c218bf7 100644 --- a/ckanext/validation/tests/test_plugin.py +++ b/ckanext/validation/tests/test_plugin.py @@ -1,3 +1,5 @@ +# encoding: utf-8 + import mock import pytest @@ -9,6 +11,13 @@ from ckanext.validation.jobs import run_validation_job +def _assert_validation_enqueued(mock_enqueue, resource_id): + assert mock_enqueue.call_count == 1 + + assert mock_enqueue.call_args[0][0] == run_validation_job + assert mock_enqueue.call_args[0][1][0]['id'] == resource_id + + @pytest.mark.usefixtures("clean_db", "validation_setup") class TestResourceControllerHooksUpdate(object): @@ -52,11 +61,7 @@ def test_validation_run_on_upload(self, mock_enqueue): call_action('resource_update', {}, **dataset['resources'][0]) - assert mock_enqueue.call_count == 1 - - assert mock_enqueue.call_args[0][0] == run_validation_job - assert mock_enqueue.call_args[0][1][0]['id'] ==\ - dataset['resources'][0]['id'] + _assert_validation_enqueued(mock_enqueue, dataset['resources'][0]['id']) @change_config('ckanext.validation.run_on_create_async', False) @mock.patch('ckanext.validation.logic.enqueue_job') @@ -70,11 +75,7 @@ def test_validation_run_on_url_change(self, mock_enqueue): call_action('resource_update', {}, **dataset['resources'][0]) - assert mock_enqueue.call_count == 1 - - assert mock_enqueue.call_args[0][0] == run_validation_job - assert mock_enqueue.call_args[0][1][0]['id'] ==\ - dataset['resources'][0]['id'] + _assert_validation_enqueued(mock_enqueue, dataset['resources'][0]['id']) @change_config('ckanext.validation.run_on_create_async', False) @mock.patch('ckanext.validation.logic.enqueue_job') @@ -101,32 +102,23 @@ def test_validation_run_on_schema_change(self, mock_enqueue): call_action('resource_update', {}, **dataset['resources'][0]) - assert mock_enqueue.call_count == 1 - - assert mock_enqueue.call_args[0][0] == run_validation_job - assert mock_enqueue.call_args[0][1][0]['id'] ==\ - dataset['resources'][0]['id'] + _assert_validation_enqueued(mock_enqueue, dataset['resources'][0]['id']) @change_config('ckanext.validation.run_on_create_async', False) @mock.patch('ckanext.validation.logic.enqueue_job') def test_validation_run_on_format_change(self, mock_enqueue): - resource = factories.Resource() resource['format'] = 'CSV' call_action('resource_update', {}, **resource) - assert mock_enqueue.call_count == 1 - - assert mock_enqueue.call_args[0][0] == run_validation_job - assert mock_enqueue.call_args[0][1][0]['id'] == resource['id'] + _assert_validation_enqueued(mock_enqueue, resource['id']) @change_config('ckanext.validation.run_on_create_async', False) @change_config('ckanext.validation.run_on_update_async', False) @mock.patch('ckanext.validation.logic.enqueue_job') def test_validation_does_not_run_when_config_false(self, mock_enqueue): - resource = factories.Resource(format='CSV') resource['url'] = 'http://some.new.url' @@ -141,7 +133,6 @@ class TestResourceControllerHooksCreate(object): @mock.patch('ckanext.validation.logic.enqueue_job') def test_validation_does_not_run_on_other_formats(self, mock_enqueue): - factories.Resource(format='PDF') mock_enqueue.assert_not_called() @@ -149,30 +140,21 @@ def test_validation_does_not_run_on_other_formats(self, mock_enqueue): @mock.patch('ckanext.validation.logic.enqueue_job') @change_config('ckanext.validation.run_on_update_async', False) def test_validation_run_with_upload(self, mock_enqueue): - resource = factories.Resource(format='CSV', url_type='upload') - assert mock_enqueue.call_count == 1 - - assert mock_enqueue.call_args[0][0] == run_validation_job - assert mock_enqueue.call_args[0][1][0]['id'] == resource['id'] + _assert_validation_enqueued(mock_enqueue, resource['id']) @mock.patch('ckanext.validation.logic.enqueue_job') @change_config('ckanext.validation.run_on_update_async', False) def test_validation_run_with_url(self, mock_enqueue): - resource = factories.Resource(format='CSV', url='http://some.data') - assert mock_enqueue.call_count == 1 - - assert mock_enqueue.call_args[0][0] == run_validation_job - assert mock_enqueue.call_args[0][1][0]['id'] == resource['id'] + _assert_validation_enqueued(mock_enqueue, resource['id']) @change_config('ckanext.validation.run_on_create_async', False) @change_config('ckanext.validation.run_on_update_async', False) @mock.patch('ckanext.validation.logic.enqueue_job') def test_validation_does_not_run_when_config_false(self, mock_enqueue): - dataset = factories.Dataset() resource = { @@ -191,7 +173,6 @@ class TestPackageControllerHooksCreate(object): @mock.patch('ckanext.validation.logic.enqueue_job') def test_validation_does_not_run_on_other_formats(self, mock_enqueue): - factories.Dataset(resources=[{'format': 'PDF'}]) mock_enqueue.assert_not_called() @@ -199,7 +180,6 @@ def test_validation_does_not_run_on_other_formats(self, mock_enqueue): @change_config('ckanext.validation.run_on_create_async', False) @mock.patch('ckanext.validation.logic.enqueue_job') def test_validation_does_not_run_when_config_false(self, mock_enqueue): - factories.Dataset(resources=[ {'format': 'CSV', 'url': 'http://some.data'}]) @@ -207,7 +187,6 @@ def test_validation_does_not_run_when_config_false(self, mock_enqueue): @mock.patch('ckanext.validation.logic.enqueue_job') def test_validation_run_with_upload(self, mock_enqueue): - resource = { 'id': 'test-resource-id', 'format': 'CSV', @@ -215,14 +194,10 @@ def test_validation_run_with_upload(self, mock_enqueue): } factories.Dataset(resources=[resource]) - assert mock_enqueue.call_count == 1 - - assert mock_enqueue.call_args[0][0] == run_validation_job - assert mock_enqueue.call_args[0][1][0]['id'] == resource['id'] + _assert_validation_enqueued(mock_enqueue, resource['id']) @mock.patch('ckanext.validation.logic.enqueue_job') def test_validation_run_with_url(self, mock_enqueue): - resource = { 'id': 'test-resource-id', 'format': 'CSV', @@ -230,10 +205,7 @@ def test_validation_run_with_url(self, mock_enqueue): } factories.Dataset(resources=[resource]) - assert mock_enqueue.call_count == 1 - - assert mock_enqueue.call_args[0][0] == run_validation_job - assert mock_enqueue.call_args[0][1][0]['id'] == resource['id'] + _assert_validation_enqueued(mock_enqueue, resource['id']) @mock.patch('ckanext.validation.logic.enqueue_job') def test_validation_run_only_supported_formats(self, mock_enqueue): @@ -251,10 +223,7 @@ def test_validation_run_only_supported_formats(self, mock_enqueue): factories.Dataset(resources=[resource1, resource2]) - assert mock_enqueue.call_count == 1 - - assert mock_enqueue.call_args[0][0] == run_validation_job - assert mock_enqueue.call_args[0][1][0]['id'] == resource1['id'] + _assert_validation_enqueued(mock_enqueue, resource1['id']) @pytest.mark.usefixtures("clean_db", "validation_setup") @@ -277,10 +246,7 @@ def test_validation_runs_with_url(self, mock_enqueue): call_action('package_update', {}, **dataset) - assert mock_enqueue.call_count == 1 - - assert mock_enqueue.call_args[0][0] == run_validation_job - assert mock_enqueue.call_args[0][1][0]['id'] == resource['id'] + _assert_validation_enqueued(mock_enqueue, resource['id']) @change_config('ckanext.validation.run_on_create_async', False) @mock.patch('ckanext.validation.logic.enqueue_job') @@ -299,10 +265,7 @@ def test_validation_runs_with_upload(self, mock_enqueue): call_action('package_update', {}, **dataset) - assert mock_enqueue.call_count == 1 - - assert mock_enqueue.call_args[0][0] == run_validation_job - assert mock_enqueue.call_args[0][1][0]['id'] == resource['id'] + _assert_validation_enqueued(mock_enqueue, resource['id']) @change_config('ckanext.validation.run_on_create_async', False) @mock.patch('ckanext.validation.logic.enqueue_job') @@ -346,10 +309,7 @@ def test_validation_run_only_supported_formats(self, mock_enqueue): call_action('package_update', {}, **dataset) - assert mock_enqueue.call_count == 1 - - assert mock_enqueue.call_args[0][0] == run_validation_job - assert mock_enqueue.call_args[0][1][0]['id'] == resource1['id'] + _assert_validation_enqueued(mock_enqueue, resource1['id']) @change_config('ckanext.validation.run_on_create_async', False) @change_config('ckanext.validation.run_on_update_async', False) diff --git a/ckanext/validation/tests/test_utils.py b/ckanext/validation/tests/test_utils.py index 34f0b283..1d055504 100644 --- a/ckanext/validation/tests/test_utils.py +++ b/ckanext/validation/tests/test_utils.py @@ -76,8 +76,8 @@ def test_local_path(self, mock_open): resource_id = str(uuid.uuid4()) assert get_local_upload_path(resource_id) ==\ - '/doesnt_exist/resources/{}/{}/{}'.format( - resource_id[0:3], resource_id[3:6], resource_id[6:]) + '/doesnt_exist/resources/{}/{}/{}'.format( + resource_id[0:3], resource_id[3:6], resource_id[6:]) @mock_uploads def test_delete_upload_file(self, mock_open): diff --git a/ckanext/validation/tests/test_validators.py b/ckanext/validation/tests/test_validators.py index 3cd2155f..bbee4c42 100644 --- a/ckanext/validation/tests/test_validators.py +++ b/ckanext/validation/tests/test_validators.py @@ -51,7 +51,7 @@ def test_resource_schema_invalid_schema_object(self): resource_schema_validator(schema, {}) assert e.value.error.startswith( - 'Invalid Table Schema: ' + + 'Invalid Table Schema: ' 'Descriptor validation error: \'fields\' is a required property') def test_resource_schema_valid_schema_object(self): @@ -105,7 +105,7 @@ def test_default_validation_options(self): value = '{"headers": 3}' assert validation_options_validator(value, {}) ==\ - '{"delimiter": ";", "headers": 3}' + '{"delimiter": ";", "headers": 3}' @change_config('ckanext.validation.default_validation_options', '{"delimiter":";", "headers":2}') @@ -114,4 +114,4 @@ def test_default_validation_optionsi_does_not_override(self): value = '{"headers": 3}' assert validation_options_validator(value, {}) ==\ - '{"delimiter": ";", "headers": 3}' + '{"delimiter": ";", "headers": 3}' diff --git a/ckanext/validation/utils.py b/ckanext/validation/utils.py index d49cdf2c..977f31b6 100644 --- a/ckanext/validation/utils.py +++ b/ckanext/validation/utils.py @@ -70,4 +70,4 @@ def delete_local_uploaded_file(resource_id): os.rmdir(second_directory) except OSError as e: - log.warning(u'Error deleting uploaded file: {}'.format(e)) + log.warning(u'Error deleting uploaded file: %s', e) diff --git a/ckanext/validation/views.py b/ckanext/validation/views.py index 42bb823c..607e7d72 100644 --- a/ckanext/validation/views.py +++ b/ckanext/validation/views.py @@ -1,10 +1,9 @@ -import logging +# encoding: utf-8 from flask import Blueprint -import ckanext.validation.common as common +from ckanext.validation import common -log = logging.getLogger(__name__) validation = Blueprint(u'validation', __name__) diff --git a/setup.py b/setup.py index 7a729246..e9bb51e8 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ # 3 - Alpha # 4 - Beta # 5 - Production/Stable - 'Development Status :: 4 - Beta', + 'Development Status :: 5 - Production/Stable', # Pick your license as you wish (should match "license" above) 'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)', @@ -60,8 +60,7 @@ # installed, specify them here. If using Python 2.6 or less, then these # have to be included in MANIFEST.in as well. include_package_data=True, - package_data={ - }, + package_data={}, # Although 'package_data' is the preferred approach, in some case you may # need to place data files outside of your packages. From a8ebeb4ed24fa80827bbea7296de9fa997519e5f Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Fri, 22 Jul 2022 10:47:11 +1000 Subject: [PATCH 02/25] [QOL-9122] replace six.BytesIO and six.StringIO with io.BytesIO - this is more consistent, when we're already making our strings binary --- ckanext/validation/tests/test_form.py | 13 +++++-------- ckanext/validation/tests/test_helpers.py | 5 ----- ckanext/validation/tests/test_jobs.py | 15 +++++---------- ckanext/validation/tests/test_logic.py | 21 +++++++-------------- 4 files changed, 17 insertions(+), 37 deletions(-) diff --git a/ckanext/validation/tests/test_form.py b/ckanext/validation/tests/test_form.py index f3af3694..f39bb8f2 100644 --- a/ckanext/validation/tests/test_form.py +++ b/ckanext/validation/tests/test_form.py @@ -6,7 +6,7 @@ import pytest from bs4 import BeautifulSoup -from six import BytesIO, ensure_binary +from six import ensure_binary from ckantoolkit import check_ckan_version from ckan.tests import helpers @@ -65,15 +65,14 @@ def _get_form(response): def _post(app, url, data, resource_id='', upload=None): args = [] env = _get_extra_env_as_sysadmin() - if upload and check_ckan_version('2.9'): - for entry in upload: - data[entry[0]] = (BytesIO(entry[2]), entry[1]) - # from the form data['id'] = resource_id data['save'] = '' if check_ckan_version('2.9'): + if upload: + for entry in upload: + data[entry[0]] = (io.BytesIO(entry[2]), entry[1]) kwargs = { 'url': url, 'data': data, @@ -106,7 +105,6 @@ def test_resource_form_includes_json_fields(self, app): assert form.find("textarea", attrs={'name': 'schema_json'}) assert form.find("input", attrs={'name': 'schema_url'}) - @pytest.mark.usefixtures("clean_db", "validation_setup") def test_resource_form_create(self, app): dataset = Dataset() @@ -572,10 +570,9 @@ def test_resource_form_update_valid(self, mock_open): # @mock_uploads # def test_resource_form_fields_are_persisted(self, mock_open, app): - # upload = MockFieldStorage(io.BytesIO(VALID_CSV), 'valid.csv') -# valid_stream = io.BufferedReader(BytesIO(VALID_CSV)) +# valid_stream = io.BufferedReader(io.BytesIO(VALID_CSV)) # dataset = Dataset() # with mock.patch('io.open', return_value=valid_stream): diff --git a/ckanext/validation/tests/test_helpers.py b/ckanext/validation/tests/test_helpers.py index b92acae3..9d9ca9d1 100644 --- a/ckanext/validation/tests/test_helpers.py +++ b/ckanext/validation/tests/test_helpers.py @@ -22,7 +22,6 @@ def _assert_validation_badge_status(resource, status): class TestBadges(object): def test_get_validation_badge_no_validation(self): - resource = factories.Resource( format='CSV', ) @@ -30,7 +29,6 @@ def test_get_validation_badge_no_validation(self): assert get_validation_badge(resource) == '' def test_get_validation_badge_success(self): - resource = factories.Resource( format='CSV', validation_status='success', @@ -40,7 +38,6 @@ def test_get_validation_badge_success(self): _assert_validation_badge_status(resource, 'success') def test_get_validation_badge_failure(self): - resource = factories.Resource( format='CSV', validation_status='failure', @@ -50,7 +47,6 @@ def test_get_validation_badge_failure(self): _assert_validation_badge_status(resource, 'invalid') def test_get_validation_badge_error(self): - resource = factories.Resource( format='CSV', validation_status='error', @@ -60,7 +56,6 @@ def test_get_validation_badge_error(self): _assert_validation_badge_status(resource, 'error') def test_get_validation_badge_other(self): - resource = factories.Resource( format='CSV', validation_status='not-sure', diff --git a/ckanext/validation/tests/test_jobs.py b/ckanext/validation/tests/test_jobs.py index 2d09a3fd..767ccabd 100644 --- a/ckanext/validation/tests/test_jobs.py +++ b/ckanext/validation/tests/test_jobs.py @@ -4,7 +4,7 @@ import json import mock import pytest -from six import BytesIO, ensure_binary +from six import ensure_binary import ckantoolkit from ckan.lib.uploader import ResourceUpload @@ -40,14 +40,13 @@ class TestValidationJob(object): @mock.patch.object(requests, 'Session', return_value='Some_Session') def test_job_run_no_schema(self, mock_requests, mock_get_action, mock_commit, mock_validate, dataset): - resource = { 'id': 'test', 'url': 'http://example.com/file.csv', 'format': 'csv', 'package_id': dataset['id'], } - print("HERE-2") + run_validation_job(resource) mock_validate.assert_called_with( @@ -198,7 +197,7 @@ def test_job_local_paths_are_hidden(self, mock_open): invalid_csv = ensure_binary('id,type\n' + '1,a,\n' * 1010, encoding='utf-8') - invalid_file = BytesIO() + invalid_file = io.BytesIO() invalid_file.write(invalid_csv) @@ -237,9 +236,7 @@ def test_job_pass_validation_options(self, mock_open): 'skip_rows': ['#'] } - invalid_file = BytesIO() - - invalid_file.write(invalid_csv) + invalid_file = io.BytesIO(invalid_csv) mock_upload = MockFieldStorage(invalid_file, 'invalid.csv') @@ -274,9 +271,7 @@ def test_job_pass_validation_options_string(self, mock_open): "skip_rows": ["#"] }''' - invalid_file = BytesIO() - - invalid_file.write(invalid_csv) + invalid_file = io.BytesIO(invalid_csv) mock_upload = MockFieldStorage(invalid_file, 'invalid.csv') diff --git a/ckanext/validation/tests/test_logic.py b/ckanext/validation/tests/test_logic.py index 63306fcf..f2c41b55 100644 --- a/ckanext/validation/tests/test_logic.py +++ b/ckanext/validation/tests/test_logic.py @@ -6,7 +6,6 @@ import mock import pytest -from six import StringIO, BytesIO from ckan import model from ckan.tests.helpers import ( @@ -405,8 +404,7 @@ def setup(self): @mock_uploads def test_validation_fails_on_upload(self, mock_open): - invalid_file = BytesIO() - invalid_file.write(INVALID_CSV) + invalid_file = io.BytesIO(INVALID_CSV) mock_upload = MockFieldStorage(invalid_file, 'invalid.csv') @@ -429,8 +427,7 @@ def test_validation_fails_on_upload(self, mock_open): @mock_uploads def test_validation_fails_no_validation_object_stored(self, mock_open): - invalid_file = BytesIO() - invalid_file.write(INVALID_CSV) + invalid_file = io.BytesIO(INVALID_CSV) mock_upload = MockFieldStorage(invalid_file, 'invalid.csv') @@ -455,8 +452,7 @@ def test_validation_fails_no_validation_object_stored(self, mock_open): @mock_uploads def test_validation_passes_on_upload(self, mock_open): - invalid_file = BytesIO() - invalid_file.write(VALID_CSV) + invalid_file = io.BytesIO(VALID_CSV) mock_upload = MockFieldStorage(invalid_file, 'invalid.csv') @@ -513,8 +509,7 @@ def test_validation_fails_on_upload(self, mock_open): } ]) - invalid_file = BytesIO() - invalid_file.write(INVALID_CSV) + invalid_file = io.BytesIO(INVALID_CSV) mock_upload = MockFieldStorage(invalid_file, 'invalid.csv') @@ -543,8 +538,7 @@ def test_validation_fails_no_validation_object_stored(self, mock_open): } ]) - invalid_file = BytesIO() - invalid_file.write(INVALID_CSV) + invalid_file = io.BytesIO(INVALID_CSV) mock_upload = MockFieldStorage(invalid_file, 'invalid.csv') @@ -573,8 +567,7 @@ def test_validation_passes_on_upload(self, mock_open): } ]) - valid_file = BytesIO() - valid_file.write(INVALID_CSV) + valid_file = io.BytesIO(INVALID_CSV) mock_upload = MockFieldStorage(valid_file, 'valid.csv') @@ -676,7 +669,7 @@ def test_schema_url_field_wrong_url(self): @mock_uploads def test_schema_upload_field(self, mock_open): - schema_file = StringIO('{"fields":[{"name":"category"}]}') + schema_file = io.BytesIO(b'{"fields":[{"name":"category"}]}') mock_upload = MockFieldStorage(schema_file, 'schema.json') From e2883d841fdabe7cf4c11415a9eac4a69e5aef79 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Fri, 22 Jul 2022 11:08:35 +1000 Subject: [PATCH 03/25] [QOL-9122] introduce 'ckan_cli' to handle both Paster and Click --- .github/workflows/ci.yml | 16 ++++----- bin/ckan_cli | 75 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 10 deletions(-) create mode 100644 bin/ckan_cli diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 30b551b2..36a5fc6a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,16 +52,12 @@ jobs: pip install -e . # Replace default path to CKAN core config file with the one on the container sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini - - name: Setup extension (CKAN >= 2.9) - if: ${{ matrix.ckan-version != '2.7' && matrix.ckan-version != '2.8' }} + - name: Setup extension run: | - ckan -c test.ini db init - ckan -c test.ini validation init-db - - name: Setup extension (CKAN < 2.9) - if: ${{ matrix.ckan-version == '2.7' || matrix.ckan-version == '2.8' }} - run: | - paster --plugin=ckan db init -c test.ini - paster --plugin=ckanext-validation validation init-db -c test.ini + chmod u+x bin/ckan_cli + export CKAN_INI=test.ini + bin/ckan_cli db init + PASTER_PLUGIN=ckanext-validation bin/ckan_cli validation init-db - name: Run tests run: pytest --ckan-ini=test.ini --cov=ckanext.validation --cov-report=xml --cov-append --disable-warnings ckanext/validation/tests - name: Coveralls @@ -77,4 +73,4 @@ jobs: - name: Coveralls Finished uses: AndreMiras/coveralls-python-action@develop with: - parallel-finished: true \ No newline at end of file + parallel-finished: true diff --git a/bin/ckan_cli b/bin/ckan_cli new file mode 100644 index 00000000..7757dc8b --- /dev/null +++ b/bin/ckan_cli @@ -0,0 +1,75 @@ +#!/bin/sh + +# Call either 'ckan' (from CKAN >= 2.9) or 'paster' (from CKAN <= 2.8) +# with appropriate syntax, depending on what is present on the system. +# This is intended to smooth the upgrade process from 2.8 to 2.9. +# Eg: +# ckan_cli jobs list +# could become either: +# paster --plugin=ckan jobs list -c /etc/ckan/default/production.ini +# or: +# ckan -c /etc/ckan/default/production.ini jobs list + +# This script is aware of the VIRTUAL_ENV environment variable, and will +# attempt to respect it with similar behaviour to commands like 'pip'. +# Eg placing this script in a virtualenv 'bin' directory will cause it +# to call the 'ckan' or 'paster' command in that directory, while +# placing this script elsewhere will cause it to rely on the VIRTUAL_ENV +# variable, or if that is not set, the system PATH. + +# Since the positioning of the CKAN configuration file is central to the +# differences between 'paster' and 'ckan', this script needs to be aware +# of the config file location. It will use the CKAN_INI environment +# variable if it exists, or default to /etc/ckan/default/production.ini. + +# If 'paster' is being used, the default plugin is 'ckan'. A different +# plugin can be specified by setting the PASTER_PLUGIN environment +# variable. This variable is irrelevant if using the 'ckan' command. + +CKAN_INI="${CKAN_INI:-/etc/ckan/default/production.ini}" +PASTER_PLUGIN="${PASTER_PLUGIN:-ckan}" +# First, look for a command alongside this file +ENV_DIR=$(dirname "$0") +if [ -f "$ENV_DIR/ckan" ]; then + COMMAND=ckan +elif [ -f "$ENV_DIR/paster" ]; then + COMMAND=paster +elif [ "$VIRTUAL_ENV" != "" ]; then + # If command not found alongside this file, check the virtualenv + ENV_DIR="$VIRTUAL_ENV/bin" + if [ -f "$ENV_DIR/ckan" ]; then + COMMAND=ckan + elif [ -f "$ENV_DIR/paster" ]; then + COMMAND=paster + fi +else + # if no virtualenv is active, try the system path + if (which ckan > /dev/null 2>&1); then + ENV_DIR=$(dirname $(which ckan)) + COMMAND=ckan + elif (which paster > /dev/null 2>&1); then + ENV_DIR=$(dirname $(which paster)) + COMMAND=paster + else + echo "Unable to locate 'ckan' or 'paster' command" >&2 + exit 1 + fi +fi + +if [ "$COMMAND" = "ckan" ]; then + # adjust args to match ckan expectations + COMMAND=$(echo "$1" | sed -e 's/create-test-data/seed/') + echo "Using 'ckan' command from $ENV_DIR with config ${CKAN_INI} to run $COMMAND..." >&2 + shift + exec $ENV_DIR/ckan -c ${CKAN_INI} $COMMAND "$@" $CLICK_ARGS +elif [ "$COMMAND" = "paster" ]; then + # adjust args to match paster expectations + COMMAND=$1 + echo "Using 'paster' command from $ENV_DIR with config ${CKAN_INI} to run $COMMAND..." >&2 + shift + if [ "$1" = "show" ]; then shift; fi + exec $ENV_DIR/paster --plugin=$PASTER_PLUGIN $COMMAND "$@" -c ${CKAN_INI} +else + echo "Unable to locate 'ckan' or 'paster' command in $ENV_DIR" >&2 + exit 1 +fi From 3ef8a33ffc7801b000a4909dc68bbe38a401c869 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Fri, 22 Jul 2022 11:22:23 +1000 Subject: [PATCH 04/25] [QOL-9122] pin version of ckanext-scheming - We won't necessarily be instantly compatible with all future versions, we need version control --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2aa09f19..6a4d6453 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ ckantoolkit>=0.0.3 goodtables==1.5.1 six>=1.16.0 --e git+https://github.com/ckan/ckanext-scheming.git#egg=ckanext-scheming +-e git+https://github.com/ckan/ckanext-scheming.git@release-2.1.0#egg=ckanext-scheming From 870a96fef432a4b83769bb92fae43293cf02663b Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Fri, 22 Jul 2022 11:34:16 +1000 Subject: [PATCH 05/25] [QOL-9122] add Flake8 rules and make them pass --- .flake8 | 21 +++++++++++++++++++++ ckanext/validation/logic.py | 16 ++++++++-------- ckanext/validation/plugin/__init__.py | 12 ++++++------ setup.py | 8 ++++---- 4 files changed, 39 insertions(+), 18 deletions(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..16e11641 --- /dev/null +++ b/.flake8 @@ -0,0 +1,21 @@ +[flake8] +# @see https://flake8.pycqa.org/en/latest/user/configuration.html?highlight=.flake8 + +exclude = + ckan + scripts + +# Extended output format. +format = pylint + +# Show the source of errors. +show_source = True + +max-complexity = 10 +max-line-length = 127 + +# List ignore rules one per line. +ignore = + E501 + C901 + W503 diff --git a/ckanext/validation/logic.py b/ckanext/validation/logic.py index f278d81d..8ccd1b58 100644 --- a/ckanext/validation/logic.py +++ b/ckanext/validation/logic.py @@ -267,14 +267,14 @@ def resource_validation_run_batch(context, data_dict): if isinstance(dataset_ids, string_types): try: dataset_ids = json.loads(dataset_ids) - except ValueError as e: + except ValueError: dataset_ids = [dataset_ids] search_params = data_dict.get('query') if isinstance(search_params, string_types): try: search_params = json.loads(search_params) - except ValueError as e: + except ValueError: msg = 'Error parsing search parameters' return {'output': msg} @@ -493,9 +493,9 @@ def resource_create(context, data_dict): if run_validation: is_local_upload = ( - hasattr(upload, 'filename') and - upload.filename is not None and - isinstance(upload, uploader.ResourceUpload)) + hasattr(upload, 'filename') + and upload.filename is not None + and isinstance(upload, uploader.ResourceUpload)) _run_sync_validation( resource_id, local_upload=is_local_upload, new_resource=True) @@ -609,9 +609,9 @@ def resource_update(context, data_dict): if run_validation: is_local_upload = ( - hasattr(upload, 'filename') and - upload.filename is not None and - isinstance(upload, uploader.ResourceUpload)) + hasattr(upload, 'filename') + and upload.filename is not None + and isinstance(upload, uploader.ResourceUpload)) _run_sync_validation( id, local_upload=is_local_upload, new_resource=True) diff --git a/ckanext/validation/plugin/__init__.py b/ckanext/validation/plugin/__init__.py index 43ecce53..f1ffb61f 100644 --- a/ckanext/validation/plugin/__init__.py +++ b/ckanext/validation/plugin/__init__.py @@ -188,16 +188,16 @@ def _data_dict_is_dataset(self, data_dict): def _handle_validation_for_resource(self, context, resource): needs_validation = False - if (( + if ( # File uploaded resource.get(u'url_type') == u'upload' # URL defined or resource.get(u'url') - ) and ( + ) and ( # Make sure format is supported resource.get(u'format', u'').lower() in settings.SUPPORTED_FORMATS - )): + ): needs_validation = True if needs_validation: @@ -228,9 +228,9 @@ def before_update(self, context, current_resource, updated_resource): # Format changed or (updated_resource.get(u'format', u'').lower() != current_resource.get(u'format', u'').lower()) - ) and ( - # Make sure format is supported - updated_resource.get(u'format', u'').lower() in + ) and ( + # Make sure format is supported + updated_resource.get(u'format', u'').lower() in settings.SUPPORTED_FORMATS): needs_validation = True diff --git a/setup.py b/setup.py index e9bb51e8..9a430622 100644 --- a/setup.py +++ b/setup.py @@ -50,10 +50,10 @@ namespace_packages=['ckanext'], install_requires=[ - # CKAN extensions should not list dependencies here, but in a separate - # ``requirements.txt`` file. - # - # http://docs.ckan.org/en/latest/extensions/best-practices.html#add-third-party-libraries-to-requirements-txt + # CKAN extensions should not list dependencies here, but in a separate + # ``requirements.txt`` file. + # + # http://docs.ckan.org/en/latest/extensions/best-practices.html#add-third-party-libraries-to-requirements-txt ], # If there are data files included in your packages that need to be From 0844e28e8586c680617a19a0c3aa675356889a99 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Fri, 22 Jul 2022 11:43:53 +1000 Subject: [PATCH 06/25] [QOL-9122] use Flake8 config in CI instead of manually specifying --- .flake8 | 1 + .github/workflows/ci.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index 16e11641..7353a3cf 100644 --- a/.flake8 +++ b/.flake8 @@ -10,6 +10,7 @@ format = pylint # Show the source of errors. show_source = True +statistics = True max-complexity = 10 max-line-length = 127 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 36a5fc6a..99b82e07 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,7 +11,7 @@ jobs: - name: Install requirements run: pip install flake8 pycodestyle - name: Check syntax - run: flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics --exclude ckan + run: flake8 test: needs: lint From 5cf05573534c36f0041640763d8a8e77da0b5340 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Fri, 22 Jul 2022 11:48:33 +1000 Subject: [PATCH 07/25] [QOL-9122] move plugin module to higher level - It's better to put the shared code into plugin.py than into an __init__.py file, and putting it at the higher level lets us skip a bunch of '../' paths --- ckanext/validation/{plugin/__init__.py => plugin.py} | 12 ++++++------ ckanext/validation/plugin_mixins/__init__.py | 0 .../{plugin => plugin_mixins}/flask_plugin.py | 0 .../{plugin => plugin_mixins}/pylons_plugin.py | 0 4 files changed, 6 insertions(+), 6 deletions(-) rename ckanext/validation/{plugin/__init__.py => plugin.py} (97%) create mode 100644 ckanext/validation/plugin_mixins/__init__.py rename ckanext/validation/{plugin => plugin_mixins}/flask_plugin.py (100%) rename ckanext/validation/{plugin => plugin_mixins}/pylons_plugin.py (100%) diff --git a/ckanext/validation/plugin/__init__.py b/ckanext/validation/plugin.py similarity index 97% rename from ckanext/validation/plugin/__init__.py rename to ckanext/validation/plugin.py index f1ffb61f..2a0281ec 100644 --- a/ckanext/validation/plugin/__init__.py +++ b/ckanext/validation/plugin.py @@ -45,9 +45,9 @@ class DefaultTranslation(): if t.check_ckan_version(min_version='2.9.0'): - from ckanext.validation.plugin.flask_plugin import MixinPlugin + from .plugin_mixins.flask_plugin import MixinPlugin else: - from ckanext.validation.plugin.pylons_plugin import MixinPlugin + from .plugin_mixins.pylons_plugin import MixinPlugin class ValidationPlugin(MixinPlugin, p.SingletonPlugin, DefaultTranslation): @@ -65,7 +65,7 @@ def i18n_directory(self): u'''Change the directory of the .mo translation files''' return os.path.join( os.path.dirname(__file__), - '../i18n' + 'i18n' ) # IConfigurer @@ -84,9 +84,9 @@ def update_config(self, config_): else: log.debug(u'Validation tables exist') - t.add_template_directory(config_, u'../templates') - t.add_public_directory(config_, u'../public') - t.add_resource(u'../assets', 'ckanext-validation') + t.add_template_directory(config_, u'templates') + t.add_public_directory(config_, u'public') + t.add_resource(u'assets', 'ckanext-validation') # IActions diff --git a/ckanext/validation/plugin_mixins/__init__.py b/ckanext/validation/plugin_mixins/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ckanext/validation/plugin/flask_plugin.py b/ckanext/validation/plugin_mixins/flask_plugin.py similarity index 100% rename from ckanext/validation/plugin/flask_plugin.py rename to ckanext/validation/plugin_mixins/flask_plugin.py diff --git a/ckanext/validation/plugin/pylons_plugin.py b/ckanext/validation/plugin_mixins/pylons_plugin.py similarity index 100% rename from ckanext/validation/plugin/pylons_plugin.py rename to ckanext/validation/plugin_mixins/pylons_plugin.py From 43b1fa7dfd163897fc0142365f493d0bd86bb1bc Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Fri, 22 Jul 2022 11:59:59 +1000 Subject: [PATCH 08/25] [QOL-9122] use cgi-based mock file storage for testing on CKAN < 2.9 --- ckanext/validation/tests/helpers.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/ckanext/validation/tests/helpers.py b/ckanext/validation/tests/helpers.py index 817702c8..dc6b2cd4 100644 --- a/ckanext/validation/tests/helpers.py +++ b/ckanext/validation/tests/helpers.py @@ -8,10 +8,10 @@ else: import __builtin__ as builtins -from werkzeug.datastructures import FileStorage as FlaskFileStorage from pyfakefs import fake_filesystem import ckan.lib.uploader +from ckantoolkit import check_ckan_version from ckan.tests.helpers import change_config @@ -146,10 +146,16 @@ def wrapper(*args, **kwargs): return wrapper -class MockFieldStorage(FlaskFileStorage): - content_type = None +if check_ckan_version('2.9'): + from werkzeug.datastructures import FileStorage as MockFieldStorage +else: + import cgi + + class MockFieldStorage(cgi.FieldStorage): + + def __init__(self, fp, filename): - def __init__(self, stream, filename): - self.stream = stream - self.filename = filename - self.name = u"upload" + self.file = fp + self.filename = filename + self.name = u"upload" + self.list = None From dfc5683186577ae34e335d9187f5524326c8b43c Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Fri, 22 Jul 2022 12:29:21 +1000 Subject: [PATCH 09/25] [QOL-9122] ensure URL package IDs match resource IDs - extract both the package ID and resource ID from the URL and ensure it's the right package for the resource --- ckanext/validation/common.py | 9 ++++++--- ckanext/validation/views.py | 9 +++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ckanext/validation/common.py b/ckanext/validation/common.py index 4f7bfa53..09561e7b 100644 --- a/ckanext/validation/common.py +++ b/ckanext/validation/common.py @@ -22,8 +22,7 @@ ############################################################################### -def validation(resource_id): - +def validation(resource_id, id=None): try: validation = get_action(u'resource_validation_show')( {u'user': c.user}, @@ -33,9 +32,13 @@ def validation(resource_id): {u'user': c.user}, {u'id': resource_id}) + package_id = resource[u'package_id'] + if id and id != package_id: + raise ObjectNotFound("Resource {} not found in package {}".format(resource_id, id)) + dataset = get_action(u'package_show')( {u'user': c.user}, - {u'id': resource[u'package_id']}) + {u'id': id or resource[u'package_id']}) # Needed for core resource templates c.package = c.pkg_dict = dataset diff --git a/ckanext/validation/views.py b/ckanext/validation/views.py index 607e7d72..d95560b3 100644 --- a/ckanext/validation/views.py +++ b/ckanext/validation/views.py @@ -6,12 +6,9 @@ validation = Blueprint(u'validation', __name__) - -def read(id, resource_id): - return common.validation(resource_id) - - -validation.add_url_rule(u'/dataset//resource//validation', view_func=read) +validation.add_url_rule( + u'/dataset//resource//validation', 'read', methods=('GET',), view_func=common.validation +) def get_blueprints(): From 2af8bc9e819ebe01a270787b0f2b31cbd9c54b87 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Fri, 22 Jul 2022 12:45:45 +1000 Subject: [PATCH 10/25] [QOL-9122] mock core enqueue function instead of ours - This will make it easier to adjust and test our exact enqueueing behaviour since we can examine what is actually sent to the queue --- ckanext/validation/tests/test_interfaces.py | 16 ++------ ckanext/validation/tests/test_logic.py | 11 ++---- ckanext/validation/tests/test_plugin.py | 42 ++++++++++----------- 3 files changed, 29 insertions(+), 40 deletions(-) diff --git a/ckanext/validation/tests/test_interfaces.py b/ckanext/validation/tests/test_interfaces.py index 63c58634..12bf324f 100644 --- a/ckanext/validation/tests/test_interfaces.py +++ b/ckanext/validation/tests/test_interfaces.py @@ -68,7 +68,6 @@ def _apply_config_changes(cls, cfg): @mock.patch('ckanext.validation.jobs.validate', return_value=VALID_REPORT) def test_can_validate_called_on_create_sync(self, mock_validation): - dataset = factories.Dataset() helpers.call_action( 'resource_create', @@ -84,7 +83,6 @@ def test_can_validate_called_on_create_sync(self, mock_validation): # @helpers.change_config('ckanext.validation.run_on_update_async', False) @mock.patch('ckanext.validation.jobs.validate') def test_can_validate_called_on_create_sync_no_validation(self, mock_validation): - dataset = factories.Dataset() helpers.call_action( 'resource_create', @@ -102,7 +100,6 @@ def test_can_validate_called_on_create_sync_no_validation(self, mock_validation) @mock.patch('ckanext.validation.jobs.validate', return_value=VALID_REPORT) def test_can_validate_called_on_update_sync(self, mock_validation): - dataset = factories.Dataset() resource = factories.Resource(package_id=dataset['id']) helpers.call_action( @@ -120,7 +117,6 @@ def test_can_validate_called_on_update_sync(self, mock_validation): # @helpers.change_config('ckanext.validation.run_on_update_async', False) @mock.patch('ckanext.validation.jobs.validate') def test_can_validate_called_on_update_sync_no_validation(self, mock_validation): - dataset = factories.Dataset() resource = factories.Resource(package_id=dataset['id']) helpers.call_action( @@ -144,9 +140,8 @@ def _apply_config_changes(cls, cfg): cfg['ckanext.validation.run_on_update_sync'] = False # @helpers.change_config('ckanext.validation.run_on_create_async', True) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_can_validate_called_on_create_async(self, mock_validation): - dataset = factories.Dataset() helpers.call_action( 'resource_create', @@ -159,9 +154,8 @@ def test_can_validate_called_on_create_async(self, mock_validation): assert mock_validation.called # @helpers.change_config('ckanext.validation.run_on_create_async', True) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_can_validate_called_on_create_async_no_validation(self, mock_validation): - dataset = factories.Dataset() helpers.call_action( 'resource_create', @@ -176,9 +170,8 @@ def test_can_validate_called_on_create_async_no_validation(self, mock_validation # @helpers.change_config('ckanext.validation.run_on_create_async', False) # @helpers.change_config('ckanext.validation.run_on_update_async', True) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_can_validate_called_on_update_async(self, mock_validation): - dataset = factories.Dataset() resource = factories.Resource(package_id=dataset['id']) helpers.call_action( @@ -194,9 +187,8 @@ def test_can_validate_called_on_update_async(self, mock_validation): # @helpers.change_config('ckanext.validation.run_on_create_async', False) # @helpers.change_config('ckanext.validation.run_on_update_async', True) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_can_validate_called_on_update_async_no_validation(self, mock_validation): - dataset = factories.Dataset() resource = factories.Resource(package_id=dataset['id']) helpers.call_action( diff --git a/ckanext/validation/tests/test_logic.py b/ckanext/validation/tests/test_logic.py index f2c41b55..de0cc70d 100644 --- a/ckanext/validation/tests/test_logic.py +++ b/ckanext/validation/tests/test_logic.py @@ -57,16 +57,14 @@ def test_resource_validation_no_url_or_upload(self): u'Resource must have a valid URL or an uploaded file'} ==\ err.value.error_dict - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_resource_validation_with_url(self, mock_enqueue_job): - resource = factories.Resource(url='http://example.com', format='csv') call_action('resource_validation_run', resource_id=resource['id']) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_resource_validation_with_upload(self, mock_enqueue_job): - resource = factories.Resource(url='', url_type='upload', format='csv') call_action('resource_validation_run', resource_id=resource['id']) @@ -83,10 +81,9 @@ def test_resource_validation_with_upload(self, mock_enqueue_job): # assert len(jobs_after) == len(jobs) + 1 - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_resource_validation_creates_validation_object( self, mock_enqueue_job): - resource = factories.Resource(format='csv') call_action('resource_validation_run', resource_id=resource['id']) @@ -102,7 +99,7 @@ def test_resource_validation_creates_validation_object( assert validation.error is None @change_config('ckanext.validation.run_on_create_async', False) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_resource_validation_resets_existing_validation_object( self, mock_enqueue_job): diff --git a/ckanext/validation/tests/test_plugin.py b/ckanext/validation/tests/test_plugin.py index 6c218bf7..50669ea7 100644 --- a/ckanext/validation/tests/test_plugin.py +++ b/ckanext/validation/tests/test_plugin.py @@ -22,7 +22,7 @@ def _assert_validation_enqueued(mock_enqueue, resource_id): class TestResourceControllerHooksUpdate(object): @change_config('ckanext.validation.run_on_create_async', False) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_does_not_run_on_other_fields(self, mock_enqueue): resource = {'format': 'CSV'} @@ -36,7 +36,7 @@ def test_validation_does_not_run_on_other_fields(self, mock_enqueue): mock_enqueue.assert_not_called() @change_config('ckanext.validation.run_on_create_async', False) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_does_not_run_on_other_formats(self, mock_enqueue): resource = {'format': 'PDF'} @@ -48,7 +48,7 @@ def test_validation_does_not_run_on_other_formats(self, mock_enqueue): mock_enqueue.assert_not_called() @change_config('ckanext.validation.run_on_create_async', False) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_run_on_upload(self, mock_enqueue): resource = { @@ -64,7 +64,7 @@ def test_validation_run_on_upload(self, mock_enqueue): _assert_validation_enqueued(mock_enqueue, dataset['resources'][0]['id']) @change_config('ckanext.validation.run_on_create_async', False) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_run_on_url_change(self, mock_enqueue): resource = {'format': 'CSV', 'url': 'https://some.url'} @@ -78,7 +78,7 @@ def test_validation_run_on_url_change(self, mock_enqueue): _assert_validation_enqueued(mock_enqueue, dataset['resources'][0]['id']) @change_config('ckanext.validation.run_on_create_async', False) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_run_on_schema_change(self, mock_enqueue): resource = { @@ -105,7 +105,7 @@ def test_validation_run_on_schema_change(self, mock_enqueue): _assert_validation_enqueued(mock_enqueue, dataset['resources'][0]['id']) @change_config('ckanext.validation.run_on_create_async', False) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_run_on_format_change(self, mock_enqueue): resource = factories.Resource() @@ -117,7 +117,7 @@ def test_validation_run_on_format_change(self, mock_enqueue): @change_config('ckanext.validation.run_on_create_async', False) @change_config('ckanext.validation.run_on_update_async', False) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_does_not_run_when_config_false(self, mock_enqueue): resource = factories.Resource(format='CSV') @@ -131,20 +131,20 @@ def test_validation_does_not_run_when_config_false(self, mock_enqueue): @pytest.mark.usefixtures("clean_db", "validation_setup") class TestResourceControllerHooksCreate(object): - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_does_not_run_on_other_formats(self, mock_enqueue): factories.Resource(format='PDF') mock_enqueue.assert_not_called() - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') @change_config('ckanext.validation.run_on_update_async', False) def test_validation_run_with_upload(self, mock_enqueue): resource = factories.Resource(format='CSV', url_type='upload') _assert_validation_enqueued(mock_enqueue, resource['id']) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') @change_config('ckanext.validation.run_on_update_async', False) def test_validation_run_with_url(self, mock_enqueue): resource = factories.Resource(format='CSV', url='http://some.data') @@ -153,7 +153,7 @@ def test_validation_run_with_url(self, mock_enqueue): @change_config('ckanext.validation.run_on_create_async', False) @change_config('ckanext.validation.run_on_update_async', False) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_does_not_run_when_config_false(self, mock_enqueue): dataset = factories.Dataset() @@ -171,21 +171,21 @@ def test_validation_does_not_run_when_config_false(self, mock_enqueue): @pytest.mark.usefixtures("clean_db", "validation_setup") class TestPackageControllerHooksCreate(object): - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_does_not_run_on_other_formats(self, mock_enqueue): factories.Dataset(resources=[{'format': 'PDF'}]) mock_enqueue.assert_not_called() @change_config('ckanext.validation.run_on_create_async', False) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_does_not_run_when_config_false(self, mock_enqueue): factories.Dataset(resources=[ {'format': 'CSV', 'url': 'http://some.data'}]) mock_enqueue.assert_not_called() - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_run_with_upload(self, mock_enqueue): resource = { 'id': 'test-resource-id', @@ -196,7 +196,7 @@ def test_validation_run_with_upload(self, mock_enqueue): _assert_validation_enqueued(mock_enqueue, resource['id']) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_run_with_url(self, mock_enqueue): resource = { 'id': 'test-resource-id', @@ -207,7 +207,7 @@ def test_validation_run_with_url(self, mock_enqueue): _assert_validation_enqueued(mock_enqueue, resource['id']) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_run_only_supported_formats(self, mock_enqueue): resource1 = { @@ -230,7 +230,7 @@ def test_validation_run_only_supported_formats(self, mock_enqueue): class TestPackageControllerHooksUpdate(object): @change_config('ckanext.validation.run_on_create_async', False) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_runs_with_url(self, mock_enqueue): resource = { @@ -249,7 +249,7 @@ def test_validation_runs_with_url(self, mock_enqueue): _assert_validation_enqueued(mock_enqueue, resource['id']) @change_config('ckanext.validation.run_on_create_async', False) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_runs_with_upload(self, mock_enqueue): resource = { @@ -268,7 +268,7 @@ def test_validation_runs_with_upload(self, mock_enqueue): _assert_validation_enqueued(mock_enqueue, resource['id']) @change_config('ckanext.validation.run_on_create_async', False) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_does_not_run_on_other_formats(self, mock_enqueue): resource = { @@ -287,7 +287,7 @@ def test_validation_does_not_run_on_other_formats(self, mock_enqueue): mock_enqueue.assert_not_called() @change_config('ckanext.validation.run_on_create_async', False) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_run_only_supported_formats(self, mock_enqueue): resource1 = { @@ -313,7 +313,7 @@ def test_validation_run_only_supported_formats(self, mock_enqueue): @change_config('ckanext.validation.run_on_create_async', False) @change_config('ckanext.validation.run_on_update_async', False) - @mock.patch('ckanext.validation.logic.enqueue_job') + @mock.patch('ckantoolkit.enqueue_job') def test_validation_does_not_run_when_config_false(self, mock_enqueue): resource = { From 08e07fed0966d777d1d0f476491af2fc73979066 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Fri, 22 Jul 2022 12:53:43 +1000 Subject: [PATCH 11/25] [QOL-9122] allow job consumer to accept resource IDs - Passing IDs consumes far less memory on the job queue than entire resource dictionaries --- ckanext/validation/jobs.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/ckanext/validation/jobs.py b/ckanext/validation/jobs.py index 24604413..c6c92fb1 100644 --- a/ckanext/validation/jobs.py +++ b/ckanext/validation/jobs.py @@ -22,17 +22,25 @@ def run_validation_job(resource): - - log.debug(u'Validating resource %s', resource['id']) - + # handle either a resource dict or just an ID + # ID is more efficient, as resource dicts can be very large + if isinstance(resource, string_types): + log.debug(u'run_validation_job: calling resource_show: %s', resource) + resource = t.get_action('resource_show')({'ignore_auth': True}, {'id': resource}) + + resource_id = resource.get('id') + if resource_id: + log.debug(u'Validating resource: %s', resource_id) + else: + log.debug(u'Validating resource dict: %s', resource) try: validation = Session.query(Validation).filter( - Validation.resource_id == resource['id']).one() + Validation.resource_id == resource_id).one() except NoResultFound: validation = None if not validation: - validation = Validation(resource_id=resource['id']) + validation = Validation(resource_id=resource_id) validation.status = u'running' Session.add(validation) From c556ffd8587b80bd9dbfa1735d46e23df21146f7 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Fri, 22 Jul 2022 13:36:53 +1000 Subject: [PATCH 12/25] [QOL-9122] update README - fix typos - fix repository URLs --- README.md | 56 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 6deb6942..0ad742f9 100644 --- a/README.md +++ b/README.md @@ -36,13 +36,13 @@ Data description and validation for CKAN with [Frictionless Data](https://fricti ## Overview -This extension brings data validation powered by the [goodtables](https://github.com/frictionlessdata/goodtables-py) library to CKAN. It provides out of the box features to validate tabular data and integrate validation reports to the CKAN interface. +This extension brings data validation powered by the [Goodtables](https://github.com/frictionlessdata/goodtables-py) library to CKAN. It provides out-of-the-box features to validate tabular data and integrate validation reports to the CKAN interface. Data validation can be performed automatically on the background or during dataset creation, and the results are stored against each resource. !['Status badges in resources'](https://i.imgur.com/9VIzfwo.png) -Comprehensive reports are created describing issues found with the data, both at the structure level (missing headers, blank rows, etc) and at the data schema level (wrong data types, values out of range etc). +Comprehensive reports are created describing issues found with the data, both at the structure level (missing headers, blank rows, etc) and at the data schema level (wrong data types, values out of range, etc). The extension also exposes all the underlying [actions](#action-functions) so data validation can be integrated in custom workflows from other extensions. @@ -62,12 +62,18 @@ If you want to use [asynchronous validation](#asynchronous-validation) with back To install ckanext-validation, activate your CKAN virtualenv and run: - git clone https://github.com/frictionlessdata/ckanext-validation.git + git clone https://github.com/keitaroinc/ckanext-validation.git cd ckanext-validation pip install -r requirements.txt python setup.py develop -Create the database tables running: +Or: + + pip install -e 'git+https://github.com/keitaroinc/ckanext-validation.git#egg=ckanext-validation' + cd ckanext-validation + pip install -r requirements.txt + +Create the database tables by running: ON CKAN >= 2.9: @@ -80,21 +86,21 @@ ON CKAN <= 2.8: ## Configuration -Once installed, add the `validation` plugin to the `ckan.plugins` configuration option on your INI file: +Once installed, add the `validation` plugin to the `ckan.plugins` configuration option in your INI file: ckan.plugins = ... validation -*Note:* if using CKAN 2.6 or lower and the [asynchronous validation](#asynchronous-validation) also add the `rq` plugin ([see Versions supported and requirements](#versions-supported-and-requirements)) to `ckan.plugins`. +*Note:* if using CKAN 2.6 or lower and [asynchronous validation](#asynchronous-validation), also add the `rq` plugin ([see Versions supported and requirements](#versions-supported-and-requirements)) to `ckan.plugins`. ### Adding schema fields to the Resource metadata -The extension requires changes in the CKAN metadata schema. The easisest way to add those is by using ckanext-scheming. Use these two configuration options to link to the dataset schema (replace with your own if you need to customize it) and the required presets: +The extension requires changes in the CKAN metadata schema. The easiest way to add those is by using ckanext-scheming. Use these two configuration options to link to the dataset schema (replace with your own if you need to customize it) and the required presets: scheming.dataset_schemas = ckanext.validation.examples:ckan_default_schema.json scheming.presets = ckanext.scheming:presets.json ckanext.validation:presets.json -Read more below about to [change the CKAN metadata schema](#changes-in-the-metadata-schema) +Read more below about how to [change the CKAN metadata schema](#changes-in-the-metadata-schema) ### Operation modes @@ -108,7 +114,7 @@ Use the following configuration options to choose the [operation modes](#operati ### Formats to validate -By default validation will be run agaisnt the following formats: `CSV`, `XLSX` and `XLS`. You can modify these formats using the following option: +By default validation will be run against the following formats: `CSV`, `XLSX` and `XLS`. You can modify these formats using the following option: ckanext.validation.formats = csv xlsx @@ -120,7 +126,7 @@ You can also provide [validation options](#validation-options) that will be used Make sure to use indentation if the value spans multiple lines otherwise it won't be parsed. -If you are using a cloud-based storage backend for uploads check [Private datasets](#private-datasets) for other configuration settings that might be relevant. +If you are using a cloud-based storage backend for uploads, check [Private datasets](#private-datasets) for other configuration settings that might be relevant. ### Display badges @@ -133,13 +139,13 @@ To prevent the extension from adding the validation badges next to the resources ### Data Validation -CKAN users will be familiar with the validation performed against the metadata fields when creating or updating datasets. The form will return an error for instance if a field is missing or it doesn't have the expected format. +CKAN users will be familiar with the validation performed against the metadata fields when creating or updating datasets. The form will return an error, for instance, if a field is missing or it doesn't have the expected format. -Data validation follows the same principle but against the actual data published in CKAN, that is the contents of tabular files (Excel, CSV, etc) hosted in CKAN itself or elsewhere. Whenever a resource of the appropiate format is created or updated, the extension will validate the data against a collection of checks. This validation is powered by [goodtables](https://github.com/frictionlessdata/goodtables-py), a very powerful data validation library developed by [Open Knowledge International](https://okfn.org) as part of the [Frictionless Data](https://frictionlessdata.io) project. Goodtables provides an extensive suite of [checks](https://github.com/frictionlessdata/goodtables-py#checks) that cover common issues with tabular data files. +Data validation follows the same principle, but against the actual data published in CKAN, that is the contents of tabular files (Excel, CSV, etc) hosted in CKAN itself or elsewhere. Whenever a resource of the appropriate format is created or updated, the extension will validate the data against a collection of checks. This validation is powered by [Goodtables](https://github.com/frictionlessdata/goodtables-py), a very powerful data validation library developed by [Open Knowledge International](https://okfn.org) as part of the [Frictionless Data](https://frictionlessdata.io) project. Goodtables provides an extensive suite of [checks](https://github.com/frictionlessdata/goodtables-py#checks) that cover common issues with tabular data files. These checks include structural problems like missing headers or values, blank rows, etc., but also can validate the data contents themselves (see [Data Schemas](#data-schemas)) or even run [custom checks](https://github.com/frictionlessdata/goodtables-py#custom-constraint). -The result of this validation is a JSON report. This report contains all the issues found (if any) with their relevant context (row number, columns involved, etc). The reports are stored in the database and linked to the CKAN resource, and can be retrieved [via the API](#resource_validation_show). +The result of this validation is a JSON report. This report contains all the issues found (if any) with their relevant context (row number, columns involved, etc). The reports are stored in the database and linked to the CKAN resources, and can be retrieved [via the API](#resource_validation_show). If there is a report available for a particular resource, a status badge will be displayed in the resource listing and on the resource page, showing whether validation passed or failed for the resource. @@ -149,11 +155,11 @@ Clicking on the badge will take you to the validation report page, where the rep !['Validation report'](https://i.imgur.com/Mm6vKFD.png) -Whenever possible, the report will provide a preview of the cells, rows or columns involved in an error, to make easy to identify and fix it. +Whenever possible, the report will provide a preview of the cells, rows or columns involved in an error, to make it easy to identify and fix it. ### Data Schema -As we mentioned before, data can be validated against a schema. Much in the same way that the standard CKAN schema for metadata fields, the schema describes the data and what its values are expected to be. +As mentioned before, data can be validated against a schema. Much in the same way as the standard CKAN schema for metadata fields, the schema describes the data and what its values are expected to be. These schemas are defined following the [Table Schema](http://frictionlessdata.io/specs/table-schema/) specification, a really simple and flexible standard for describing tabular data. @@ -211,7 +217,7 @@ The following schema describes the expected data: ``` -If we store this schema agaisnt a resource, it will be used to perform a more thorough validation. For instance, updating the resource with the following data would fail validation with a variety of errors, even if the general structure of the file is correct: +If we store this schema against a resource, it will be used to perform a more thorough validation. For instance, updating the resource with the following data would fail validation with a variety of errors, even if the general structure of the file is correct: | id | location | date | measurement | observations | @@ -225,7 +231,7 @@ With the extension enabled and configured, schemas can be attached to the `schem ### Validation Options -As we saw before, the validation process involves many different checks and it's very likely that what "valid" data actually means will vary across CKAN instances or datasets. The validation process can be tweaked by passing any of the [supported options](https://github.com/frictionlessdata/goodtables-py#validatesource-options) on goodtables. These can be used to add or remove specific checks, control limits, etc. +As we saw before, the validation process involves many different checks and it's very likely that what "valid" data actually means will vary across CKAN instances or datasets. The validation process can be tweaked by passing any of the [supported options](https://github.com/frictionlessdata/goodtables-py#validatesource-options) to Goodtables. These can be used to add or remove specific checks, control limits, etc. For instance, the following file would fail validation using the default options, but it may be valid in a given context, or the issues may be known to the publishers: @@ -263,7 +269,7 @@ Validation can be performed on private datasets. When validating a locally uploa In these cases, the API key for the site user will be passed as part of the request (or alternatively `ckanext.validation.pass_auth_header_value` if set in the configuration). -As this involves sending API keys to other extensions this behaviour can be turned off by setting `ckanext.validation.pass_auth_header` to `False`. +As this involves sending API keys to other extensions, this behaviour can be turned off by setting `ckanext.validation.pass_auth_header` to `False`. Again, these settings only affect private resources when using a cloud-based backend. @@ -275,7 +281,7 @@ The data validation process described above can be run in two modes: asynchronou #### Asynchronous validation -Asynchronous validation is run in the background whenever a resource of a supported format is created or updated. Validation won't affect the action performed, so if there are validation errors found the reource will be created or updated anyway. +Asynchronous validation is run in the background whenever a resource of a supported format is created or updated. Validation won't affect the action performed, so if there are validation errors found the resource will be created or updated anyway. This mode might be useful for instances where datasets are harvested from other sources, or where multiple publishers create datasets and as a maintainer you only want to give visibility to the quality of data, encouraging publishers to fix any issues. @@ -341,7 +347,7 @@ The extension requires changes in the default CKAN resource metadata schema to a Here's more detail on the fields added: * `schema`: This can be a [Table Schema](http://frictionlessdata.io/specs/table-schema/) JSON object or an URL pointing to one. In the UI form you can upload a JSON file, link to one providing a URL or enter it directly. If uploaded, the file contents will be read and stored in the `schema` field. In all three cases the contents will be validated against the Table Schema specification. -* `validation_options`: A JSON object with validation options that will be passed to [goodtables](https://github.com/frictionlessdata/goodtables-py#validatesource-options). +* `validation_options`: A JSON object with validation options that will be passed to [Goodtables](https://github.com/frictionlessdata/goodtables-py#validatesource-options). ![Form fields](https://i.imgur.com/ixKOCij.png) @@ -523,7 +529,7 @@ def resource_validation_run_batch(context, data_dict): ### Starting the validation process manually -You can start (asynchronous) validation jobs from the command line using the `ckan validation run` command. If no parameters are provided it will start a validation job for all resources in the site of suitable format (ie `ckanext.validation.formats`): +You can start (asynchronous) validation jobs from the command line using the `validation run` command. If no parameters are provided it will start a validation job for all resources in the site of suitable format (ie `ckanext.validation.formats`): ON CKAN >= 2.9: @@ -566,13 +572,13 @@ ON CKAN >= 2.9: ON CKAN <= 2.8: - paster validation report -c /path/to/ckan/ini + paster validation report -c /path/to/ckan/ini - paster validation report-full -c /path/to/ckan/ini + paster validation report-full -c /path/to/ckan/ini Both commands will print an overview of the total number of datasets and tabular resources, and a breakdown of how many have a validation status of success, -failure or error. Additionally they will create a CSV report. `ckan validation report` will create a report with all failing resources, including the following fields: +failure or error. Additionally they will create a CSV report. `validation report` will create a report with all failing resources, including the following fields: * Dataset name * Resource id @@ -581,7 +587,7 @@ failure or error. Additionally they will create a CSV report. `ckan validation r * Status * Validation report URL -`ckan validation report-full` will add a row on the output CSV for each error found on the validation report (limited to ten occurrences of the same error type per file). So the fields in the generated CSV report will be: +`validation report-full` will add a row on the output CSV for each error found on the validation report (limited to ten occurrences of the same error type per file). So the fields in the generated CSV report will be: * Dataset name * Resource id From cfa3d158bdff90b86159b7be01663e0f25019d25 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Fri, 22 Jul 2022 13:45:14 +1000 Subject: [PATCH 13/25] [QOL-9122] skip validating untouched resources in a package - When using a resource API to update a resource, don't treat it as a package update --- ckanext/validation/plugin.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/ckanext/validation/plugin.py b/ckanext/validation/plugin.py index 2a0281ec..a9f64798 100644 --- a/ckanext/validation/plugin.py +++ b/ckanext/validation/plugin.py @@ -162,6 +162,7 @@ def before_create(self, context, data_dict): return self._process_schema_fields(data_dict) resources_to_validate = {} + packages_to_skip = {} def after_create(self, context, data_dict): @@ -213,6 +214,9 @@ def before_update(self, context, current_resource, updated_resource): updated_resource = self._process_schema_fields(updated_resource) + # the call originates from a resource API, so don't validate the entire package + self.packages_to_skip[updated_resource['package_id']] = True + if not get_update_mode_from_config() == u'async': return updated_resource @@ -229,8 +233,8 @@ def before_update(self, context, current_resource, updated_resource): or (updated_resource.get(u'format', u'').lower() != current_resource.get(u'format', u'').lower()) ) and ( - # Make sure format is supported - updated_resource.get(u'format', u'').lower() in + # Make sure format is supported + updated_resource.get(u'format', u'').lower() in settings.SUPPORTED_FORMATS): needs_validation = True @@ -249,14 +253,20 @@ def after_update(self, context, data_dict): and not get_create_mode_from_config() == u'async'): return - if context.get('_validation_performed'): + if context.pop('_validation_performed', None): # Ugly, but needed to avoid circular loops caused by the # validation job calling resource_patch (which calls # package_update) - del context['_validation_performed'] return if is_dataset: + package_id = data_dict.get('id') + if self.packages_to_skip.pop(package_id, None) or context.get('save', False): + # Either we're updating an individual resource, + # or we're updating the package metadata via the web form; + # in both cases, we don't need to validate every resource. + return + for resource in data_dict.get(u'resources', []): if resource[u'id'] in self.resources_to_validate: # This is part of a resource_update call, it will be From e353f9e9cc5bb66faefd426b144430181af95bd4 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Fri, 22 Jul 2022 14:01:40 +1000 Subject: [PATCH 14/25] [QOL-9122] retrieve package ID if not provided on resource update --- ckanext/validation/plugin.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ckanext/validation/plugin.py b/ckanext/validation/plugin.py index a9f64798..ee440ba0 100644 --- a/ckanext/validation/plugin.py +++ b/ckanext/validation/plugin.py @@ -215,7 +215,13 @@ def before_update(self, context, current_resource, updated_resource): updated_resource = self._process_schema_fields(updated_resource) # the call originates from a resource API, so don't validate the entire package - self.packages_to_skip[updated_resource['package_id']] = True + package_id = updated_resource.get('package_id') + if not package_id: + existing_resource = t.get_action('resource_show')( + context={'ignore_auth': True}, data_dict={'id': updated_resource['id']}) + if existing_resource: + package_id = existing_resource['package_id'] + self.packages_to_skip[package_id] = True if not get_update_mode_from_config() == u'async': return updated_resource From 661a371c97ff5d47c7a6613d3ee94549333e3a2c Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Fri, 22 Jul 2022 15:40:30 +1000 Subject: [PATCH 15/25] [QOL-9122] shrink validation jobs - Record only the resource ID, instead of the full resource dict. This greatly reduces the memory usage on the job queue server. --- README.md | 3 +++ ckanext/validation/logic.py | 29 +++++++++++++++++++------ ckanext/validation/tests/test_plugin.py | 4 ++-- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 0ad742f9..293bcf6e 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,9 @@ The extension requires changes in the CKAN metadata schema. The easiest way to a Read more below about how to [change the CKAN metadata schema](#changes-in-the-metadata-schema) ### Operation modes +Use the following to configure which queue async jobs are added to + + ckanext.validation.queue = bulk (Defaults to default) Use the following configuration options to choose the [operation modes](#operation-modes): diff --git a/ckanext/validation/logic.py b/ckanext/validation/logic.py index 8ccd1b58..6fee325f 100644 --- a/ckanext/validation/logic.py +++ b/ckanext/validation/logic.py @@ -26,12 +26,25 @@ log = logging.getLogger(__name__) -def enqueue_job(*args, **kwargs): - try: - return t.enqueue_job(*args, **kwargs) - except AttributeError: - from ckanext.rq.jobs import enqueue as enqueue_job_legacy - return enqueue_job_legacy(*args, **kwargs) +def enqueue_validation_job(package_id, resource_id): + enqueue_args = { + 'fn': run_validation_job, + 'title': "run_validation_job: package_id: {} resource: {}".format(package_id, resource_id), + 'kwargs': {'resource': resource_id}, + } + if t.check_ckan_version('2.8'): + ttl = 24 * 60 * 60 # 24 hour ttl. + rq_kwargs = { + 'ttl': ttl + } + if t.check_ckan_version('2.9'): + rq_kwargs['failure_ttl'] = ttl + enqueue_args['rq_kwargs'] = rq_kwargs + # Optional variable, if not set, default queue is used + queue = t.config.get('ckanext.validation.queue', None) + if queue: + enqueue_args['queue'] = queue + t.enqueue_job(**enqueue_args) # Auth @@ -129,8 +142,10 @@ def resource_validation_run(context, data_dict): Session.commit() if async_job: - enqueue_job(run_validation_job, [resource]) + package_id = resource['package_id'] + enqueue_validation_job(package_id, resource_id) else: + # run_validation_job(resource_id) # TODO only pass resource_id, tests need to be fixed for this run_validation_job(resource) diff --git a/ckanext/validation/tests/test_plugin.py b/ckanext/validation/tests/test_plugin.py index 50669ea7..3876114a 100644 --- a/ckanext/validation/tests/test_plugin.py +++ b/ckanext/validation/tests/test_plugin.py @@ -14,8 +14,8 @@ def _assert_validation_enqueued(mock_enqueue, resource_id): assert mock_enqueue.call_count == 1 - assert mock_enqueue.call_args[0][0] == run_validation_job - assert mock_enqueue.call_args[0][1][0]['id'] == resource_id + assert mock_enqueue.call_args[1]['fn'] == run_validation_job + assert mock_enqueue.call_args[1]['kwargs']['resource'] == resource_id @pytest.mark.usefixtures("clean_db", "validation_setup") From 1bb3617537cc916926efe9aa362fb94af0df66bf Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Mon, 25 Jul 2022 12:45:20 +1000 Subject: [PATCH 16/25] [QOL-9122] add title to the validation link - Just use the validation timestamp, it's useful information --- ckanext/validation/helpers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ckanext/validation/helpers.py b/ckanext/validation/helpers.py index c6381034..fb7f4e2f 100644 --- a/ckanext/validation/helpers.py +++ b/ckanext/validation/helpers.py @@ -40,13 +40,14 @@ def get_validation_badge(resource, in_listing=False): resource_id=resource['id']) return u''' - + {prefix}{status_title} '''.format( validation_url=validation_url, prefix=_('data'), status=status, - status_title=statuses[status]) + status_title=statuses[status], + title=resource.get('validation_timestamp', '')) def validation_extract_report_from_errors(errors): From 1f612605856aedcc188153fe5fa61f0195c0faf4 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Mon, 25 Jul 2022 12:52:21 +1000 Subject: [PATCH 17/25] [QOL-9122] use variable to manage ckan_cli path in one place --- .github/workflows/ci.yml | 7 ++++--- .gitignore | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 99b82e07..d2e44f85 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,10 +54,11 @@ jobs: sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini - name: Setup extension run: | - chmod u+x bin/ckan_cli + CKAN_CLI=bin/ckan_cli + chmod u+x $CKAN_CLI export CKAN_INI=test.ini - bin/ckan_cli db init - PASTER_PLUGIN=ckanext-validation bin/ckan_cli validation init-db + $CKAN_CLI db init + PASTER_PLUGIN=ckanext-validation $CKAN_CLI validation init-db - name: Run tests run: pytest --ckan-ini=test.ini --cov=ckanext.validation --cov-report=xml --cov-append --disable-warnings ckanext/validation/tests - name: Coveralls diff --git a/.gitignore b/.gitignore index aefc43e6..0d6a6d04 100644 --- a/.gitignore +++ b/.gitignore @@ -14,7 +14,6 @@ develop-eggs/ dist/ downloads/ eggs/ -.eggs/ lib/ lib64/ parts/ @@ -24,6 +23,8 @@ wheels/ *.egg-info/ .installed.cfg *.egg +*.eggs +src/ # PyInstaller # Usually these files are written by a python script from a template From 6f1709c86b46482fc80fa6668ca475b99452fb42 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Mon, 25 Jul 2022 12:53:32 +1000 Subject: [PATCH 18/25] [QOL-9122] use ValidationStatusHelper to simplify database access --- ckanext/validation/jobs.py | 46 +++-- ckanext/validation/logic.py | 52 ++---- .../validation/validation_status_helper.py | 160 ++++++++++++++++++ 3 files changed, 192 insertions(+), 66 deletions(-) create mode 100644 ckanext/validation/validation_status_helper.py diff --git a/ckanext/validation/jobs.py b/ckanext/validation/jobs.py index c6c92fb1..da21331d 100644 --- a/ckanext/validation/jobs.py +++ b/ckanext/validation/jobs.py @@ -1,12 +1,10 @@ # encoding: utf-8 import logging -import datetime import json import re import requests -from sqlalchemy.orm.exc import NoResultFound from goodtables import validate from six import string_types @@ -15,13 +13,14 @@ import ckantoolkit as t -from ckanext.validation.model import Validation - +from ckanext.validation.validation_status_helper import (ValidationStatusHelper, ValidationJobDoesNotExist, + ValidationJobAlreadyRunning, StatusTypes) log = logging.getLogger(__name__) def run_validation_job(resource): + vsh = ValidationStatusHelper() # handle either a resource dict or just an ID # ID is more efficient, as resource dicts can be very large if isinstance(resource, string_types): @@ -33,18 +32,17 @@ def run_validation_job(resource): log.debug(u'Validating resource: %s', resource_id) else: log.debug(u'Validating resource dict: %s', resource) + validation_record = None try: - validation = Session.query(Validation).filter( - Validation.resource_id == resource_id).one() - except NoResultFound: - validation = None - - if not validation: - validation = Validation(resource_id=resource_id) - - validation.status = u'running' - Session.add(validation) - Session.commit() + validation_record = vsh.updateValidationJobStatus(Session, resource_id, StatusTypes.running) + except ValidationJobAlreadyRunning as e: + log.error("Won't run enqueued job %s as job is already running or in invalid state: %s", resource['id'], e) + return + except ValidationJobDoesNotExist: + validation_record = vsh.createValidationJob(Session, resource['id']) + validation_record = vsh.updateValidationJobStatus( + session=Session, resource_id=resource_id, + status=StatusTypes.running, validationRecord=validation_record) options = t.config.get( u'ckanext.validation.default_validation_options') @@ -105,16 +103,12 @@ def run_validation_job(resource): report['warnings'][index] = re.sub(r'Table ".*"', 'Table', warning) if report['table-count'] > 0: - validation.status = u'success' if report[u'valid'] else u'failure' - validation.report = report + status = StatusTypes.success if report[u'valid'] else StatusTypes.failure + validation_record = vsh.updateValidationJobStatus(Session, resource['id'], status, report, None, validation_record) else: - validation.status = u'error' - validation.error = { - 'message': '\n'.join(report['warnings']) or u'No tables found'} - validation.finished = datetime.datetime.utcnow() - - Session.add(validation) - Session.commit() + status = StatusTypes.error + error_payload = {'message': '\n'.join(report['warnings']) or u'No tables found'} + validation_record = vsh.updateValidationJobStatus(Session, resource['id'], status, None, error_payload, validation_record) # Store result status in resource t.get_action('resource_patch')( @@ -122,8 +116,8 @@ def run_validation_job(resource): 'user': t.get_action('get_site_user')({'ignore_auth': True})['name'], '_validation_performed': True}, {'id': resource['id'], - 'validation_status': validation.status, - 'validation_timestamp': validation.finished.isoformat()}) + 'validation_status': validation_record.status, + 'validation_timestamp': validation_record.finished.isoformat()}) def _validate_table(source, _format=u'csv', schema=None, **options): diff --git a/ckanext/validation/logic.py b/ckanext/validation/logic.py index 6fee325f..0d3a5f04 100644 --- a/ckanext/validation/logic.py +++ b/ckanext/validation/logic.py @@ -1,10 +1,7 @@ # encoding: utf-8 -import datetime import logging import json - -from sqlalchemy.orm.exc import NoResultFound from six import string_types import ckan.plugins as plugins @@ -12,7 +9,6 @@ import ckantoolkit as t -from ckanext.validation.model import Validation from ckanext.validation.interfaces import IDataValidation from ckanext.validation.jobs import run_validation_job from ckanext.validation import settings @@ -21,6 +17,7 @@ get_update_mode_from_config, delete_local_uploaded_file, ) +from ckanext.validation.validation_status_helper import (ValidationStatusHelper, ValidationJobAlreadyEnqueued) log = logging.getLogger(__name__) @@ -119,27 +116,13 @@ def resource_validation_run(context, data_dict): {u'url': u'Resource must have a valid URL or an uploaded file'}) # Check if there was an existing validation for the resource - - Session = context['model'].Session - try: - validation = Session.query(Validation).filter( - Validation.resource_id == resource_id).one() - except NoResultFound: - validation = None - - if validation: - # Reset values - validation.finished = None - validation.report = None - validation.error = None - validation.created = datetime.datetime.utcnow() - validation.status = u'created' - else: - validation = Validation(resource_id=resource['id']) - - Session.add(validation) - Session.commit() + session = context['model'].Session + ValidationStatusHelper().createValidationJob(session, resource_id) + except ValidationJobAlreadyEnqueued: + if async_job: + log.error("resource_validation_run: ValidationJobAlreadyEnqueued %s", data_dict['resource_id']) + return if async_job: package_id = resource['package_id'] @@ -177,13 +160,8 @@ def resource_validation_show(context, data_dict): if not data_dict.get(u'resource_id'): raise t.ValidationError({u'resource_id': u'Missing value'}) - Session = context['model'].Session - - try: - validation = Session.query(Validation).filter( - Validation.resource_id == data_dict['resource_id']).one() - except NoResultFound: - validation = None + session = context['model'].Session + validation = ValidationStatusHelper().getValidationJob(session, data_dict['resource_id']) if not validation: raise t.ObjectNotFound( @@ -209,20 +187,14 @@ def resource_validation_delete(context, data_dict): if not data_dict.get(u'resource_id'): raise t.ValidationError({u'resource_id': u'Missing value'}) - Session = context['model'].Session - - try: - validation = Session.query(Validation).filter( - Validation.resource_id == data_dict['resource_id']).one() - except NoResultFound: - validation = None + session = context['model'].Session + validation = ValidationStatusHelper().getValidationJob(session, data_dict['resource_id']) if not validation: raise t.ObjectNotFound( 'No validation report exists for this resource') - Session.delete(validation) - Session.commit() + ValidationStatusHelper().deleteValidationJob(session, validation) def resource_validation_run_batch(context, data_dict): diff --git a/ckanext/validation/validation_status_helper.py b/ckanext/validation/validation_status_helper.py new file mode 100644 index 00000000..af565462 --- /dev/null +++ b/ckanext/validation/validation_status_helper.py @@ -0,0 +1,160 @@ +# encoding: utf-8 + +import datetime +import logging + +from ckan.model import Session +from ckanext.validation import model +from sqlalchemy.orm.exc import NoResultFound + +log = logging.getLogger(__name__) + +REDIS_PREFIX = 'ckanext-validation:' + + +class StatusTypes: + # could be Enum but keeping it system for now + created = u'created' # Job created and put onto queue + running = u'running' # Job picked up by worker and being processed + success = u'success' # Validation Successful and report attached + failure = u'failure' # Validation Failed and report attached + error = u'error' # Validation Job could not create validation report + + +class ValidationStatusHelper: + """ + Validation status: + created: Job created and put onto queue + running: Job picked up by worker and being processed + success: Validation Successful and report attached + failure: Validation Failed and report attached + error: Validation Job could not create validation report + + This class is to help ensure we don't enqueue validation jobs when a job is enqueued in the last hour + and to stop worker threads from working on jobs which are pending (in progress). + + Use case: + * Ensure validation job/report is not reset multiple times. + * To not re-enqueue if job is in 'created','running' for last hour (can't differentiate on running timestamp) + * Allow job to be enqueued if in 'created','running' if over 1 hour old. + * Allow validation job to be re-queued in all other states i.e. ('success', 'failure', 'error') + """ + + def getValidationJob(self, session=None, resource_id=None): + # type: (object, Session, str) -> model.Validation + """ + Gets Validation record for resource if exists + + :param self: + :param resource_id: + :return Validation: Validation record or None + """ + log.debug("getValidationJob: %s", resource_id) + try: + return session.query(model.Validation).filter( + model.Validation.resource_id == resource_id).order_by(model.Validation.created.desc()).one() + except NoResultFound: + return None + + def deleteValidationJob(self, session=None, validationRecord=None): + # type: (object, Session, model.Validation) -> None + session.delete(validationRecord) + session.commit() + session.flush() + + def createValidationJob(self, session=None, resource_id=None, validationRecord=None): + # type: (object, Session, str) -> model.Validation + ''' + If validation object not exist: + create record in state created with created timestamp of now + + If in state 'created' or 'running' and created time stamp within last hour: + raise exception ValidationJobAlreadyEnqueued + + Else: (object exists and is in final state(success, failure, error)) + reset record to clean state with status 'created', created with timestamp now + + + :param self: + :param string resource_id: resource_id of job + :return Validation record + :throws ValidationJobAlreadyEnqueued exception + + ''' + log.debug("createValidationJob: %s", resource_id) + if validationRecord is None: + validationRecord = self.getValidationJob(session, resource_id) + + if validationRecord is not None: + status = validationRecord.status + if status in (StatusTypes.running, StatusTypes.created) and self.getHoursSince(validationRecord.created) < 1: + error_message = "Validation Job already in pending state: {} on resource: {} created on (gmt): {} data: {}"\ + .format(validationRecord.status, resource_id, validationRecord.created.isoformat(), validationRecord) + log.error(error_message) + raise ValidationJobAlreadyEnqueued(error_message) + + if validationRecord is None: + validationRecord = model.Validation(resource_id=resource_id) + + validationRecord.finished = None + validationRecord.report = None + validationRecord.error = None + validationRecord.created = datetime.datetime.utcnow() + validationRecord.status = StatusTypes.created + + session.add(validationRecord) + session.commit() + session.flush() + return validationRecord + + def updateValidationJobStatus(self, session=None, resource_id=None, status=None, report=None, error=None, validationRecord=None): + # type: (object, Session, str, str, object, object) -> model.Validation + """ + If report or error is attached, update finished to be now + :param self: + :param session Session + :param resource_id: + :param status: + :param report: + :param error: + :return: + """ + log.debug("updateValidationJobStatus: %s status: %s", resource_id, status) + if validationRecord is None: + validationRecord = self.getValidationJob(session, resource_id) + + if validationRecord is None: + log.error("record not found to update statues: %s", resource_id) + raise ValidationJobDoesNotExist() + + # Handle already running status in in last hour + if status == StatusTypes.running and validationRecord.status == status: + if self.getHoursSince(validationRecord.created) < 1: + raise ValidationJobAlreadyRunning() + + validationRecord.status = status + validationRecord.report = report + validationRecord.error = error + if status in (StatusTypes.success, StatusTypes.failure, StatusTypes.error): + validationRecord.finished = datetime.datetime.utcnow() + + Session.add(validationRecord) + Session.commit() + # Flush so other transactions are not waiting + Session.flush() + return validationRecord + + def getHoursSince(self, created): + return (datetime.datetime.utcnow() - created).total_seconds() / (60 * 60) + + +class ValidationJobAlreadyEnqueued(Exception): + """A Validation Job is Already Enqueued.""" + + +class ValidationJobDoesNotExist(Exception): + """A Validation Job does not exist.""" + + +class ValidationJobAlreadyRunning(Exception): + """A Validation Job is Already Running.""" From 4c977db8ab212bdcfa24b7447d9aa39d87093352 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Mon, 25 Jul 2022 13:02:33 +1000 Subject: [PATCH 19/25] [QOL-9122] mark broken tests to be skipped instead of commenting them out --- .../validation/plugin_mixins/flask_plugin.py | 6 +- .../validation/plugin_mixins/pylons_plugin.py | 2 +- ckanext/validation/tests/test_form.py | 183 ++++++++++-------- ckanext/validation/tests/test_interfaces.py | 1 + ckanext/validation/tests/test_logic.py | 15 +- ckanext/validation/tests/test_utils.py | 2 +- 6 files changed, 119 insertions(+), 90 deletions(-) diff --git a/ckanext/validation/plugin_mixins/flask_plugin.py b/ckanext/validation/plugin_mixins/flask_plugin.py index a5d4038e..de772cd8 100644 --- a/ckanext/validation/plugin_mixins/flask_plugin.py +++ b/ckanext/validation/plugin_mixins/flask_plugin.py @@ -1,8 +1,8 @@ -# -*- coding: utf-8 -*- +# encoding: utf-8 import ckan.plugins as p -import ckanext.validation.cli as cli -import ckanext.validation.views as views + +from ckanext.validation import cli, views class MixinPlugin(p.SingletonPlugin): diff --git a/ckanext/validation/plugin_mixins/pylons_plugin.py b/ckanext/validation/plugin_mixins/pylons_plugin.py index 23f50954..4f4905cb 100644 --- a/ckanext/validation/plugin_mixins/pylons_plugin.py +++ b/ckanext/validation/plugin_mixins/pylons_plugin.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# encoding: utf-8 import ckan.plugins as p diff --git a/ckanext/validation/tests/test_form.py b/ckanext/validation/tests/test_form.py index f39bb8f2..ae73c9ad 100644 --- a/ckanext/validation/tests/test_form.py +++ b/ckanext/validation/tests/test_form.py @@ -1,22 +1,23 @@ # encoding: utf-8 +import datetime import io import json import mock - import pytest +import unittest from bs4 import BeautifulSoup from six import ensure_binary from ckantoolkit import check_ckan_version from ckan.tests import helpers -from ckan.tests.factories import Sysadmin, Dataset +from ckan.tests.factories import Sysadmin, Dataset, Organization from ckan.tests.helpers import ( FunctionalTestBase, call_action ) from ckanext.validation.tests.helpers import ( - VALID_CSV, mock_uploads, + VALID_CSV, INVALID_CSV, mock_uploads, MockFieldStorage ) if check_ckan_version('2.9'): @@ -117,6 +118,7 @@ def test_resource_form_create(self, app): json_value = json.dumps(value) params = { + 'name': 'test_resource_form_create', 'package_id': dataset['id'], 'url': 'https://example.com/data.csv', 'schema': json_value, @@ -140,6 +142,7 @@ def test_resource_form_create_json(self, app): json_value = json.dumps(value) params = { + 'name': 'test_resource_form_create_json', 'package_id': dataset['id'], 'url': 'https://example.com/data.csv', 'schema_json': json_value, @@ -164,6 +167,7 @@ def test_resource_form_create_upload(self, mock_open): upload = ('schema_upload', 'schema.json', json_value) params = { + 'name': 'test_resource_form_create_upload', 'url': 'https://example.com/data.csv', } @@ -180,6 +184,7 @@ def test_resource_form_create_url(self, app): value = 'https://example.com/schemas.json' params = { + 'name': 'test_resource_form_create_url', 'package_id': dataset['id'], 'url': 'https://example.com/data.csv', 'schema_json': value, @@ -225,6 +230,7 @@ def test_resource_form_update(self, app): json_value = json.dumps(value) params = { + 'name': 'test_resource_form_update', 'url': 'https://example.com/data.csv', 'schema': json_value } @@ -270,6 +276,7 @@ def test_resource_form_update_json(self, app): json_value = json.dumps(value) params = { + 'name': 'test_resource_form_update_json', 'url': 'https://example.com/data.csv', 'schema_json': json_value } @@ -306,6 +313,7 @@ def test_resource_form_update_url(self, app): value = 'https://example.com/schema.json' params = { + 'name': 'test_resource_form_update_url', 'url': 'https://example.com/data.csv', 'schema_url': value } @@ -351,6 +359,7 @@ def test_resource_form_update_upload(self, app): upload = ('schema_upload', 'schema.json', json_value) params = { + 'name': 'test_resource_form_update_upload', 'url': 'https://example.com/data.csv', } @@ -383,6 +392,7 @@ def test_resource_form_create(self, app): } json_value = json.dumps(value) params = { + 'name': 'test_resource_form_create', 'url': 'https://example.com/data.csv', 'validation_options': json_value, } @@ -426,6 +436,7 @@ def test_resource_form_update(self, app): json_value = json.dumps(value) params = { + 'name': 'test_resource_form_update', 'url': 'https://example.com/data.csv', 'validation_options': json_value } @@ -456,6 +467,7 @@ def test_resource_form_create_valid(self, mock_open): valid_stream = io.BufferedReader(io.BytesIO(VALID_CSV)) params = { + 'name': 'test_resource_form_create_valid', 'url': 'https://example.com/data.csv' } @@ -468,29 +480,34 @@ def test_resource_form_create_valid(self, mock_open): assert dataset['resources'][0]['validation_status'] == 'success' assert 'validation_timestamp' in dataset['resources'][0] - # @mock_uploads - # def test_resource_form_create_invalid(self, mock_open, app): - # user = Sysadmin() - # org = Organization(user=user) - # dataset = Dataset(owner_org=org['id']) + @unittest.skip("TODO Fix file mocks so this can run") + @pytest.mark.ckan_config('ckanext.validation.run_on_create_sync', True) + @pytest.mark.ckan_config('ckanext.validation.run_on_update_sync', True) + @mock_uploads + def test_resource_form_create_invalid(self, mock_open): + user = Sysadmin() + org = Organization(user=user) + dataset = Dataset(owner_org=org['id']) + + upload = ('upload', 'invalid.csv', INVALID_CSV) - # upload = ('upload', 'invalid.csv', INVALID_CSV) + invalid_stream = io.BufferedReader(io.BytesIO(INVALID_CSV)) - # invalid_stream = io.BufferedReader(io.BytesIO(INVALID_CSV)) + params = { + 'name': 'test_resource_form_create_invalid', + 'url': 'https://example.com/data.csv' + } - # params = { - # 'url': 'https://example.com/data.csv' - # } + with mock.patch('io.open', return_value=invalid_stream): + response = _get_response_body(_post(self.app, NEW_RESOURCE_URL.format(dataset['id']), + params, upload=[upload])) + print(response) + dataset = call_action('package_show', id=dataset['id']) + print(dataset) - # with mock.patch('io.open', return_value=invalid_stream): - # response = _post(app, NEW_RESOURCE_URL.format(dataset['id']), - # params, upload=[upload]) - # print(response.body) - # dataset = call_action('package_show', id=dataset['id']) - # print(dataset) - # assert 'validation' in response.body - # assert 'missing-value' in response.body - # assert 'Row 2 has a missing value in column 4' in response.body + assert 'validation' in response + assert 'missing-value' in response + assert 'Row 2 has a missing value in column 4' in response @pytest.mark.usefixtures("clean_db", "validation_setup") @@ -516,6 +533,7 @@ def test_resource_form_update_valid(self, mock_open): valid_stream = io.BufferedReader(io.BytesIO(VALID_CSV)) params = { + 'name': 'test_resource_form_update_valid', 'url': 'https://example.com/data.csv' } resource_id = dataset['resources'][0]['id'] @@ -529,76 +547,85 @@ def test_resource_form_update_valid(self, mock_open): assert dataset['resources'][0]['validation_status'] == 'success' assert 'validation_timestamp' in dataset['resources'][0] - # @mock_uploads - # def test_resource_form_update_invalid(self, mock_open, app): + @unittest.skip("TODO Fix file mocks so this can run") + @mock_uploads + def test_resource_form_update_invalid(self, mock_open): - # dataset = Dataset(resources=[ - # { - # 'url': 'https://example.com/data.csv' - # } - # ]) - # resource_id = dataset['resources'][0]['id'] + dataset = Dataset( + resources=[{ + 'name': 'test_resource_form_update_invalid', + 'url': 'https://example.com/data.csv' + }] + ) + resource_id = dataset['resources'][0]['id'] - # response = _get_resource_update_page_as_sysadmin( - # app, dataset['id'], resource_id) + response = _get_resource_update_page_as_sysadmin( + self.app, dataset['id'], resource_id) - # upload = ('upload', 'invalid.csv', INVALID_CSV) - # params = {} - # invalid_stream = io.BufferedReader(io.BytesIO(INVALID_CSV)) + upload = ('upload', 'invalid.csv', INVALID_CSV) + params = {} - # with mock.patch('builtins.open', return_value=invalid_stream): - # response = _post(app, EDIT_RESOURCE_URL.format(dataset['id'], - # resource_id), - # params, resource_id=resource_id, upload=[upload]) - # print(response) - # print(dir(response)) - # assert 'validation' in response.body - # assert 'missing-value' in response.body - # assert 'Row 2 has a missing value in column 4' in response.body + invalid_stream = io.BufferedReader(io.BytesIO(INVALID_CSV)) + with mock.patch('io.open', return_value=invalid_stream): + response = _get_response_body(_post( + self.app, EDIT_RESOURCE_URL.format(dataset['id'], resource_id), + params, resource_id=resource_id, upload=[upload])) + print(response) + print(dir(response)) + assert 'validation' in response + assert 'missing-value' in response + assert 'Row 2 has a missing value in column 4' in response -# @pytest.mark.usefixtures("clean_db", "validation_setup") -# class TestResourceValidationFieldsPersisted(FunctionalTestBase): -# @classmethod -# def _apply_config_changes(cls, cfg): -# cfg['ckanext.validation.run_on_create_sync'] = True -# cfg['ckanext.validation.run_on_update_sync'] = True +@pytest.mark.usefixtures("clean_db", "validation_setup") +class TestResourceValidationFieldsPersisted(FunctionalTestBase): -# def setup(self): -# pass + @classmethod + def _apply_config_changes(cls, cfg): + cfg['ckanext.validation.run_on_create_sync'] = True + cfg['ckanext.validation.run_on_update_sync'] = True -# @mock_uploads -# def test_resource_form_fields_are_persisted(self, mock_open, app): -# upload = MockFieldStorage(io.BytesIO(VALID_CSV), 'valid.csv') + def setup(self): + self.app = helpers._get_test_app() + pass -# valid_stream = io.BufferedReader(io.BytesIO(VALID_CSV)) + @mock_uploads + def test_resource_form_fields_are_persisted(self, mock_open): + upload = ('upload', 'valid.csv', VALID_CSV) + upload_file = MockFieldStorage(io.BytesIO(VALID_CSV), filename='data.csv') -# dataset = Dataset() -# with mock.patch('io.open', return_value=valid_stream): -# resource = call_action( -# 'resource_create', -# package_id=dataset['id'], -# validation_status='success', -# validation_timestamp=datetime.datetime.now().isoformat(), -# upload=upload, -# url='data.csv') + valid_stream = io.BufferedReader(io.BytesIO(VALID_CSV)) + + dataset = Dataset() + with mock.patch('io.open', return_value=valid_stream): + resource = call_action( + 'resource_create', + package_id=dataset['id'], + validation_status='success', + validation_timestamp=datetime.datetime.now().isoformat(), + upload=upload_file, + url='data.csv') + resource = call_action('resource_show', id=resource['id']) + assert 'validation_status' in resource + assert resource['validation_status'] == 'success' + assert not resource.get('description') + resource_id = resource['id'] -# assert 'validation_status' in resource -# assert resource['validation_status'] == 'success' -# assert resource.get('description') is None + params = { + 'description': 'test desc' + } -# params = { -# 'description': 'test desc' -# } + dataset = call_action('package_show', id=dataset['id']) -# dataset = call_action('package_show', id=dataset['id']) + valid_stream = io.BufferedReader(io.BytesIO(VALID_CSV)) -# _post(app, EDIT_RESOURCE_URL.format(dataset['id'], resource['id']), -# params, resource_id=resource['id']) + with mock.patch('io.open', return_value=valid_stream): + _post(self.app, EDIT_RESOURCE_URL.format(dataset['id'], resource['id']), + params, resource_id=resource_id, upload=[upload]) -# dataset = call_action('package_show', id=dataset['id']) + dataset = call_action('package_show', id=dataset['id']) -# assert dataset['resources'][0]['validation_status'] == 'success' -# assert 'validation_timestamp' in dataset['resources'][0] -# assert dataset['resources'][0]['description'] == 'test desc' + assert dataset['resources'][0]['validation_status'] == 'success' + assert 'validation_timestamp' in dataset['resources'][0] + assert dataset['resources'][0]['description'] == 'test desc' diff --git a/ckanext/validation/tests/test_interfaces.py b/ckanext/validation/tests/test_interfaces.py index 12bf324f..e64d822b 100644 --- a/ckanext/validation/tests/test_interfaces.py +++ b/ckanext/validation/tests/test_interfaces.py @@ -132,6 +132,7 @@ def test_can_validate_called_on_update_sync_no_validation(self, mock_validation) assert not mock_validation.called +@pytest.mark.usefixtures("validation_setup") class TestInterfaceAsync(BaseTestInterfaces): @classmethod diff --git a/ckanext/validation/tests/test_logic.py b/ckanext/validation/tests/test_logic.py index de0cc70d..51bf5bd8 100644 --- a/ckanext/validation/tests/test_logic.py +++ b/ckanext/validation/tests/test_logic.py @@ -3,6 +3,7 @@ import datetime import io import json +import unittest import mock import pytest @@ -69,17 +70,17 @@ def test_resource_validation_with_upload(self, mock_enqueue_job): call_action('resource_validation_run', resource_id=resource['id']) - # def test_resource_validation_run_starts_job(self): - - # resource = factories.Resource(format='csv') + @unittest.skip("TODO debug this later") + def test_resource_validation_run_starts_job(self): + resource = factories.Resource(format='csv') - # jobs = call_action('job_list') + jobs = call_action('job_list') - # call_action('resource_validation_run', resource_id=resource['id']) + call_action('resource_validation_run', resource_id=resource['id']) - # jobs_after = call_action('job_list') + jobs_after = call_action('job_list') - # assert len(jobs_after) == len(jobs) + 1 + assert len(jobs_after) == len(jobs) + 1 @mock.patch('ckantoolkit.enqueue_job') def test_resource_validation_creates_validation_object( diff --git a/ckanext/validation/tests/test_utils.py b/ckanext/validation/tests/test_utils.py index 1d055504..1dee025e 100644 --- a/ckanext/validation/tests/test_utils.py +++ b/ckanext/validation/tests/test_utils.py @@ -140,7 +140,7 @@ def test_delete_file_not_deleted_if_resources_second(self, mock_open): patcher.tearDown() @mock_uploads - def test_delete_passes_if_os_exeception(self, mock_open): + def test_delete_passes_if_os_exception(self, mock_open): resource_id = str(uuid.uuid4()) path = '/doesnt_exist/resources/{}/{}/{}'.format( From aa188bc4f1ea5c1b6575982067bba6d3a43d9508 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Mon, 25 Jul 2022 13:09:28 +1000 Subject: [PATCH 20/25] [QOL-9122] add unit test for attempting to run validation without sufficient privileges --- ckanext/validation/tests/test_logic.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ckanext/validation/tests/test_logic.py b/ckanext/validation/tests/test_logic.py index 51bf5bd8..f2d9942c 100644 --- a/ckanext/validation/tests/test_logic.py +++ b/ckanext/validation/tests/test_logic.py @@ -241,6 +241,19 @@ def test_resource_validation_delete_removes_object(self): @pytest.mark.usefixtures("clean_db", "validation_setup") class TestAuth(object): + def test_run_anon(self): + + resource = factories.Resource() + + context = { + 'user': None, + 'model': model + } + + with pytest.raises(t.NotAuthorized): + call_auth('resource_validation_run', context=context, + resource_id=resource['id']) + def test_run_sysadmin(self): resource = factories.Resource() sysadmin = factories.Sysadmin() From b85d8dd2ebcd80402d3c9fbcd83f151b3fe8d1e2 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Mon, 25 Jul 2022 13:24:18 +1000 Subject: [PATCH 21/25] [QOL-9122] add template helper to reliably detect CKAN 2.9+ --- ckanext/validation/helpers.py | 8 ++++++++ ckanext/validation/plugin.py | 6 ++++-- ckanext/validation/templates/package/resource_read.html | 2 +- .../templates/package/snippets/resource_item.html | 2 +- .../templates/scheming/form_snippets/resource_schema.html | 2 +- .../validation/snippets/validation_report_dialog.html | 2 +- .../validation/snippets/validation_report_dialog_bs2.html | 2 +- .../validation/templates/validation/validation_read.html | 8 ++++---- 8 files changed, 21 insertions(+), 11 deletions(-) diff --git a/ckanext/validation/helpers.py b/ckanext/validation/helpers.py index fb7f4e2f..1ee66cad 100644 --- a/ckanext/validation/helpers.py +++ b/ckanext/validation/helpers.py @@ -95,3 +95,11 @@ def bootstrap_version(): return '3' else: return '2' + + +def is_ckan_29(): + """ + Returns True if using CKAN 2.9+, with Flask and Webassets. + Returns False if those are not present. + """ + return check_ckan_version(min_version='2.9.0') diff --git a/ckanext/validation/plugin.py b/ckanext/validation/plugin.py index ee440ba0..63c751e5 100644 --- a/ckanext/validation/plugin.py +++ b/ckanext/validation/plugin.py @@ -30,6 +30,7 @@ class DefaultTranslation(): validation_extract_report_from_errors, dump_json_value, bootstrap_version, + is_ckan_29, ) from ckanext.validation.validators import ( resource_schema_validator, @@ -44,7 +45,7 @@ class DefaultTranslation(): log = logging.getLogger(__name__) -if t.check_ckan_version(min_version='2.9.0'): +if is_ckan_29(): from .plugin_mixins.flask_plugin import MixinPlugin else: from .plugin_mixins.pylons_plugin import MixinPlugin @@ -72,7 +73,7 @@ def i18n_directory(self): def update_config(self, config_): if not tables_exist(): - if t.check_ckan_version('2.9'): + if is_ckan_29(): init_command = 'ckan validation init-db' else: init_command = 'paster --plugin=ckanext-validation validation init-db' @@ -123,6 +124,7 @@ def get_helpers(self): u'validation_extract_report_from_errors': validation_extract_report_from_errors, u'dump_json_value': dump_json_value, u'bootstrap_version': bootstrap_version, + u'is_ckan_29': is_ckan_29, } # IResourceController diff --git a/ckanext/validation/templates/package/resource_read.html b/ckanext/validation/templates/package/resource_read.html index ae784ee8..bc02a445 100644 --- a/ckanext/validation/templates/package/resource_read.html +++ b/ckanext/validation/templates/package/resource_read.html @@ -6,7 +6,7 @@

{{ h.resource_display_name(res) | truncate(50) }} {{ h.get_validation_badge(res)|safe }}

- {% set type = 'asset' if h.ckan_version().split('.')[1] | int >= 9 else 'resource' %} + {% set type = 'asset' if h.is_ckan_29() else 'resource' %} {% include 'validation/snippets/validation_style_' ~ type ~ '.html' %} {% endblock %} diff --git a/ckanext/validation/templates/package/snippets/resource_item.html b/ckanext/validation/templates/package/snippets/resource_item.html index b9b47c9e..74f31bf8 100644 --- a/ckanext/validation/templates/package/snippets/resource_item.html +++ b/ckanext/validation/templates/package/snippets/resource_item.html @@ -4,7 +4,7 @@ {{ super() }} {{ h.get_validation_badge(res, in_listing=True)|safe }} -{% set type = 'asset' if h.ckan_version().split('.')[1] | int >= 9 else 'resource' %} +{% set type = 'asset' if h.is_ckan_29() else 'resource' %} {% include 'validation/snippets/validation_style_' ~ type ~ '.html' %} {% endblock %} diff --git a/ckanext/validation/templates/scheming/form_snippets/resource_schema.html b/ckanext/validation/templates/scheming/form_snippets/resource_schema.html index ea3b3150..785664ef 100644 --- a/ckanext/validation/templates/scheming/form_snippets/resource_schema.html +++ b/ckanext/validation/templates/scheming/form_snippets/resource_schema.html @@ -60,7 +60,7 @@ {% set existing_value = h.scheming_display_json_value(value, indent=None) if is_json else value %} - {% set type = 'asset' if h.ckan_version().split('.')[1] | int >= 9 else 'resource' %} + {% set type = 'asset' if h.is_ckan_29() else 'resource' %} {% include 'validation/snippets/validation_resource-schema-form_' ~ type ~ '.html' %} diff --git a/ckanext/validation/templates/validation/snippets/validation_report_dialog.html b/ckanext/validation/templates/validation/snippets/validation_report_dialog.html index fc78cd20..e3c8e18b 100644 --- a/ckanext/validation/templates/validation/snippets/validation_report_dialog.html +++ b/ckanext/validation/templates/validation/snippets/validation_report_dialog.html @@ -14,5 +14,5 @@

-{% set type = 'asset' if h.ckan_version().split('.')[1] | int >= 9 else 'resource' %} +{% set type = 'asset' if h.is_ckan_29() else 'resource' %} {% include 'validation/snippets/validation_report_form_' ~ type ~ '.html' %} diff --git a/ckanext/validation/templates/validation/snippets/validation_report_dialog_bs2.html b/ckanext/validation/templates/validation/snippets/validation_report_dialog_bs2.html index 165d46ea..31255d01 100644 --- a/ckanext/validation/templates/validation/snippets/validation_report_dialog_bs2.html +++ b/ckanext/validation/templates/validation/snippets/validation_report_dialog_bs2.html @@ -8,6 +8,6 @@

{{ _('Data Validation Report') }}

-{% set type = 'asset' if h.ckan_version().split('.')[1] | int >= 9 else 'resource' %} +{% set type = 'asset' if h.is_ckan_29() else 'resource' %} {% include 'validation/snippets/validation_report_form_' ~ type ~ '.html' %} diff --git a/ckanext/validation/templates/validation/validation_read.html b/ckanext/validation/templates/validation/validation_read.html index c557c960..badb34ef 100644 --- a/ckanext/validation/templates/validation/validation_read.html +++ b/ckanext/validation/templates/validation/validation_read.html @@ -1,4 +1,4 @@ -{% set type = 'asset' if h.ckan_version().split('.')[1] | int >= 9 else 'resource' %} +{% set type = 'asset' if h.is_ckan_29() else 'resource' %} {% include 'validation/snippets/validation_style_' ~ type ~ '.html' %} @@ -10,10 +10,10 @@ {% block breadcrumb_content %} {{ super() }} - {% if h.ckan_version().split('.')[1] | int >= 9 %} -
  • {{ h.resource_display_name(resource)|truncate(30) }}
  • + {% if h.is_ckan_29() %} +
  • {{ h.resource_display_name(resource)|truncate(30) }}
  • {% else %} -
  • {{ h.resource_display_name(resource)|truncate(30) }}
  • +
  • {{ h.resource_display_name(resource)|truncate(30) }}
  • {% endif %}
  • Validation Report
  • {% endblock %} From 79a16688b62ef89b9921d896b4b7176e05787d38 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Mon, 25 Jul 2022 13:41:08 +1000 Subject: [PATCH 22/25] [QOL-9122] add test for avoiding unnecessary validation - Updating a dataset via the web UI should not trigger a validation job --- ckanext/validation/tests/test_form.py | 9 +++++---- ckanext/validation/tests/test_plugin.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/ckanext/validation/tests/test_form.py b/ckanext/validation/tests/test_form.py index ae73c9ad..52d42491 100644 --- a/ckanext/validation/tests/test_form.py +++ b/ckanext/validation/tests/test_form.py @@ -563,7 +563,9 @@ def test_resource_form_update_invalid(self, mock_open): self.app, dataset['id'], resource_id) upload = ('upload', 'invalid.csv', INVALID_CSV) - params = {} + params = { + 'name': 'test_resource_form_update_invalid', + } invalid_stream = io.BufferedReader(io.BytesIO(INVALID_CSV)) @@ -613,11 +615,10 @@ def test_resource_form_fields_are_persisted(self, mock_open): resource_id = resource['id'] params = { - 'description': 'test desc' + 'name': 'test_resource_form_fields_are_persisted', + 'description': 'test desc', } - dataset = call_action('package_show', id=dataset['id']) - valid_stream = io.BufferedReader(io.BytesIO(VALID_CSV)) with mock.patch('io.open', return_value=valid_stream): diff --git a/ckanext/validation/tests/test_plugin.py b/ckanext/validation/tests/test_plugin.py index 3876114a..869e3446 100644 --- a/ckanext/validation/tests/test_plugin.py +++ b/ckanext/validation/tests/test_plugin.py @@ -326,3 +326,17 @@ def test_validation_does_not_run_when_config_false(self, mock_enqueue): call_action('package_update', {}, **dataset) mock_enqueue.assert_not_called() + + @change_config('ckanext.validation.run_on_create_async', False) + @mock.patch('ckantoolkit.enqueue_job') + def test_validation_does_not_run_when_editing_via_web_form(self, mock_enqueue): + resource = { + 'id': 'test-resource-id', + 'format': 'CSV', + 'url': 'http://some.data' + } + dataset = factories.Dataset(resources=[resource]) + + call_action('package_update', context={'save': True}, **dataset) + + mock_enqueue.assert_not_called() From 6bdf4ab46ef4b9f3a43070bf7e0da061635b7f8f Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Mon, 25 Jul 2022 14:00:54 +1000 Subject: [PATCH 23/25] [QOL-9122] make custom actions more robust - check for asynchronicity at runtime so we can always hook our action in - use chained actions so we don't block other customisation --- ckanext/validation/logic.py | 66 +++++++++++++++++++----------------- ckanext/validation/plugin.py | 11 ++---- 2 files changed, 38 insertions(+), 39 deletions(-) diff --git a/ckanext/validation/logic.py b/ckanext/validation/logic.py index 0d3a5f04..01e1a022 100644 --- a/ckanext/validation/logic.py +++ b/ckanext/validation/logic.py @@ -408,7 +408,8 @@ def _validation_dictize(validation): return out -def resource_create(context, data_dict): +@t.chained_action +def resource_create(original_action, context, data_dict): '''Appends a new resource to a datasets list of resources. This is duplicate of the CKAN core resource_create action, with just the @@ -420,6 +421,9 @@ def resource_create(context, data_dict): points that will allow a better approach. ''' + if get_create_mode_from_config() != u'sync': + return original_action(context, data_dict) + model = context['model'] package_id = t.get_or_bust(data_dict, 'package_id') @@ -469,22 +473,20 @@ def resource_create(context, data_dict): # Custom code starts - if get_create_mode_from_config() == u'sync': - - run_validation = True + run_validation = True - for plugin in plugins.PluginImplementations(IDataValidation): - if not plugin.can_validate(context, data_dict): - log.debug('Skipping validation for resource {}'.format(resource_id)) - run_validation = False + for plugin in plugins.PluginImplementations(IDataValidation): + if not plugin.can_validate(context, data_dict): + log.debug('Skipping validation for resource %s', resource_id) + run_validation = False - if run_validation: - is_local_upload = ( - hasattr(upload, 'filename') - and upload.filename is not None - and isinstance(upload, uploader.ResourceUpload)) - _run_sync_validation( - resource_id, local_upload=is_local_upload, new_resource=True) + if run_validation: + is_local_upload = ( + hasattr(upload, 'filename') + and upload.filename is not None + and isinstance(upload, uploader.ResourceUpload)) + _run_sync_validation( + resource_id, local_upload=is_local_upload, new_resource=True) # Custom code ends @@ -511,7 +513,8 @@ def resource_create(context, data_dict): return resource -def resource_update(context, data_dict): +@t.chained_action +def resource_update(original_action, context, data_dict): '''Update a resource. This is duplicate of the CKAN core resource_update action, with just the @@ -523,6 +526,9 @@ def resource_update(context, data_dict): points that will allow a better approach. ''' + if get_update_mode_from_config() != u'sync': + return original_action(context, data_dict) + model = context['model'] id = t.get_or_bust(data_dict, "id") @@ -586,21 +592,19 @@ def resource_update(context, data_dict): # Custom code starts - if get_update_mode_from_config() == u'sync': - - run_validation = True - for plugin in plugins.PluginImplementations(IDataValidation): - if not plugin.can_validate(context, data_dict): - log.debug('Skipping validation for resource {}'.format(id)) - run_validation = False - - if run_validation: - is_local_upload = ( - hasattr(upload, 'filename') - and upload.filename is not None - and isinstance(upload, uploader.ResourceUpload)) - _run_sync_validation( - id, local_upload=is_local_upload, new_resource=True) + run_validation = True + for plugin in plugins.PluginImplementations(IDataValidation): + if not plugin.can_validate(context, data_dict): + log.debug('Skipping validation for resource %s', id) + run_validation = False + + if run_validation: + is_local_upload = ( + hasattr(upload, 'filename') + and upload.filename is not None + and isinstance(upload, uploader.ResourceUpload)) + _run_sync_validation( + id, local_upload=is_local_upload, new_resource=False) # Custom code ends diff --git a/ckanext/validation/plugin.py b/ckanext/validation/plugin.py index 63c751e5..36269ed9 100644 --- a/ckanext/validation/plugin.py +++ b/ckanext/validation/plugin.py @@ -92,20 +92,15 @@ def update_config(self, config_): # IActions def get_actions(self): - new_actions = { + return { u'resource_validation_run': resource_validation_run, u'resource_validation_show': resource_validation_show, u'resource_validation_delete': resource_validation_delete, u'resource_validation_run_batch': resource_validation_run_batch, + u'resource_create': custom_resource_create, + u'resource_update': custom_resource_update } - if get_create_mode_from_config() == u'sync': - new_actions[u'resource_create'] = custom_resource_create - if get_update_mode_from_config() == u'sync': - new_actions[u'resource_update'] = custom_resource_update - - return new_actions - # IAuthFunctions def get_auth_functions(self): From 013c17c58b1a0b35de74b50a8a67ec40a18054b8 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Mon, 25 Jul 2022 14:24:10 +1000 Subject: [PATCH 24/25] [QOL-9122] skip validation on package_patch API - Unless package_patch provides an update to the resources, there's no need for validation --- ckanext/validation/logic.py | 13 +++++++++++++ ckanext/validation/plugin.py | 2 ++ ckanext/validation/tests/test_logic.py | 16 ++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/ckanext/validation/logic.py b/ckanext/validation/logic.py index 01e1a022..3793207d 100644 --- a/ckanext/validation/logic.py +++ b/ckanext/validation/logic.py @@ -664,3 +664,16 @@ def _run_sync_validation(resource_id, local_upload=False, new_resource=True): raise t.ValidationError({ u'validation': [report]}) + + +@t.chained_action +def package_patch(original_action, context, data_dict): + ''' Detect whether resources have been replaced, and if not, + place a flag in the context accordingly if save flag is not set + + Note: controllers add default context where save is in request params + 'save': 'save' in request.params + ''' + if 'save' not in context and 'resources' not in data_dict: + context['save'] = True + original_action(context, data_dict) diff --git a/ckanext/validation/plugin.py b/ckanext/validation/plugin.py index 36269ed9..2b59d5a1 100644 --- a/ckanext/validation/plugin.py +++ b/ckanext/validation/plugin.py @@ -24,6 +24,7 @@ class DefaultTranslation(): auth_resource_validation_delete, auth_resource_validation_run_batch, resource_create as custom_resource_create, resource_update as custom_resource_update, + package_patch ) from ckanext.validation.helpers import ( get_validation_badge, @@ -97,6 +98,7 @@ def get_actions(self): u'resource_validation_show': resource_validation_show, u'resource_validation_delete': resource_validation_delete, u'resource_validation_run_batch': resource_validation_run_batch, + u'package_patch': package_patch, u'resource_create': custom_resource_create, u'resource_update': custom_resource_update } diff --git a/ckanext/validation/tests/test_logic.py b/ckanext/validation/tests/test_logic.py index f2d9942c..4383d525 100644 --- a/ckanext/validation/tests/test_logic.py +++ b/ckanext/validation/tests/test_logic.py @@ -736,3 +736,19 @@ def test_validation_options_field_string(self): ) assert resource['validation_options'] == json.loads(validation_options) + + +@pytest.mark.usefixtures("clean_db", "validation_setup") +class TestPackageUpdate(object): + + def test_package_patch_without_resources_sets_context_flag(self): + dataset = factories.Dataset() + context = {} + call_action('package_patch', context=context, id=dataset['id']) + assert context.get('save', False) + + def test_package_patch_with_resources_does_not_set_context_flag(self): + dataset = factories.Dataset() + context = {} + call_action('package_patch', context=context, id=dataset['id'], resources=[]) + assert 'save' not in context From fcf353ff7ec7e3b547a9038e4926e66095b7c4a5 Mon Sep 17 00:00:00 2001 From: ThrawnCA Date: Mon, 25 Jul 2022 14:35:05 +1000 Subject: [PATCH 25/25] [QOL-9122] drop TODO that we don't need to implement - Synchronous validation doesn't need to pass just an ID, full dict is fine --- ckanext/validation/logic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ckanext/validation/logic.py b/ckanext/validation/logic.py index 3793207d..20c11173 100644 --- a/ckanext/validation/logic.py +++ b/ckanext/validation/logic.py @@ -128,7 +128,6 @@ def resource_validation_run(context, data_dict): package_id = resource['package_id'] enqueue_validation_job(package_id, resource_id) else: - # run_validation_job(resource_id) # TODO only pass resource_id, tests need to be fixed for this run_validation_job(resource)