From d6ec66158b7c658cc7f601da5e725a87196e25b4 Mon Sep 17 00:00:00 2001 From: baarkerlounger Date: Wed, 9 Oct 2024 15:04:58 +0100 Subject: [PATCH 1/5] Great URL can be null --- .../migrations/0006_alter_great_url.py | 18 +++++++ datahub/company_activity/models/great.py | 2 +- .../tasks/ingest_great_data.py | 2 +- .../test_tasks/test_great_ingestion_task.py | 53 ++++++++----------- 4 files changed, 42 insertions(+), 33 deletions(-) create mode 100644 datahub/company_activity/migrations/0006_alter_great_url.py diff --git a/datahub/company_activity/migrations/0006_alter_great_url.py b/datahub/company_activity/migrations/0006_alter_great_url.py new file mode 100644 index 000000000..a5c9934c4 --- /dev/null +++ b/datahub/company_activity/migrations/0006_alter_great_url.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.15 on 2024-10-09 13:40 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("company_activity", "0005_alter_great_actor_dit_email_address_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="great", + name="url", + field=models.CharField(max_length=255, null=True), + ), + ] diff --git a/datahub/company_activity/models/great.py b/datahub/company_activity/models/great.py index 86529252c..e59fc1931 100644 --- a/datahub/company_activity/models/great.py +++ b/datahub/company_activity/models/great.py @@ -27,7 +27,7 @@ class Great(models.Model): attributed_to_type = models.CharField(max_length=MAX_LENGTH) attributed_to_id = models.CharField(max_length=MAX_LENGTH) - url = models.CharField(max_length=MAX_LENGTH) + url = models.CharField(max_length=MAX_LENGTH, null=True) meta_action_name = models.CharField(max_length=MAX_LENGTH) meta_template_id = models.CharField(max_length=MAX_LENGTH) meta_email_address = models.CharField(max_length=MAX_LENGTH) diff --git a/datahub/company_activity/tasks/ingest_great_data.py b/datahub/company_activity/tasks/ingest_great_data.py index 28c54d644..918e5279b 100644 --- a/datahub/company_activity/tasks/ingest_great_data.py +++ b/datahub/company_activity/tasks/ingest_great_data.py @@ -88,7 +88,7 @@ def json_to_model(self, jsn): actor_type = actor['type'].split(':')[-1] values = { 'published': obj['published'], - 'url': obj.get('url', ''), + 'url': obj.get('url', None), 'attributed_to_type': attributed_to_type, 'attributed_to_id': attributed_to_id, diff --git a/datahub/company_activity/tests/test_tasks/test_great_ingestion_task.py b/datahub/company_activity/tests/test_tasks/test_great_ingestion_task.py index ad106873a..0a7efeef2 100644 --- a/datahub/company_activity/tests/test_tasks/test_great_ingestion_task.py +++ b/datahub/company_activity/tests/test_tasks/test_great_ingestion_task.py @@ -115,7 +115,7 @@ def test_skip_unchanged_records(self, test_file_path): object={ 'id': 'dit:directoryFormsApi:Submission:5249', 'published': yesterday, - 'attributedT': { + 'attributedTo': { 'type': 'dit:directoryFormsApi:SubmissionAction:gov-notify-email', 'id': 'dit:directoryFormsApi:SubmissionType:export-support-service', }, @@ -156,38 +156,9 @@ def test_invalid_country_code(self): "object": { "id": "dit:directoryFormsApi:Submission:5249", "published": "2024-09-19T14:00:34.069Z", - "attributedTo": { - "type": "dit:directoryFormsApi:SubmissionAction:gov-notify-email", - "id": "dit:directoryFormsApi:SubmissionType:export-support-service" - }, - "url": "https://kane.net/", - "dit:directoryFormsApi:Submission:Meta": { - "action_name": "gov-notify-email", - "template_id": "76f12003-74e8-4e6b-bbe9-8edc1b8619ae", - "email_address": "brownalexandra@example.com" - }, "dit:directoryFormsApi:Submission:Data": { - "comment": "Issue why why morning save parent southern.", - "country": "ZZ", - "full_name": "Tina Gray", - "website_url": "https://www.henderson-thomas.info/", - "company_name": "Foster, Murphy and Diaz", - "company_size": "1 - 10", - "phone_number": "12345678", - "terms_agreed": true, - "email_address": "ericwilliams@example.com", - "opportunities": ["https://white.net/app/tagscategory.php"], - "role_in_company": "test", - "opportunity_urls": "https://www.brown-andrade.com/wp-content/tagfaq.htm" + "country": "ZZ" } - }, - "actor": { - "type": "dit:directoryFormsApi:Submission:Sender", - "id": "dit:directoryFormsApi:Sender:1041", - "dit:emailAddress": "crystalbrock@example.org", - "dit:isBlacklisted": true, - "dit:isWhitelisted": false, - "dit:blackListedReason": null } } """ @@ -201,3 +172,23 @@ def test_invalid_country_code(self): expected_message = 'Could not match country with iso code: ZZ, ' + \ 'for form: 5249' assert sentry_event['logentry']['message'] == expected_message + + @pytest.mark.django_db + @mock_aws + def test_null_url(self, test_file_path): + """ + Test that we can ingest records with URL field null + """ + initial_count = Great.objects.count() + data = """ + { + "object": { + "id": "dit:directoryFormsApi:Submission:5249", + "published": "2024-09-19T14:00:34.069Z", + "url": null + } + } + """ + task = GreatIngestionTask() + task.json_to_model(json.loads(data)) + assert Great.objects.count() == initial_count + 1 From 4c0d829435b728f6b1fe7164a8133fb5a01c4eba Mon Sep 17 00:00:00 2001 From: baarkerlounger Date: Thu, 10 Oct 2024 10:38:52 +0100 Subject: [PATCH 2/5] Fix date format --- .../company_activity/tasks/ingest_great_data.py | 2 +- .../tests/test_tasks/test_great_ingestion_task.py | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/datahub/company_activity/tasks/ingest_great_data.py b/datahub/company_activity/tasks/ingest_great_data.py index 918e5279b..bbf271bb1 100644 --- a/datahub/company_activity/tasks/ingest_great_data.py +++ b/datahub/company_activity/tasks/ingest_great_data.py @@ -13,7 +13,7 @@ logger = logging.getLogger(__name__) env = environ.Env() REGION = env('AWS_DEFAULT_REGION', default='eu-west-2') -DATE_FORMAT = '%Y-%m-%d %H:%M:%S.%f' +DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ' def ingest_great_data(bucket, file): diff --git a/datahub/company_activity/tests/test_tasks/test_great_ingestion_task.py b/datahub/company_activity/tests/test_tasks/test_great_ingestion_task.py index 0a7efeef2..43b93e54a 100644 --- a/datahub/company_activity/tests/test_tasks/test_great_ingestion_task.py +++ b/datahub/company_activity/tests/test_tasks/test_great_ingestion_task.py @@ -14,7 +14,7 @@ from datahub.company_activity.models import Great, IngestedFile from datahub.company_activity.tasks.ingest_company_activity import BUCKET, GREAT_PREFIX from datahub.company_activity.tasks.ingest_great_data import ( - GreatIngestionTask, ingest_great_data, REGION, + DATE_FORMAT, GreatIngestionTask, ingest_great_data, REGION, ) from datahub.company_activity.tests.factories import ( CompanyActivityGreatFactory, @@ -109,7 +109,7 @@ def test_skip_unchanged_records(self, test_file_path): Test that we skip updating records whose published date is older than the last file ingestion date """ - yesterday = datetime.now() - timedelta(1) + yesterday = datetime.strftime(datetime.now() - timedelta(1), DATE_FORMAT) CompanyActivityIngestedFileFactory(created_on=datetime.now()) record = json.dumps(dict( object={ @@ -192,3 +192,14 @@ def test_null_url(self, test_file_path): task = GreatIngestionTask() task.json_to_model(json.loads(data)) assert Great.objects.count() == initial_count + 1 + data = """ + { + "object": { + "id": "dit:directoryFormsApi:Submission:5250", + "published": "2024-09-19T15:00:34.069Z" + } + } + """ + task = GreatIngestionTask() + task.json_to_model(json.loads(data)) + assert Great.objects.count() == initial_count + 2 From a318f6ad5dbc39a6a7a400c7f06f244555194ade Mon Sep 17 00:00:00 2001 From: baarkerlounger Date: Thu, 10 Oct 2024 11:59:13 +0100 Subject: [PATCH 3/5] Django convention is empty string over null --- ...t_actor_dit_blacklisted_reason_and_more.py | 33 +++++++++++++++++++ .../migrations/0006_alter_great_url.py | 18 ---------- datahub/company_activity/models/great.py | 8 ++--- .../tasks/ingest_great_data.py | 9 +++-- 4 files changed, 41 insertions(+), 27 deletions(-) create mode 100644 datahub/company_activity/migrations/0006_alter_great_actor_dit_blacklisted_reason_and_more.py delete mode 100644 datahub/company_activity/migrations/0006_alter_great_url.py diff --git a/datahub/company_activity/migrations/0006_alter_great_actor_dit_blacklisted_reason_and_more.py b/datahub/company_activity/migrations/0006_alter_great_actor_dit_blacklisted_reason_and_more.py new file mode 100644 index 000000000..5da12a795 --- /dev/null +++ b/datahub/company_activity/migrations/0006_alter_great_actor_dit_blacklisted_reason_and_more.py @@ -0,0 +1,33 @@ +# Generated by Django 4.2.15 on 2024-10-10 10:45 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("company_activity", "0005_alter_great_actor_dit_email_address_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="great", + name="actor_dit_blacklisted_reason", + field=models.CharField(default="", max_length=255), + ), + migrations.AlterField( + model_name="great", + name="actor_dit_email_address", + field=models.CharField(default="", max_length=255), + ), + migrations.AlterField( + model_name="great", + name="actor_type", + field=models.CharField(default="", max_length=255), + ), + migrations.AlterField( + model_name="great", + name="url", + field=models.CharField(default="", max_length=255), + ), + ] diff --git a/datahub/company_activity/migrations/0006_alter_great_url.py b/datahub/company_activity/migrations/0006_alter_great_url.py deleted file mode 100644 index a5c9934c4..000000000 --- a/datahub/company_activity/migrations/0006_alter_great_url.py +++ /dev/null @@ -1,18 +0,0 @@ -# Generated by Django 4.2.15 on 2024-10-09 13:40 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("company_activity", "0005_alter_great_actor_dit_email_address_and_more"), - ] - - operations = [ - migrations.AlterField( - model_name="great", - name="url", - field=models.CharField(max_length=255, null=True), - ), - ] diff --git a/datahub/company_activity/models/great.py b/datahub/company_activity/models/great.py index e59fc1931..b079499db 100644 --- a/datahub/company_activity/models/great.py +++ b/datahub/company_activity/models/great.py @@ -27,7 +27,7 @@ class Great(models.Model): attributed_to_type = models.CharField(max_length=MAX_LENGTH) attributed_to_id = models.CharField(max_length=MAX_LENGTH) - url = models.CharField(max_length=MAX_LENGTH, null=True) + url = models.CharField(max_length=MAX_LENGTH, default='') meta_action_name = models.CharField(max_length=MAX_LENGTH) meta_template_id = models.CharField(max_length=MAX_LENGTH) meta_email_address = models.CharField(max_length=MAX_LENGTH) @@ -53,11 +53,11 @@ class Great(models.Model): # This is a duplicate of opportunities but in the form of a '\n' delimited string data_opportunity_urls = models.CharField(max_length=MAX_LENGTH) - actor_type = models.CharField(max_length=MAX_LENGTH, null=True) + actor_type = models.CharField(max_length=MAX_LENGTH, default='') actor_id = models.IntegerField(null=True) - actor_dit_email_address = models.CharField(max_length=MAX_LENGTH, null=True) + actor_dit_email_address = models.CharField(max_length=MAX_LENGTH, default='') actor_dit_is_blacklisted = models.BooleanField(null=True) actor_dit_is_whitelisted = models.BooleanField(null=True) - actor_dit_blacklisted_reason = models.CharField(max_length=MAX_LENGTH, null=True) + actor_dit_blacklisted_reason = models.CharField(max_length=MAX_LENGTH, default='') created_on = models.DateTimeField(auto_now_add=True) diff --git a/datahub/company_activity/tasks/ingest_great_data.py b/datahub/company_activity/tasks/ingest_great_data.py index bbf271bb1..be425f7b0 100644 --- a/datahub/company_activity/tasks/ingest_great_data.py +++ b/datahub/company_activity/tasks/ingest_great_data.py @@ -82,13 +82,12 @@ def json_to_model(self, jsn): actor = jsn.get('actor', {}) if not actor: actor_id = None - actor_type = None else: actor_id = actor['id'].split(':')[-1] - actor_type = actor['type'].split(':')[-1] + actor_type = actor.get('type', '').split(':')[-1] values = { 'published': obj['published'], - 'url': obj.get('url', None), + 'url': obj.get('url', ''), 'attributed_to_type': attributed_to_type, 'attributed_to_id': attributed_to_id, @@ -112,10 +111,10 @@ def json_to_model(self, jsn): 'actor_type': actor_type, 'actor_id': actor_id, - 'actor_dit_email_address': actor.get('dit:emailAddress', None), + 'actor_dit_email_address': actor.get('dit:emailAddress', ''), 'actor_dit_is_blacklisted': actor.get('dit:isBlacklisted', None), 'actor_dit_is_whitelisted': actor.get('dit:isWhitelisted', None), - 'actor_dit_blacklisted_reason': actor.get('dit:blackListedReason', None), + 'actor_dit_blacklisted_reason': str(actor.get('dit:blackListedReason', '') or ''), } Great.objects.update_or_create( form_id=form_id, From 65f3fcafc251cb857e2522b84e749a3e130616a5 Mon Sep 17 00:00:00 2001 From: baarkerlounger Date: Thu, 10 Oct 2024 13:49:44 +0100 Subject: [PATCH 4/5] Extract var --- datahub/company_activity/tasks/ingest_great_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datahub/company_activity/tasks/ingest_great_data.py b/datahub/company_activity/tasks/ingest_great_data.py index be425f7b0..b7702f951 100644 --- a/datahub/company_activity/tasks/ingest_great_data.py +++ b/datahub/company_activity/tasks/ingest_great_data.py @@ -85,6 +85,7 @@ def json_to_model(self, jsn): else: actor_id = actor['id'].split(':')[-1] actor_type = actor.get('type', '').split(':')[-1] + actor_blacklisted_reason = str(actor.get('dit:blackListedReason', '') or '') values = { 'published': obj['published'], 'url': obj.get('url', ''), @@ -114,7 +115,7 @@ def json_to_model(self, jsn): 'actor_dit_email_address': actor.get('dit:emailAddress', ''), 'actor_dit_is_blacklisted': actor.get('dit:isBlacklisted', None), 'actor_dit_is_whitelisted': actor.get('dit:isWhitelisted', None), - 'actor_dit_blacklisted_reason': str(actor.get('dit:blackListedReason', '') or ''), + 'actor_dit_blacklisted_reason': actor_blacklisted_reason, } Great.objects.update_or_create( form_id=form_id, From 6a0254f1064756349afd97f0cbfc13ad677bda62 Mon Sep 17 00:00:00 2001 From: baarkerlounger Date: Thu, 10 Oct 2024 14:40:02 +0100 Subject: [PATCH 5/5] Default to empty string when key missing --- ...t_actor_dit_blacklisted_reason_and_more.py | 33 ------------------- datahub/company_activity/models/great.py | 8 ++--- .../tasks/ingest_great_data.py | 2 +- 3 files changed, 5 insertions(+), 38 deletions(-) delete mode 100644 datahub/company_activity/migrations/0006_alter_great_actor_dit_blacklisted_reason_and_more.py diff --git a/datahub/company_activity/migrations/0006_alter_great_actor_dit_blacklisted_reason_and_more.py b/datahub/company_activity/migrations/0006_alter_great_actor_dit_blacklisted_reason_and_more.py deleted file mode 100644 index 5da12a795..000000000 --- a/datahub/company_activity/migrations/0006_alter_great_actor_dit_blacklisted_reason_and_more.py +++ /dev/null @@ -1,33 +0,0 @@ -# Generated by Django 4.2.15 on 2024-10-10 10:45 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("company_activity", "0005_alter_great_actor_dit_email_address_and_more"), - ] - - operations = [ - migrations.AlterField( - model_name="great", - name="actor_dit_blacklisted_reason", - field=models.CharField(default="", max_length=255), - ), - migrations.AlterField( - model_name="great", - name="actor_dit_email_address", - field=models.CharField(default="", max_length=255), - ), - migrations.AlterField( - model_name="great", - name="actor_type", - field=models.CharField(default="", max_length=255), - ), - migrations.AlterField( - model_name="great", - name="url", - field=models.CharField(default="", max_length=255), - ), - ] diff --git a/datahub/company_activity/models/great.py b/datahub/company_activity/models/great.py index b079499db..86529252c 100644 --- a/datahub/company_activity/models/great.py +++ b/datahub/company_activity/models/great.py @@ -27,7 +27,7 @@ class Great(models.Model): attributed_to_type = models.CharField(max_length=MAX_LENGTH) attributed_to_id = models.CharField(max_length=MAX_LENGTH) - url = models.CharField(max_length=MAX_LENGTH, default='') + url = models.CharField(max_length=MAX_LENGTH) meta_action_name = models.CharField(max_length=MAX_LENGTH) meta_template_id = models.CharField(max_length=MAX_LENGTH) meta_email_address = models.CharField(max_length=MAX_LENGTH) @@ -53,11 +53,11 @@ class Great(models.Model): # This is a duplicate of opportunities but in the form of a '\n' delimited string data_opportunity_urls = models.CharField(max_length=MAX_LENGTH) - actor_type = models.CharField(max_length=MAX_LENGTH, default='') + actor_type = models.CharField(max_length=MAX_LENGTH, null=True) actor_id = models.IntegerField(null=True) - actor_dit_email_address = models.CharField(max_length=MAX_LENGTH, default='') + actor_dit_email_address = models.CharField(max_length=MAX_LENGTH, null=True) actor_dit_is_blacklisted = models.BooleanField(null=True) actor_dit_is_whitelisted = models.BooleanField(null=True) - actor_dit_blacklisted_reason = models.CharField(max_length=MAX_LENGTH, default='') + actor_dit_blacklisted_reason = models.CharField(max_length=MAX_LENGTH, null=True) created_on = models.DateTimeField(auto_now_add=True) diff --git a/datahub/company_activity/tasks/ingest_great_data.py b/datahub/company_activity/tasks/ingest_great_data.py index b7702f951..f1b436cf0 100644 --- a/datahub/company_activity/tasks/ingest_great_data.py +++ b/datahub/company_activity/tasks/ingest_great_data.py @@ -88,7 +88,7 @@ def json_to_model(self, jsn): actor_blacklisted_reason = str(actor.get('dit:blackListedReason', '') or '') values = { 'published': obj['published'], - 'url': obj.get('url', ''), + 'url': str(obj.get('url', '') or ''), 'attributed_to_type': attributed_to_type, 'attributed_to_id': attributed_to_id,