From 99ab84a675297cdcfc9619246ea27e548f964ae3 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Mon, 14 Oct 2024 11:34:45 -0700 Subject: [PATCH 1/4] Content harvest removes tmp filepaths if any exist --- content_harvester/by_record.py | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/content_harvester/by_record.py b/content_harvester/by_record.py index da82ad4c..5f83811e 100644 --- a/content_harvester/by_record.py +++ b/content_harvester/by_record.py @@ -113,6 +113,13 @@ def harvest_record_content( record['calisphere-id'] ) + tmp_files = list(set( + record.get('media', {}).pop('tmp_filepaths', []) + + record.get('thumbnail', {}).pop('tmp_filepaths', []) + )) + if tmp_files: + [os.remove(filepath) for filepath in tmp_files] + return record @@ -208,6 +215,7 @@ def check_component_cache( ): if not persistent_cache: + # always a cache miss, also don't cache this return { **create_component(collection_id, request, *args, **kwargs), 'from-cache': False @@ -224,6 +232,7 @@ def check_component_cache( f"{component_type}: No ETag or Last-Modified headers, " "skipping cache check" ) + # always a cache miss, also don't cache this return { **create_component(collection_id, request, *args, **kwargs), 'from-cache': False @@ -251,8 +260,14 @@ def check_component_cache( collection_id, request, *args, **kwargs) print(f"Created {component_type} component for {request.url}") # set cache key to the component + # but don't cache tmp filepaths + tmp_files = component.pop('tmp_filepaths') persistent_cache[cache_key] = component - return {**component, 'from-cache': False} + return { + **component, + 'from-cache': False, + 'tmp_filepaths': tmp_files + } return check_component_cache return inner @@ -298,6 +313,7 @@ def create_media_component( return None media_tmp_filepath = source_component['path'] + tmp_filepaths = [media_tmp_filepath] mapped_mimetype = mapped_media_source.get('mimetype') mapped_nuxeotype = mapped_media_source.get('nuxeo_type') @@ -311,7 +327,7 @@ def create_media_component( derivative_filepath, f"jp2/{collection_id}/{jp2_destination_filename}" ) - os.remove(derivative_filepath) + tmp_filepaths.append(derivative_filepath) mimetype = 'image/jp2' else: content_s3_filepath = upload_content( @@ -330,7 +346,8 @@ def create_media_component( 'mapped_mimetype': mapped_mimetype, 'src_size': source_component['size'], 'date_content_component_created': datetime.now().isoformat() - } + }, + 'tmp_filepaths': tmp_filepaths } return media_component @@ -351,6 +368,7 @@ def create_thumbnail_component( return None thumbnail_tmp_filepath = source_component['path'] + tmp_filepaths = [thumbnail_tmp_filepath] thumbnail_md5 = source_component['md5'] content_s3_filepath = None @@ -369,7 +387,7 @@ def create_thumbnail_component( derivative_filepath, f"thumbnails/{collection_id}/{thumbnail_md5}" ) dimensions = get_dimensions(derivative_filepath, record_context) - os.remove(derivative_filepath) + tmp_filepaths.append(derivative_filepath) elif mapped_mimetype in ['video/mp4','video/quicktime']: derivative_filepath = derivatives.video_to_thumb(thumbnail_tmp_filepath) if derivative_filepath: @@ -377,7 +395,7 @@ def create_thumbnail_component( derivative_filepath, f"thumbnails/{collection_id}/{thumbnail_md5}" ) dimensions = get_dimensions(derivative_filepath, record_context) - os.remove(derivative_filepath) + tmp_filepaths.append(derivative_filepath) thumbnail_component = { 'mimetype': 'image/jpeg', @@ -389,12 +407,10 @@ def create_thumbnail_component( 'mapped_mimetype': mapped_thumbnail_source.get('mimetype'), 'src_size': source_component['size'], 'date_content_component_created': datetime.now().isoformat() - } + }, + 'tmp_filepaths': tmp_filepaths } - if os.path.exists(thumbnail_tmp_filepath): - os.remove(thumbnail_tmp_filepath) - return thumbnail_component From 5024a2cb8030797cbe5e5cdaca097dd120a0cf8f Mon Sep 17 00:00:00 2001 From: Barbara Hui Date: Tue, 15 Oct 2024 14:58:14 -0700 Subject: [PATCH 2/4] Return a tuple of (component, tmpfiles) from create_component functions --- content_harvester/by_record.py | 57 ++++++++++++---------------------- 1 file changed, 20 insertions(+), 37 deletions(-) diff --git a/content_harvester/by_record.py b/content_harvester/by_record.py index 5f83811e..7078c409 100644 --- a/content_harvester/by_record.py +++ b/content_harvester/by_record.py @@ -1,7 +1,7 @@ import hashlib import os from datetime import datetime -from typing import Optional, Any +from typing import Optional, Any, Tuple from functools import lru_cache from dataclasses import dataclass, asdict from urllib.parse import quote_plus @@ -94,7 +94,7 @@ def harvest_record_content( media_source_url = media_source.get('url') if media_source_url: request = ContentRequest(media_source_url, auth) - record['media'] = create_media_component( + record['media'], tmp_files = create_media_component( collection_id, request, media_source) thumbnail_src = record.get( @@ -106,19 +106,14 @@ def harvest_record_content( thumbnail_src_url = thumbnail_src.get('url') if thumbnail_src_url: request = ContentRequest(thumbnail_src_url, auth) - record['thumbnail'] = create_thumbnail_component( + record['thumbnail'], tmp_files = create_thumbnail_component( collection_id, request, thumbnail_src, record['calisphere-id'] ) - tmp_files = list(set( - record.get('media', {}).pop('tmp_filepaths', []) + - record.get('thumbnail', {}).pop('tmp_filepaths', []) - )) - if tmp_files: - [os.remove(filepath) for filepath in tmp_files] + [os.remove(filepath) for filepath in tmp_files] return record @@ -215,11 +210,9 @@ def check_component_cache( ): if not persistent_cache: - # always a cache miss, also don't cache this - return { - **create_component(collection_id, request, *args, **kwargs), - 'from-cache': False - } + # always a cache miss, don't cache this + component, tmp_files = create_component(collection_id, request, *args, **kwargs) + return {**component, 'from-cache': False}, tmp_files # Do a head request to get the current ETag and # Last-Modified values, used to create a cache key @@ -233,10 +226,8 @@ def check_component_cache( "skipping cache check" ) # always a cache miss, also don't cache this - return { - **create_component(collection_id, request, *args, **kwargs), - 'from-cache': False - } + component, tmp_files = create_component(collection_id, request, *args, **kwargs) + return {**component, 'from-cache': False}, tmp_files # Create cache key cache_key = '/'.join([ @@ -254,20 +245,14 @@ def check_component_cache( f"Retrieved {component_type} component from cache for " f"{request.url}" ) - return {**component, 'from-cache': True} + return {**component, 'from-cache': True}, [] else: - component = create_component( + component, tmp_files = create_component( collection_id, request, *args, **kwargs) print(f"Created {component_type} component for {request.url}") # set cache key to the component - # but don't cache tmp filepaths - tmp_files = component.pop('tmp_filepaths') persistent_cache[cache_key] = component - return { - **component, - 'from-cache': False, - 'tmp_filepaths': tmp_files - } + return {**component, 'from-cache': False}, tmp_files return check_component_cache return inner @@ -301,7 +286,7 @@ def create_media_component( collection_id, request: ContentRequest, mapped_media_source: dict[str, str] - ) -> Optional[dict[str, Any]]: + ) -> Tuple[Optional[dict[str, Any]], list]: ''' download source file to local disk ''' @@ -310,7 +295,7 @@ def create_media_component( source_component = download_url(request) if not source_component: - return None + return None, [] media_tmp_filepath = source_component['path'] tmp_filepaths = [media_tmp_filepath] @@ -346,11 +331,10 @@ def create_media_component( 'mapped_mimetype': mapped_mimetype, 'src_size': source_component['size'], 'date_content_component_created': datetime.now().isoformat() - }, - 'tmp_filepaths': tmp_filepaths + } } - return media_component + return media_component, tmp_filepaths @content_component_cache('thumbnail') @@ -359,13 +343,13 @@ def create_thumbnail_component( request: ContentRequest, mapped_thumbnail_source: dict[str, str], record_context: str - ) -> Optional[dict[str, Any]]: + ) -> Tuple[Optional[dict[str, Any]], list]: ''' download source file to local disk ''' source_component = download_url(request) if not source_component: - return None + return None, [] thumbnail_tmp_filepath = source_component['path'] tmp_filepaths = [thumbnail_tmp_filepath] @@ -407,11 +391,10 @@ def create_thumbnail_component( 'mapped_mimetype': mapped_thumbnail_source.get('mimetype'), 'src_size': source_component['size'], 'date_content_component_created': datetime.now().isoformat() - }, - 'tmp_filepaths': tmp_filepaths + } } - return thumbnail_component + return thumbnail_component, tmp_filepaths def upload_content(filepath: str, destination: str) -> str: From 8811d9ec97ebffc0941956e3a706bb33392345f3 Mon Sep 17 00:00:00 2001 From: Barbara Hui Date: Tue, 15 Oct 2024 17:13:26 -0700 Subject: [PATCH 3/4] Ensure tmp_files exists and contains all filenames. --- content_harvester/by_record.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/content_harvester/by_record.py b/content_harvester/by_record.py index 7078c409..88ac6a8c 100644 --- a/content_harvester/by_record.py +++ b/content_harvester/by_record.py @@ -90,12 +90,14 @@ def harvest_record_content( if rikolti_mapper_type == 'nuxeo.nuxeo': auth = (settings.NUXEO_USER, settings.NUXEO_PASS) + tmp_files = [] media_source = record.get('media_source', {}) media_source_url = media_source.get('url') if media_source_url: request = ContentRequest(media_source_url, auth) - record['media'], tmp_files = create_media_component( + record['media'], media_tmp_files = create_media_component( collection_id, request, media_source) + tmp_files.extend(media_tmp_files) thumbnail_src = record.get( 'thumbnail_source', @@ -106,12 +108,13 @@ def harvest_record_content( thumbnail_src_url = thumbnail_src.get('url') if thumbnail_src_url: request = ContentRequest(thumbnail_src_url, auth) - record['thumbnail'], tmp_files = create_thumbnail_component( + record['thumbnail'], thumbnail_tmp_files = create_thumbnail_component( collection_id, request, thumbnail_src, record['calisphere-id'] ) + tmp_files.extend(thumbnail_tmp_files) [os.remove(filepath) for filepath in tmp_files] From 32bc66e56d30a02588b1d66187987b17e9fc9e00 Mon Sep 17 00:00:00 2001 From: Barbara Hui Date: Tue, 15 Oct 2024 17:27:06 -0700 Subject: [PATCH 4/4] Dedup tmp_files before deleting --- content_harvester/by_record.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content_harvester/by_record.py b/content_harvester/by_record.py index 88ac6a8c..c848968d 100644 --- a/content_harvester/by_record.py +++ b/content_harvester/by_record.py @@ -116,7 +116,7 @@ def harvest_record_content( ) tmp_files.extend(thumbnail_tmp_files) - [os.remove(filepath) for filepath in tmp_files] + [os.remove(filepath) for filepath in set(tmp_files)] return record