Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Delete tmpfiles #1135

Merged
merged 4 commits into from
Oct 16, 2024
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 28 additions & 26 deletions content_harvester/by_record.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import hashlib
import os
from datetime import datetime
from typing import Optional, Any
from typing import Optional, Any, Tuple
from functools import lru_cache
from dataclasses import dataclass, asdict
from urllib.parse import quote_plus
Expand Down Expand Up @@ -90,12 +90,14 @@ def harvest_record_content(
if rikolti_mapper_type == 'nuxeo.nuxeo':
auth = (settings.NUXEO_USER, settings.NUXEO_PASS)

tmp_files = []
media_source = record.get('media_source', {})
media_source_url = media_source.get('url')
if media_source_url:
request = ContentRequest(media_source_url, auth)
record['media'] = create_media_component(
record['media'], media_tmp_files = create_media_component(
collection_id, request, media_source)
tmp_files.extend(media_tmp_files)

thumbnail_src = record.get(
'thumbnail_source',
Expand All @@ -106,12 +108,15 @@ def harvest_record_content(
thumbnail_src_url = thumbnail_src.get('url')
if thumbnail_src_url:
request = ContentRequest(thumbnail_src_url, auth)
record['thumbnail'] = create_thumbnail_component(
record['thumbnail'], thumbnail_tmp_files = create_thumbnail_component(
collection_id,
request,
thumbnail_src,
record['calisphere-id']
)
tmp_files.extend(thumbnail_tmp_files)

[os.remove(filepath) for filepath in tmp_files]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So one problem with this route is that tmp_files could be undefined in the case where there's no media and no thumbnail.

Another problem is that, if there is a media file and a thumbnail, any tmp_files returned by create_media_component will be overwritten by the tmp_files returned by create_thumbnail_component, and won't be deleted here at line 116.

Copy link
Collaborator Author

@barbarahui barbarahui Oct 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

D'oh, of course. How does the new commit look?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks good, but now, in the case where the media_source and the thumbnail_source are the same, you might wind up with duplicates in tmp_files and os.remove will throw a FileNotFoundError when it tries to delete the same file a second time.

[os.remove(filepath) for filepath in set(tmp_files)] would de-dupe the list, or you could explicitly check if os.path.exists(filepath) before removing it.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Egads, right you are! Pushed another commit.


return record

Expand Down Expand Up @@ -208,10 +213,9 @@ def check_component_cache(
):

if not persistent_cache:
return {
**create_component(collection_id, request, *args, **kwargs),
'from-cache': False
}
# always a cache miss, don't cache this
component, tmp_files = create_component(collection_id, request, *args, **kwargs)
return {**component, 'from-cache': False}, tmp_files

# Do a head request to get the current ETag and
# Last-Modified values, used to create a cache key
Expand All @@ -224,10 +228,9 @@ def check_component_cache(
f"{component_type}: No ETag or Last-Modified headers, "
"skipping cache check"
)
return {
**create_component(collection_id, request, *args, **kwargs),
'from-cache': False
}
# always a cache miss, also don't cache this
component, tmp_files = create_component(collection_id, request, *args, **kwargs)
return {**component, 'from-cache': False}, tmp_files

# Create cache key
cache_key = '/'.join([
Expand All @@ -245,14 +248,14 @@ def check_component_cache(
f"Retrieved {component_type} component from cache for "
f"{request.url}"
)
return {**component, 'from-cache': True}
return {**component, 'from-cache': True}, []
else:
component = create_component(
component, tmp_files = create_component(
collection_id, request, *args, **kwargs)
print(f"Created {component_type} component for {request.url}")
# set cache key to the component
persistent_cache[cache_key] = component
return {**component, 'from-cache': False}
return {**component, 'from-cache': False}, tmp_files

return check_component_cache
return inner
Expand Down Expand Up @@ -286,7 +289,7 @@ def create_media_component(
collection_id,
request: ContentRequest,
mapped_media_source: dict[str, str]
) -> Optional[dict[str, Any]]:
) -> Tuple[Optional[dict[str, Any]], list]:
'''
download source file to local disk
'''
Expand All @@ -295,9 +298,10 @@ def create_media_component(

source_component = download_url(request)
if not source_component:
return None
return None, []

media_tmp_filepath = source_component['path']
tmp_filepaths = [media_tmp_filepath]
mapped_mimetype = mapped_media_source.get('mimetype')
mapped_nuxeotype = mapped_media_source.get('nuxeo_type')

Expand All @@ -311,7 +315,7 @@ def create_media_component(
derivative_filepath,
f"jp2/{collection_id}/{jp2_destination_filename}"
)
os.remove(derivative_filepath)
tmp_filepaths.append(derivative_filepath)
mimetype = 'image/jp2'
else:
content_s3_filepath = upload_content(
Expand All @@ -333,7 +337,7 @@ def create_media_component(
}
}

return media_component
return media_component, tmp_filepaths


@content_component_cache('thumbnail')
Expand All @@ -342,15 +346,16 @@ def create_thumbnail_component(
request: ContentRequest,
mapped_thumbnail_source: dict[str, str],
record_context: str
) -> Optional[dict[str, Any]]:
) -> Tuple[Optional[dict[str, Any]], list]:
'''
download source file to local disk
'''
source_component = download_url(request)
if not source_component:
return None
return None, []

thumbnail_tmp_filepath = source_component['path']
tmp_filepaths = [thumbnail_tmp_filepath]
thumbnail_md5 = source_component['md5']

content_s3_filepath = None
Expand All @@ -369,15 +374,15 @@ def create_thumbnail_component(
derivative_filepath, f"thumbnails/{collection_id}/{thumbnail_md5}"
)
dimensions = get_dimensions(derivative_filepath, record_context)
os.remove(derivative_filepath)
tmp_filepaths.append(derivative_filepath)
elif mapped_mimetype in ['video/mp4','video/quicktime']:
derivative_filepath = derivatives.video_to_thumb(thumbnail_tmp_filepath)
if derivative_filepath:
content_s3_filepath = upload_content(
derivative_filepath, f"thumbnails/{collection_id}/{thumbnail_md5}"
)
dimensions = get_dimensions(derivative_filepath, record_context)
os.remove(derivative_filepath)
tmp_filepaths.append(derivative_filepath)

thumbnail_component = {
'mimetype': 'image/jpeg',
Expand All @@ -392,10 +397,7 @@ def create_thumbnail_component(
}
}

if os.path.exists(thumbnail_tmp_filepath):
os.remove(thumbnail_tmp_filepath)

return thumbnail_component
return thumbnail_component, tmp_filepaths


def upload_content(filepath: str, destination: str) -> str:
Expand Down
Loading