Skip to content

Commit

Permalink
Merge branch 'update-nmdc_metadata_gen' into 'master'
Browse files Browse the repository at this point in the history
Update nmdc metadata gen

See merge request mass-spectrometry/enviroms!1
  • Loading branch information
corilo committed Feb 14, 2024
2 parents 548fa37 + aa197c1 commit 00a0892
Showing 1 changed file with 24 additions and 16 deletions.
40 changes: 24 additions & 16 deletions enviroMS/nmdc_metadata_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,18 @@

# Lookup tables mapping ENVO term identifiers to their human-readable labels.
#
# NOTE(review): the keys mix two identifier spellings ('ENVO_...' and
# 'ENVO:...'). Confirm which form the lookups against these tables use
# before normalising them.

# Environmental medium terms.
env_mediums = {
    'ENVO_00002042': 'surface water',
    'ENVO_00002007': 'sediment',
    'ENVO:00001998': 'soil',
}

# Local environmental scale terms.
env_local_scales = {
    'ENVO_00000022': 'river',
    'ENVO:01000861': 'area of dwarf scrub',
    'ENVO:00000516': 'hummock',
    'ENVO:01000869': 'area of scrub',
    'ENVO:01000887': 'area of sedge- and forb-dominated herbaceous vegetation',
    'ENVO:01001370': 'tundra ecosystem',
}

# Broad environmental scale (biome) terms.
env_broad_scales = {
    'ENVO_01000253': 'freshwater river biome',
    'ENVO:00000446': 'terrestrial biome',
}


@dataclass
Expand All @@ -39,16 +48,16 @@ class DataObject:
nom_raw_data_object_type:str = "Direct Infusion FT ICR-MS Raw Data"
nom_raw_data_object_description:str = "Raw 21T Direct Infusion Data"
nom_dp_data_object_type:str = "FT ICR-MS Analysis Results"
nom_dp_data_object_description:str = "EnviroMS FT ICR-MS natural organic matter workflow molecular formula assignment output details",
nom_dp_data_object_description:str = "EnviroMS FT ICR-MS natural organic matter workflow molecular formula assignment output details"

@dataclass
class Biosample:
    """Empty placeholder dataclass; it declares no fields yet."""

@dataclass
class NMDC_Types:

BioSample:str = "nmdc:BioSample"
Biosample:str = "nmdc:Biosample"
OmicsProcessing:str = "nmdc:OmicsProcessing"
NomAnalysisActivity:str = "nmdc:NomAnalysisActivity"
DataObject:str = "nmdc:DataObject"
Expand Down Expand Up @@ -93,7 +102,7 @@ def mint_nmdc_id(type:NMDC_Types, how_many:int = 1) -> List[str]:

def get_biosample_object(emsl_metadata:EMSL_Metadata) -> nmdc.Biosample:

nmdc_id = mint_nmdc_id({'id': NMDC_Types.BioSample})[0]
nmdc_id = mint_nmdc_id({'id': NMDC_Types.Biosample})[0]

env_medium = {
'has_raw_value': emsl_metadata.env_medium,
Expand Down Expand Up @@ -121,14 +130,14 @@ def get_biosample_object(emsl_metadata:EMSL_Metadata) -> nmdc.Biosample:
"longitude": emsl_metadata.longitude,
}

collection_date = { 'has_raw_value': emsl_metadata.collection_date}
collection_date = {'has_raw_value': emsl_metadata.collection_date}

geo_loc_name = {'has_raw_value': emsl_metadata.geo_loc_name}

data_dict = {'id': nmdc_id,
'env_medium' : env_medium,
'env_local_scale' : env_local_scale,
'env_broad_scales_data' : env_broad_scale,
'env_broad_scale' : env_broad_scale,
'lat_lon': lat_lon,
'location': emsl_metadata.location,
'ecosystem_type': emsl_metadata.ecosystem_type,
Expand Down Expand Up @@ -228,21 +237,21 @@ def create_nmdc_metadata(raw_data_path:Path, data_product_path:Path, base_url:st

if not biosample_id:

biosample_id = mint_nmdc_id({'id': NMDC_Types.BioSample})[0]
biosample_id = mint_nmdc_id({'id': NMDC_Types.Biosample})[0]
bioSample = get_biosample_object(emsl_metadata)
biosample_id = bioSample.id

else:

''' needs to finish the logic for creating biosamples, this will fail because it is missing some required fields'''
bioSample = nmdc.BioSample(id=biosample_id)
bioSample = nmdc.Biosample(id=biosample_id)

omicsProcessing = get_omics_processing(raw_data_path,
OmicsProcessing.nom_12T_instrument_name,
biosample_id, None,
biosample_id, 'nmdc:placeholder',
OmicsProcessing.nom_omics_processing_type,
OmicsProcessing.nom_omics_processing_description,
emsl_metadata.nmdc_study_id
emsl_metadata.nmdc_study
)

rawDataObject = get_data_object(raw_data_path, base_url + 'nom/grow/raw/',
Expand All @@ -252,7 +261,7 @@ def create_nmdc_metadata(raw_data_path:Path, data_product_path:Path, base_url:st

nomAnalysisActivity = get_nom_analysis_activity(NomAnalysisActivity.cluster_name,
NomAnalysisActivity.codebase_url,
rawDataObject.id, None, False,
rawDataObject.id, 'nmdc:placeholder', False,
omicsProcessing.id,
NomAnalysisActivity.nom_12T_instrument_name)

Expand All @@ -262,9 +271,8 @@ def create_nmdc_metadata(raw_data_path:Path, data_product_path:Path, base_url:st
description =DataObject.nom_dp_data_object_description)

#circular dependencies : great!
nomAnalysisActivity.has_input = rawDataObject.id
nomAnalysisActivity.has_output = dataProductDataObject.id
omicsProcessing.has_output = rawDataObject.id
nomAnalysisActivity.has_output = [dataProductDataObject.id]
omicsProcessing.has_output = [rawDataObject.id]

nom_metadata_db.biosample_set.append(bioSample)
nom_metadata_db.data_object_set.append(rawDataObject)
Expand Down

0 comments on commit 00a0892

Please sign in to comment.