diff --git a/src/pyobo/extract.py b/src/pyobo/extract.py
index 4820ab95..308fa838 100644
--- a/src/pyobo/extract.py
+++ b/src/pyobo/extract.py
@@ -20,6 +20,7 @@
 from .identifier_utils import normalize_curie, wrap_norm_prefix
 from .path_utils import prefix_cache_join
 from .registries import not_available_as_obo
+from .sources import has_nomenclature_plugin, run_nomenclature_plugin
 from .struct import Reference, TypeDef, get_reference_tuple
 from .struct.typedef import has_member, is_a, part_of
@@ -64,7 +65,14 @@
 RelationHint = Union[Reference, TypeDef, Tuple[str, str]]
 
 
-NO_ALTS = {'ncbigene'}
+NO_ALTS = {
+    'ncbigene',
+}
+
+
+def _get_version(prefix: str) -> Optional[str]:
+    if has_nomenclature_plugin(prefix):
+        return run_nomenclature_plugin(prefix).data_version
 
 
 def get_name_by_curie(curie: str) -> Optional[str]:
@@ -131,7 +139,7 @@ def get_id_name_mapping(prefix: str, force: bool = False, **kwargs) -> Mapping[s
         logger.info('[%s] done loading name mappings', prefix)
         return rv
 
-    path = prefix_cache_join(prefix, 'names.tsv')
+    path = prefix_cache_join(prefix, 'names.tsv', version=_get_version(prefix))
 
     @cached_mapping(path=path, header=[f'{prefix}_id', 'name'], force=force)
     def _get_id_name_mapping() -> Mapping[str, str]:
@@ -164,7 +172,7 @@ def get_id_species_mapping(prefix: str, force: bool = False, **kwargs) -> Mappin
         logger.info('[%s] done loading species mappings', prefix)
         return rv
 
-    path = prefix_cache_join(prefix, 'species.tsv')
+    path = prefix_cache_join(prefix, 'species.tsv', version=_get_version(prefix))
 
     @cached_mapping(path=path, header=[f'{prefix}_id', 'species'], force=force)
     def _get_id_species_mapping() -> Mapping[str, str]:
@@ -180,7 +188,7 @@ def _get_id_species_mapping() -> Mapping[str, str]:
 @wrap_norm_prefix
 def get_typedef_id_name_mapping(prefix: str, force: bool = False, **kwargs) -> Mapping[str, str]:
     """Get an identifier to name mapping for the typedefs in an OBO file."""
-    path = prefix_cache_join(prefix, 'typedefs.tsv')
+    path = prefix_cache_join(prefix, 'typedefs.tsv', version=_get_version(prefix))
 
     @cached_mapping(path=path, header=[f'{prefix}_id', 'name'], force=force)
     def _get_typedef_id_name_mapping() -> Mapping[str, str]:
@@ -195,7 +203,7 @@ def _get_typedef_id_name_mapping() -> Mapping[str, str]:
 @wrap_norm_prefix
 def get_id_synonyms_mapping(prefix: str, force: bool = False, **kwargs) -> Mapping[str, List[str]]:
     """Get the OBO file and output a synonym dictionary."""
-    path = prefix_cache_join(prefix, "synonyms.tsv")
+    path = prefix_cache_join(prefix, "synonyms.tsv", version=_get_version(prefix))
 
     @cached_multidict(path=path, header=[f'{prefix}_id', 'synonym'], force=force)
     def _get_multidict() -> Mapping[str, List[str]]:
@@ -209,7 +217,7 @@ def _get_multidict() -> Mapping[str, List[str]]:
 @wrap_norm_prefix
 def get_properties_df(prefix: str, force: bool = False, **kwargs) -> pd.DataFrame:
     """Extract properties."""
-    path = prefix_cache_join(prefix, "properties.tsv")
+    path = prefix_cache_join(prefix, "properties.tsv", version=_get_version(prefix))
 
     @cached_df(path=path, dtype=str, force=force)
     def _df_getter() -> pd.DataFrame:
@@ -231,8 +239,8 @@ def get_filtered_properties_mapping(
     **kwargs,
 ) -> Mapping[str, str]:
     """Extract a single property for each term as a dictionary."""
-    path = prefix_cache_join(prefix, 'properties', f"{prop}.tsv")
-    all_properties_path = prefix_cache_join(prefix, 'properties.tsv')
+    path = prefix_cache_join(prefix, 'properties', f"{prop}.tsv", version=_get_version(prefix))
+    all_properties_path = prefix_cache_join(prefix, 'properties.tsv', version=_get_version(prefix))
 
     @cached_mapping(path=path, header=[f'{prefix}_id', prop], force=force)
     def _mapping_getter() -> Mapping[str, str]:
@@ -260,8 +268,8 @@ def get_filtered_properties_df(
     **kwargs,
 ) -> pd.DataFrame:
     """Extract a single property for each term."""
-    path = prefix_cache_join(prefix, 'properties', f"{prop}.tsv")
-    all_properties_path = prefix_cache_join(prefix, 'properties.tsv')
+    path = prefix_cache_join(prefix, 'properties', f"{prop}.tsv", version=_get_version(prefix))
+    all_properties_path = prefix_cache_join(prefix, 'properties.tsv', version=_get_version(prefix))
 
     @cached_df(path=path, dtype=str, force=force)
     def _df_getter() -> pd.DataFrame:
@@ -288,7 +296,7 @@ def get_relations_df(
     **kwargs,
 ) -> pd.DataFrame:
     """Get all relations from the OBO."""
-    path = prefix_cache_join(prefix, 'relations.tsv')
+    path = prefix_cache_join(prefix, 'relations.tsv', version=_get_version(prefix))
 
     @cached_df(path=path, dtype=str, force=force)
     def _df_getter() -> pd.DataFrame:
@@ -317,8 +325,10 @@
 ) -> pd.DataFrame:
     """Get all of the given relation."""
     relation_prefix, relation_identifier = relation = get_reference_tuple(relation)
-    path = prefix_cache_join(prefix, 'relations', f'{relation_prefix}:{relation_identifier}.tsv')
-    all_relations_path = prefix_cache_join(prefix, 'relations.tsv')
+    path = prefix_cache_join(
+        prefix, 'relations', f'{relation_prefix}:{relation_identifier}.tsv', version=_get_version(prefix),
+    )
+    all_relations_path = prefix_cache_join(prefix, 'relations.tsv', version=_get_version(prefix))
 
     @cached_df(path=path, dtype=str, force=force)
     def _df_getter() -> pd.DataFrame:
@@ -361,8 +371,8 @@ def get_filtered_xrefs(
     **kwargs,
 ) -> Mapping[str, str]:
     """Get xrefs to a given target."""
-    path = prefix_cache_join(prefix, 'xrefs', f"{xref_prefix}.tsv")
-    all_xrefs_path = prefix_cache_join(prefix, 'xrefs.tsv')
+    path = prefix_cache_join(prefix, 'xrefs', f"{xref_prefix}.tsv", version=_get_version(prefix))
+    all_xrefs_path = prefix_cache_join(prefix, 'xrefs.tsv', version=_get_version(prefix))
     header = [f'{prefix}_id', f'{xref_prefix}_id']
 
     @cached_mapping(path=path, header=header, use_tqdm=use_tqdm, force=force)
@@ -388,7 +398,7 @@ def _get_mapping() -> Mapping[str, str]:
 @wrap_norm_prefix
 def get_xrefs_df(prefix: str, *, use_tqdm: bool = False, force: bool = False, **kwargs) -> pd.DataFrame:
     """Get all xrefs."""
-    path = prefix_cache_join(prefix, 'xrefs.tsv')
+    path = prefix_cache_join(prefix, 'xrefs.tsv', version=_get_version(prefix))
 
     @cached_df(path=path, dtype=str, force=force)
     def _df_getter() -> pd.DataFrame:
@@ -406,7 +416,7 @@ def get_id_to_alts(prefix: str, force: bool = False, **kwargs) -> Mapping[str, L
     if prefix in NO_ALTS:
         return {}
 
-    path = prefix_cache_join(prefix, 'alt_ids.tsv')
+    path = prefix_cache_join(prefix, 'alt_ids.tsv', version=_get_version(prefix))
     header = [f'{prefix}_id', 'alt_id']
 
     @cached_multidict(path=path, header=header, force=force)
diff --git a/src/pyobo/getters.py b/src/pyobo/getters.py
index ceb79278..d717c8e4 100644
--- a/src/pyobo/getters.py
+++ b/src/pyobo/getters.py
@@ -63,14 +63,13 @@ def get(prefix: str, *, url: Optional[str] = None, local: bool = False) -> Obo:
     if path.exists() and not local:
         logger.debug('[%s] using obonet cache at %s', prefix, path)
         return Obo.from_obonet_gz(path)
-    else:
-        logger.debug('[%s] no obonet cache found at %s', prefix, path)
-
-    if has_nomenclature_plugin(prefix):
+    elif has_nomenclature_plugin(prefix):
         obo = run_nomenclature_plugin(prefix)
-        logger.info('[%s] caching OBO at %s', prefix, path)
+        logger.info('[%s] caching nomenclature plugin', prefix)
         obo.write_default()
         return obo
+    else:
+        logger.debug('[%s] no obonet cache found at %s', prefix, path)
 
     obo = _get_obo_via_obonet(prefix=prefix, url=url, local=local)
     if not local:
@@ -176,9 +175,11 @@ def iter_helper_helper(f: Callable[[str], X], strict: bool = True) -> Iterable[T
     :raises URLError: If another problem was encountered during download
     :raises ValueError: If the data was not in the format that was expected (e.g., OWL)
     """
-    for prefix in sorted(bioregistry.read_bioregistry()):
+    it = tqdm(sorted(bioregistry.read_bioregistry()))
+    for prefix in it:
         if prefix in SKIP:
             continue
+        it.set_postfix({'prefix': prefix})
         try:
             mapping = f(prefix)
         except NoBuild:
diff --git a/src/pyobo/struct/struct.py b/src/pyobo/struct/struct.py
index 1d06370e..a3063f28 100644
--- a/src/pyobo/struct/struct.py
+++ b/src/pyobo/struct/struct.py
@@ -366,62 +366,119 @@ def from_obonet_gz(cls, path: Union[str, pathlib.Path]) -> 'Obo':
         """Read OBO from a pre-compiled Obonet JSON."""
         return cls.from_obonet(get_gzipped_graph(path))
 
-    def _path(self, *parts: str):
+    def _path(self, *parts: str) -> Path:
         return prefix_directory_join(self.ontology, *parts, version=self.data_version)
 
-    def _cache(self, *parts: str):
+    def _cache(self, *parts: str) -> Path:
         return self._path('cache', *parts)
 
-    def write_default(self, use_tqdm: bool = True, write_obo: bool = False, write_obonet: bool = False) -> None:
+    @property
+    def _names_path(self) -> Path:
+        return self._cache('names.tsv')
+
+    @property
+    def _species_path(self) -> Path:
+        return self._cache('species.tsv')
+
+    @property
+    def _synonyms_path(self) -> Path:
+        return self._cache('synonyms.tsv')
+
+    @property
+    def _alts_path(self) -> Path:
+        return self._cache('alt_ids.tsv')
+
+    @property
+    def _xrefs_path(self) -> Path:
+        return self._cache('xrefs.tsv')
+
+    @property
+    def _relations_path(self) -> Path:
+        return self._cache('relations.tsv')
+
+    @property
+    def _properties_path(self) -> Path:
+        return self._cache('properties.tsv')
+
+    @property
+    def _obo_path(self) -> Path:
+        return get_prefix_obo_path(self.ontology, version=self.data_version)
+
+    @property
+    def _obonet_gz_path(self) -> Path:
+        return self._path(f"{self.ontology}.obonet.json.gz")
+
+    def write_default(
+        self,
+        use_tqdm: bool = True,
+        force: bool = False,
+        write_obo: bool = False,
+        write_obonet: bool = False,
+    ) -> None:
         """Write the OBO to the default path."""
-        write_map_tsv(
-            path=self._cache('names.tsv'),
-            header=[f'{self.ontology}_id', 'name'],
-            rv=self.get_id_name_mapping(),
-        )
-        write_map_tsv(
-            path=self._cache('species.tsv'),
-            header=[f'{self.ontology}_id', 'taxonomy_id'],
-            rv=self.get_id_species_mapping(),
-        )
-        write_multimap_tsv(
-            path=self._cache('synonyms.tsv'),
-            header=[f'{self.ontology}_id', 'synonym'],
-            rv=self.get_id_synonyms_mapping(),
-        )
-        write_multimap_tsv(
-            path=self._cache('alt_ids.tsv'),
-            header=[f'{self.ontology}_id', 'alt_id'],
-            rv=self.get_id_alts_mapping(),
-        )
+        if not self._names_path.exists() or force:
+            logger.info('[%s] caching names', self.ontology)
+            write_map_tsv(
+                path=self._names_path,
+                header=[f'{self.ontology}_id', 'name'],
+                rv=self.get_id_name_mapping(),
+            )
 
-        for df_name, get_df in [
-            ('xrefs', self.get_xrefs_df),
-            ('relations', self.get_relations_df),
-            ('properties', self.get_properties_df),
+        if not self._species_path.exists() or force:
+            logger.info('[%s] caching species', self.ontology)
+            write_map_tsv(
+                path=self._species_path,
+                header=[f'{self.ontology}_id', 'taxonomy_id'],
+                rv=self.get_id_species_mapping(),
+            )
+
+        if not self._synonyms_path.exists() or force:
+            logger.info('[%s] caching synonyms', self.ontology)
+            write_multimap_tsv(
+                path=self._synonyms_path,
+                header=[f'{self.ontology}_id', 'synonym'],
+                rv=self.get_id_synonyms_mapping(),
+            )
+
+        if not self._alts_path.exists() or force:
+            logger.info('[%s] caching alts', self.ontology)
+            write_multimap_tsv(
+                path=self._alts_path,
+                header=[f'{self.ontology}_id', 'alt_id'],
+                rv=self.get_id_alts_mapping(),
+            )
+
+        for path, get_df in [
+            (self._xrefs_path, self.get_xrefs_df),
+            (self._relations_path, self.get_relations_df),
+            (self._properties_path, self.get_properties_df),
         ]:
+            if path.exists() and not force:
+                continue
+            logger.info('[%s] caching %s', self.ontology, path)
             df: pd.DataFrame = get_df(use_tqdm=use_tqdm)
-            if len(df.index):
-                df.sort_values(list(df.columns), inplace=True)
-                df.to_csv(self._cache(f'{df_name}.tsv'), sep='\t', index=False)
+            df.sort_values(list(df.columns), inplace=True)
+            df.to_csv(path, sep='\t', index=False)
 
         for relation in (is_a, has_part, part_of, from_species, orthologous):
             if relation is not is_a and relation not in self.typedefs:
                 continue
+            relations_path = self._cache('relations', f'{relation.curie}.tsv')
+            if relations_path.exists() and not force:
+                continue
+            logger.info('[%s] caching relation %s ! %s', self.ontology, relation.curie, relation.name)
             relation_df = self.get_filtered_relations_df(relation)
             if not len(relation_df.index):
                 continue
             relation_df.sort_values(list(relation_df.columns), inplace=True)
-            relation_df.to_csv(self._cache('relations', f'{relation.curie}.tsv'), sep='\t', index=False)
+            relation_df.to_csv(relations_path, sep='\t', index=False)
 
-        if write_obo:
-            obo_path = get_prefix_obo_path(self.ontology, version=self.data_version)
-            self.write_obo(obo_path, use_tqdm=use_tqdm)
+        if write_obo and (not self._obo_path.exists() or force):
+            self.write_obo(self._obo_path, use_tqdm=use_tqdm)
 
-        if write_obonet:
-            obonet_gz_path = self._path(f"{self.ontology}.obonet.json.gz")
-            logger.info('writing obonet to %s', obonet_gz_path)
-            self.write_obonet_gz(obonet_gz_path)
+        if write_obonet and (not self._obonet_gz_path.exists() or force):
+            logger.info('writing obonet to %s', self._obonet_gz_path)
+            self.write_obonet_gz(self._obonet_gz_path)
 
     def __iter__(self):  # noqa: D105
         if self.iter_only:
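
For context, a minimal, self-contained sketch of the two-part caching pattern this patch introduces: cache paths scoped by the data version reported for a prefix, and writers that skip work unless the file is missing or force is set. It assumes nothing beyond the standard library; CACHE_ROOT, cache_join, and write_cache are illustrative stand-ins, not the pyobo API.

# cache_join mirrors the effect of prefix_cache_join(..., version=...):
# when a version is known, it becomes a directory component, so files
# cached from one data release never collide with another.
from pathlib import Path
from typing import Mapping, Optional

CACHE_ROOT = Path.home() / '.cache' / 'example'  # hypothetical cache root


def cache_join(prefix: str, name: str, version: Optional[str] = None) -> Path:
    """Build a cache path, adding a version directory when one is known."""
    directory = CACHE_ROOT / prefix
    if version is not None:
        # e.g. ~/.cache/example/hgnc/2020-01-01/names.tsv
        directory = directory / version
    directory.mkdir(parents=True, exist_ok=True)
    return directory / name


def write_cache(path: Path, rv: Mapping[str, str], force: bool = False) -> None:
    """Write a two-column TSV, skipping the work if it is already cached."""
    if path.exists() and not force:
        return  # same idempotency guard as the reworked write_default()
    with path.open('w') as file:
        for key, value in sorted(rv.items()):
            print(key, value, sep='\t', file=file)


if __name__ == '__main__':
    path = cache_join('hgnc', 'names.tsv', version='2020-01-01')
    write_cache(path, {'5': 'A1BG'})
    write_cache(path, {'5': 'A1BG'})  # no-op: the file already exists

Combining the two guards is what makes repeated write_default() calls cheap: a fresh data version lands in a new directory and gets written once, while an already-cached version is a series of exists() checks unless force=True.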