fix: Add pubmlst utils #192

Merged: 39 commits, Jan 8, 2025

Changes from all commits
45ff7d7
PubMLST authentication utilities
ahdamin Dec 6, 2024
a2864c6
Refactor PubMLST data fetching
ahdamin Dec 6, 2024
4ed5881
Add quotations
ahdamin Dec 7, 2024
4802515
Add rauth v0.7.3
ahdamin Dec 9, 2024
7ea4aa9
Add pubmlst config
ahdamin Dec 11, 2024
b740bde
Add pubmlst config loader
ahdamin Dec 11, 2024
279c3bc
Add session token validation
ahdamin Dec 11, 2024
705dbdc
Improve OAuth credentials usage
ahdamin Dec 11, 2024
60a3cde
Read credentials from config
ahdamin Dec 11, 2024
1c0c62c
Add OAuth header generation
ahdamin Dec 11, 2024
9265ee0
Refactor PubMLST fetch process
ahdamin Dec 11, 2024
1a00339
Update database query checks
ahdamin Dec 12, 2024
b98b2d3
Add logger to authentication
ahdamin Dec 12, 2024
882d0c8
Handle nested db entries and skip non-sequence dbs
ahdamin Dec 12, 2024
1dc4c91
Remove credentials file
ahdamin Dec 12, 2024
d7567a0
Replace print with logger
ahdamin Dec 12, 2024
bb8b172
Add reusable path resolver
ahdamin Dec 12, 2024
cd51465
Enhance logging for Auth
ahdamin Dec 12, 2024
f27b9f7
Match organisms
ahdamin Dec 12, 2024
f1de77f
Handle MLST profiles in CSV format
ahdamin Dec 13, 2024
817bd90
Add PubMLST configs and improve config path checks
ahdamin Dec 13, 2024
826bfec
Improve config path handling
ahdamin Dec 13, 2024
95c4ba2
Replace f-strings with format
ahdamin Dec 13, 2024
d01cc5a
Refactor path handling functions
ahdamin Dec 13, 2024
5e129f9
Improve exception handling
ahdamin Dec 13, 2024
f8c8c04
Ensure directories exist
ahdamin Dec 13, 2024
2b87fca
Add check_database_metadata
ahdamin Dec 13, 2024
1235c6a
Fix methods position
ahdamin Dec 13, 2024
f70e836
Fix logger
ahdamin Dec 13, 2024
8268a3f
Add client, constants, and exceptions
ahdamin Dec 16, 2024
21e3094
Fix checks for pubmlst config
ahdamin Dec 16, 2024
941bd53
Add URL rules for pubmlst db
ahdamin Dec 17, 2024
d7a7c92
Add InvalidURLError
ahdamin Dec 17, 2024
9325d01
Add URL parsing helper
ahdamin Dec 17, 2024
a5f4c71
Add MLST scheme retrieval & URL parsing helper
ahdamin Dec 17, 2024
977a240
Replace manual HTTP requests with PubMLSTClient
ahdamin Dec 17, 2024
d36ba0d
Remove client object
ahdamin Dec 17, 2024
6153cff
Fix tests
Vince-janv Dec 17, 2024
2d13f27
Merge branch 'master' into add-pubmlst-utils
karlnyr Jan 8, 2025
23 changes: 12 additions & 11 deletions configExample.json
@@ -8,24 +8,21 @@
"project": "production",
"type": "core"
},

"regex": {
"regex": {
"mail_recipient": "[email protected]",
"_comment": "File finding patterns. Only single capture group accepted (for reverse/forward identifier)",
"file_pattern": "\\w{8,12}_\\w{8,10}(?:-\\d+)*_L\\d_(?:R)*(\\d{1}).fastq.gz",
"_comment": "Organisms recognized enough to be considered stable",
"verified_organisms": []
},

"_comment": "Folders",
"folders": {
"folders": {
"_comment": "Root folder for ALL output",
"results": "/tmp/MLST/results/",
"_comment": "Report collection folder",
"reports": "/tmp/MLST/reports/",
"_comment": "Log file position and name",
"log_file": "/tmp/microsalt.log",

"_comment": "Root folder for input fasta sequencing data",
"seqdata": "/tmp/projects/",
"_comment": "ST profiles. Each ST profile file under 'profiles' have an identicial folder under references",
@@ -35,18 +32,18 @@
"_comment": "Resistances. Commonly from resFinder",
"resistances": "/tmp/MLST/references/resistances",
"_comment": "Download path for NCBI genomes, for alignment usage",
"genomes": "/tmp/MLST/references/genomes"
"genomes": "/tmp/MLST/references/genomes",
"_comment": "PubMLST credentials",
"pubmlst_credentials": "/tmp/MLST/credentials"
},

"_comment": "Database/Flask configuration",
"database": {
"SQLALCHEMY_DATABASE_URI": "sqlite:////tmp/microsalt.db",
"SQLALCHEMY_TRACK_MODIFICATIONS": "False",
"DEBUG": "True"
},

"_comment": "Thresholds for Displayed results",
"threshold": {
"threshold": {
"_comment": "Typing thresholds",
"mlst_id": 100,
"mlst_novel_id": 99.5,
@@ -72,11 +69,15 @@
"bp_50x_warn": 50,
"bp_100x_warn": 20
},

"_comment": "Genologics temporary configuration file",
"genologics": {
"baseuri": "https://lims.facility.se/",
"username": "limsuser",
"password": "mypassword"
},
"_comment": "PubMLST credentials",
"pubmlst": {
"client_id": "",
"client_secret": ""
}
}
}
7 changes: 7 additions & 0 deletions microSALT/__init__.py
@@ -51,8 +51,15 @@
app.config["folders"] = preset_config.get("folders", {})

# Ensure PubMLST configuration is included

app.config["pubmlst"] = preset_config.get("pubmlst", {
"client_id": "",
"client_secret": ""
})

app.config["pubmlst"] = preset_config.get("pubmlst", {"client_id": "", "client_secret": ""})


# Add extrapaths to config
preset_config["folders"]["expec"] = os.path.abspath(
os.path.join(pathlib.Path(__file__).parent.parent, "unique_references/ExPEC.fsa")
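Downstream modules can then read these credentials from the Flask app config. A minimal sketch of that pattern (hypothetical snippet, not part of this diff; the empty-value check is illustrative only):

from microSALT import app  # Flask app configured in microSALT/__init__.py

# Read the PubMLST OAuth consumer credentials registered in configExample.json.
pubmlst_cfg = app.config.get("pubmlst", {})
client_id = pubmlst_cfg.get("client_id", "")
client_secret = pubmlst_cfg.get("client_secret", "")
if not (client_id and client_secret):
    raise ValueError("PubMLST client_id/client_secret are missing from the configuration")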
Empty file.
106 changes: 106 additions & 0 deletions microSALT/utils/pubmlst/authentication.py
@@ -0,0 +1,106 @@
import json
import os
from datetime import datetime, timedelta
from dateutil import parser
from rauth import OAuth1Session
from microSALT import logger
from microSALT.utils.pubmlst.helpers import BASE_API, save_session_token, load_auth_credentials, get_path, folders_config, credentials_path_key, pubmlst_session_credentials_file_name
from microSALT.utils.pubmlst.exceptions import (
PUBMLSTError,
SessionTokenRequestError,
SessionTokenResponseError,
)

session_token_validity = 12 # 12-hour validity
session_expiration_buffer = 60 # 60-second buffer

def get_new_session_token(db: str):
"""Request a new session token using all credentials for a specific database."""
logger.debug("Fetching a new session token for database '{db}'...")

try:
consumer_key, consumer_secret, access_token, access_secret = load_auth_credentials()

url = f"{BASE_API}/db/{db}/oauth/get_session_token"

session = OAuth1Session(
consumer_key=consumer_key,
consumer_secret=consumer_secret,
access_token=access_token,
access_token_secret=access_secret,
)

response = session.get(url, headers={"User-Agent": "BIGSdb downloader"})
logger.debug("Response Status Code: {status_code}")

if response.ok:
try:
token_data = response.json()
session_token = token_data.get("oauth_token")
session_secret = token_data.get("oauth_token_secret")

if not session_token or not session_secret:
raise SessionTokenResponseError(
db, "Missing 'oauth_token' or 'oauth_token_secret' in response."
)

expiration_time = datetime.now() + timedelta(hours=session_token_validity)

save_session_token(db, session_token, session_secret, expiration_time)
return session_token, session_secret

except (ValueError, KeyError) as e:
raise SessionTokenResponseError(db, f"Invalid response format: {str(e)}")
else:
raise SessionTokenRequestError(
db, response.status_code, response.text
)

except PUBMLSTError as e:
logger.error(f"Error during token fetching: {e}")
raise
except Exception as e:
logger.error(f"Unexpected error: {e}")
raise PUBMLSTError(f"Unexpected error while fetching session token for database '{db}': {e}")

def load_session_credentials(db: str):
"""Load session token from file for a specific database."""
try:
credentials_file = os.path.join(
get_path(folders_config, credentials_path_key),
pubmlst_session_credentials_file_name
)

if not os.path.exists(credentials_file):
logger.debug("Session file does not exist. Fetching a new session token.")
return get_new_session_token(db)

with open(credentials_file, "r") as f:
try:
all_sessions = json.load(f)
except json.JSONDecodeError as e:
raise SessionTokenResponseError(db, f"Failed to parse session file: {str(e)}")

db_session_data = all_sessions.get("databases", {}).get(db)
if not db_session_data:
logger.debug(f"No session token found for database '{db}'. Fetching a new session token.")
return get_new_session_token(db)

expiration = parser.parse(db_session_data.get("expiration", ""))
if datetime.now() < expiration - timedelta(seconds=session_expiration_buffer):
logger.debug(f"Using existing session token for database '{db}'.")
session_token = db_session_data.get("token")
session_secret = db_session_data.get("secret")

return session_token, session_secret

logger.debug(f"Session token for database '{db}' has expired. Fetching a new session token.")
return get_new_session_token(db)

except PUBMLSTError as e:
logger.error(f"PUBMLST-specific error occurred: {e}")
raise
except Exception as e:
logger.error(f"Unexpected error: {e}")
raise PUBMLSTError(f"Unexpected error while loading session token for database '{db}': {e}")

116 changes: 116 additions & 0 deletions microSALT/utils/pubmlst/client.py
@@ -0,0 +1,116 @@
import requests
from urllib.parse import urlencode
from microSALT.utils.pubmlst.helpers import (
BASE_API,
generate_oauth_header,
load_auth_credentials,
parse_pubmlst_url
)
from microSALT.utils.pubmlst.constants import RequestType, HTTPMethod, ResponseHandler
from microSALT.utils.pubmlst.exceptions import PUBMLSTError, SessionTokenRequestError
from microSALT.utils.pubmlst.authentication import load_session_credentials
from microSALT import logger

class PubMLSTClient:
"""Client for interacting with the PubMLST authenticated API."""

def __init__(self):
"""Initialize the PubMLST client."""
try:
self.consumer_key, self.consumer_secret, self.access_token, self.access_secret = load_auth_credentials()
self.database = "pubmlst_test_seqdef"
self.session_token, self.session_secret = load_session_credentials(self.database)
except PUBMLSTError as e:
logger.error(f"Failed to initialize PubMLST client: {e}")
raise


@staticmethod
def parse_pubmlst_url(url: str):
"""
Wrapper for the parse_pubmlst_url function.
"""
return parse_pubmlst_url(url)


def _make_request(self, request_type: RequestType, method: HTTPMethod, url: str, db: str = None, response_handler: ResponseHandler = ResponseHandler.JSON):
""" Handle API requests."""
try:
if db:
session_token, session_secret = load_session_credentials(db)
else:
session_token, session_secret = self.session_token, self.session_secret

if request_type == RequestType.AUTH:
headers = {
"Authorization": generate_oauth_header(url, self.consumer_key, self.consumer_secret, self.access_token, self.access_secret)
}
elif request_type == RequestType.DB:
headers = {
"Authorization": generate_oauth_header(url, self.consumer_key, self.consumer_secret, session_token, session_secret)
}
else:
raise ValueError(f"Unsupported request type: {request_type}")

if method == HTTPMethod.GET:
response = requests.get(url, headers=headers)
elif method == HTTPMethod.POST:
response = requests.post(url, headers=headers)
elif method == HTTPMethod.PUT:
response = requests.put(url, headers=headers)
else:
raise ValueError(f"Unsupported HTTP method: {method}")

response.raise_for_status()

if response_handler == ResponseHandler.CONTENT:
return response.content
elif response_handler == ResponseHandler.TEXT:
return response.text
elif response_handler == ResponseHandler.JSON:
return response.json()
else:
raise ValueError(f"Unsupported response handler: {response_handler}")

except requests.exceptions.HTTPError as e:
raise SessionTokenRequestError(db or self.database, e.response.status_code, e.response.text) from e
except requests.exceptions.RequestException as e:
logger.error(f"Request failed: {e}")
raise PUBMLSTError(f"Request failed: {e}") from e
except Exception as e:
logger.error(f"Unexpected error during request: {e}")
raise PUBMLSTError(f"An unexpected error occurred: {e}") from e


def query_databases(self):
"""Query available PubMLST databases."""
url = f"{BASE_API}/db"
return self._make_request(RequestType.DB, HTTPMethod.GET, url, response_handler=ResponseHandler.JSON)


def download_locus(self, db: str, locus: str, **kwargs):
"""Download locus sequence files."""
base_url = f"{BASE_API}/db/{db}/loci/{locus}/alleles_fasta"
query_string = urlencode(kwargs)
url = f"{base_url}?{query_string}" if query_string else base_url
return self._make_request(RequestType.DB, HTTPMethod.GET, url, db=db, response_handler=ResponseHandler.TEXT)


def download_profiles_csv(self, db: str, scheme_id: int):
"""Download MLST profiles in CSV format."""
if not scheme_id:
raise ValueError("Scheme ID is required to download profiles CSV.")
url = f"{BASE_API}/db/{db}/schemes/{scheme_id}/profiles_csv"
return self._make_request(RequestType.DB, HTTPMethod.GET, url, db=db, response_handler=ResponseHandler.TEXT)


def retrieve_scheme_info(self, db: str, scheme_id: int):
"""Retrieve information about a specific MLST scheme."""
url = f"{BASE_API}/db/{db}/schemes/{scheme_id}"
return self._make_request(RequestType.DB, HTTPMethod.GET, url, db=db, response_handler=ResponseHandler.JSON)


def list_schemes(self, db: str):
"""List available MLST schemes for a specific database."""
url = f"{BASE_API}/db/{db}/schemes"
return self._make_request(RequestType.DB, HTTPMethod.GET, url, db=db, response_handler=ResponseHandler.JSON)
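
A short usage sketch of the client (not part of the diff; the scheme id is a placeholder and must correspond to a scheme that exists in the chosen database):

from microSALT.utils.pubmlst.client import PubMLSTClient

client = PubMLSTClient()  # authenticates against pubmlst_test_seqdef by default
databases = client.query_databases()  # JSON listing of available PubMLST databases
schemes = client.list_schemes("pubmlst_test_seqdef")  # MLST schemes for one database
profiles_csv = client.download_profiles_csv("pubmlst_test_seqdef", scheme_id=1)  # profiles as CSV text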
79 changes: 79 additions & 0 deletions microSALT/utils/pubmlst/constants.py
@@ -0,0 +1,79 @@
from enum import Enum
from werkzeug.routing import Map, Rule

class RequestType(Enum):
AUTH = "auth"
DB = "db"

class CredentialsFile(Enum):
MAIN = "main"
SESSION = "session"

class Encoding(Enum):
UTF8 = "utf-8"

class HTTPMethod(Enum):
GET = "GET"
POST = "POST"
PUT = "PUT"
DELETE = "DELETE"
PATCH = "PATCH"
HEAD = "HEAD"
OPTIONS = "OPTIONS"

class ResponseHandler(Enum):
CONTENT = "content"
TEXT = "text"
JSON = "json"

url_map = Map([
Rule('/', endpoint='root'),
Rule('/db', endpoint='db_root'),
Rule('/db/<db>', endpoint='database_root'),
Rule('/db/<db>/classification_schemes', endpoint='classification_schemes'),
Rule('/db/<db>/classification_schemes/<int:classification_scheme_id>', endpoint='classification_scheme'),
Rule('/db/<db>/classification_schemes/<int:classification_scheme_id>/groups', endpoint='classification_scheme_groups'),
Rule('/db/<db>/classification_schemes/<int:classification_scheme_id>/groups/<int:group_id>', endpoint='classification_scheme_group'),
Rule('/db/<db>/loci', endpoint='loci'),
Rule('/db/<db>/loci/<locus>', endpoint='locus'),
Rule('/db/<db>/loci/<locus>/alleles', endpoint='locus_alleles'),
Rule('/db/<db>/loci/<locus>/alleles_fasta', endpoint='locus_alleles_fasta'),
Rule('/db/<db>/loci/<locus>/alleles/<int:allele_id>', endpoint='locus_allele'),
Rule('/db/<db>/loci/<locus>/sequence', endpoint='locus_sequence_post'),
Rule('/db/<db>/sequence', endpoint='sequence_post'),
Rule('/db/<db>/sequences', endpoint='sequences'),
Rule('/db/<db>/schemes', endpoint='schemes'),
Rule('/db/<db>/schemes/<int:scheme_id>', endpoint='scheme'),
Rule('/db/<db>/schemes/<int:scheme_id>/loci', endpoint='scheme_loci'),
Rule('/db/<db>/schemes/<int:scheme_id>/fields/<field>', endpoint='scheme_field'),
Rule('/db/<db>/schemes/<int:scheme_id>/profiles', endpoint='scheme_profiles'),
Rule('/db/<db>/schemes/<int:scheme_id>/profiles_csv', endpoint='scheme_profiles_csv'),
Rule('/db/<db>/schemes/<int:scheme_id>/profiles/<int:profile_id>', endpoint='scheme_profile'),
Rule('/db/<db>/schemes/<int:scheme_id>/sequence', endpoint='scheme_sequence_post'),
Rule('/db/<db>/schemes/<int:scheme_id>/designations', endpoint='scheme_designations_post'),
Rule('/db/<db>/isolates', endpoint='isolates'),
Rule('/db/<db>/genomes', endpoint='genomes'),
Rule('/db/<db>/isolates/search', endpoint='isolates_search_post'),
Rule('/db/<db>/isolates/<int:isolate_id>', endpoint='isolate'),
Rule('/db/<db>/isolates/<int:isolate_id>/allele_designations', endpoint='isolate_allele_designations'),
Rule('/db/<db>/isolates/<int:isolate_id>/allele_designations/<locus>', endpoint='isolate_allele_designation_locus'),
Rule('/db/<db>/isolates/<int:isolate_id>/allele_ids', endpoint='isolate_allele_ids'),
Rule('/db/<db>/isolates/<int:isolate_id>/schemes/<int:scheme_id>/allele_designations', endpoint='isolate_scheme_allele_designations'),
Rule('/db/<db>/isolates/<int:isolate_id>/schemes/<int:scheme_id>/allele_ids', endpoint='isolate_scheme_allele_ids'),
Rule('/db/<db>/isolates/<int:isolate_id>/contigs', endpoint='isolate_contigs'),
Rule('/db/<db>/isolates/<int:isolate_id>/contigs_fasta', endpoint='isolate_contigs_fasta'),
Rule('/db/<db>/isolates/<int:isolate_id>/history', endpoint='isolate_history'),
Rule('/db/<db>/contigs/<int:contig_id>', endpoint='contig'),
Rule('/db/<db>/fields', endpoint='fields'),
Rule('/db/<db>/fields/<field>', endpoint='field'),
Rule('/db/<db>/users/<int:user_id>', endpoint='user'),
Rule('/db/<db>/curators', endpoint='curators'),
Rule('/db/<db>/projects', endpoint='projects'),
Rule('/db/<db>/projects/<int:project_id>', endpoint='project'),
Rule('/db/<db>/projects/<int:project_id>/isolates', endpoint='project_isolates'),
Rule('/db/<db>/submissions', endpoint='submissions'),
Rule('/db/<db>/submissions/<int:submission_id>', endpoint='submission'),
Rule('/db/<db>/submissions/<int:submission_id>/messages', endpoint='submission_messages'),
Rule('/db/<db>/submissions/<int:submission_id>/files', endpoint='submission_files'),
Rule('/db/<db>/submissions/<int:submission_id>/files/<filename>', endpoint='submission_file'),
])
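
To illustrate how this rule map can be used, here is a sketch (an assumption, not code from the PR) that resolves a PubMLST REST URL to its endpoint name and arguments with the standard werkzeug routing API; the example URL and database name are hypothetical:

from urllib.parse import urlparse

from microSALT.utils.pubmlst.constants import url_map

def match_endpoint(url: str):
    """Return (endpoint, arguments) for a PubMLST REST URL; raises werkzeug NotFound if no rule matches."""
    parsed = urlparse(url)
    adapter = url_map.bind(server_name=parsed.netloc or "rest.pubmlst.org")
    return adapter.match(parsed.path)

endpoint, args = match_endpoint("https://rest.pubmlst.org/db/pubmlst_neisseria_seqdef/schemes/1/profiles_csv")
# endpoint == "scheme_profiles_csv"; args == {"db": "pubmlst_neisseria_seqdef", "scheme_id": 1}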