forked from EnzymeML/EnzymeML_JupyterLab
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from PyEED/mmseqs
Mmseqs
- Loading branch information
Showing
3 changed files
with
115 additions
and
117 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,133 +1,128 @@ | ||
from fastapi import FastAPI, HTTPException | ||
from pydantic import BaseModel | ||
from fastapi import FastAPI, HTTPException, Request | ||
from starlette.responses import FileResponse | ||
import logging | ||
|
||
import subprocess | ||
import os | ||
from uuid import uuid4 | ||
import shutil | ||
|
||
app = FastAPI() | ||
|
||
# Define a model for the input parameters | ||
class MMSeqsParams(BaseModel): | ||
query: str # The query sequence | ||
database: str | ||
output: str # The output directory | ||
sensitivity: float = 7.5 # Sensitivity parameter for mmseqs2 | ||
threads: int = 4 # Number of threads to use | ||
blast_format: bool = True # Option to convert to BLAST+ format | ||
|
||
# Dictionary to keep track of running jobs and results | ||
job_results = {} | ||
|
||
def create_fastas_file_from_seq(seq, filename): | ||
with open(filename, 'w') as file: | ||
file.write(f">seq\n{seq}\n") | ||
|
||
def create_queryDB_from_seq(filename): | ||
# this will create a db from a single sequence file | ||
# the command is mmseqs createdb <input> <output> | ||
# the output should be a file with the same name as the input but with the extension .db | ||
|
||
command = [ | ||
"mmseqs", "createdb", | ||
filename, | ||
filename.replace('fasta', '') + ".db" | ||
] | ||
|
||
try: | ||
subprocess.run(command, check=True) | ||
|
||
except subprocess.CalledProcessError as e: | ||
raise HTTPException(status_code=600, detail=str(e)) | ||
logging.basicConfig(level=logging.INFO) | ||
logger = logging.getLogger(__name__) | ||
logger.info("FastAPI server is running...") | ||
|
||
|
||
def create_fastas_file_from_seq(query_string, filename): | ||
""" | ||
Creates a FASTA file from a single string containing FASTA-formatted sequences. | ||
Args: | ||
query_string (str): String containing FASTA-formatted sequences. | ||
filename (str): Path to the output FASTA file. | ||
Raises: | ||
ValueError: If any sequence contains invalid characters. | ||
""" | ||
def validate_sequence(sequence: str) -> bool: | ||
"""Validate that a sequence contains only valid amino acid characters.""" | ||
valid_chars = set("ACDEFGHIKLMNPQRSTVWY*X") # Allow amino acids + stop codon (*), unknown (X) | ||
sequence = sequence.upper().strip().replace("\n", "") # Remove whitespace and newlines | ||
return all(char in valid_chars for char in sequence) | ||
|
||
# Split query string into lines | ||
lines = query_string.strip().split("\n") | ||
|
||
# Parse headers and sequences | ||
multifasta = [] | ||
current_header = None | ||
current_sequence = [] | ||
|
||
for line in lines: | ||
if line.startswith(">"): # Header line | ||
if current_header: # Save the previous sequence | ||
sequence = "".join(current_sequence) | ||
if not validate_sequence(sequence): | ||
raise ValueError(f"Invalid characters in sequence under {current_header}") | ||
multifasta.append(f"{current_header}\n{sequence}") | ||
current_header = line.strip() # Update header | ||
current_sequence = [] # Reset sequence buffer | ||
else: # Sequence line | ||
current_sequence.append(line.strip()) | ||
|
||
# Add the last sequence | ||
if current_header and current_sequence: | ||
sequence = "".join(current_sequence) | ||
if not validate_sequence(sequence): | ||
raise ValueError(f"Invalid characters in sequence under {current_header}") | ||
multifasta.append(f"{current_header}\n{sequence}") | ||
|
||
# Write to file | ||
with open(filename, 'w', encoding='utf-8') as f: | ||
f.write("\n".join(multifasta) + "\n") # Ensure newline at end of file | ||
|
||
print(f"FASTA file created: {filename}") | ||
|
||
|
||
@app.get("/") | ||
async def read_root(): | ||
return {"message": "Welcome to the MMSeqs2 API!"} | ||
|
||
@app.post("/run_mmseqs") | ||
async def run_mmseqs(params: MMSeqsParams): | ||
# Create a unique job id | ||
job_id = str(uuid4()) | ||
output_dir = f"/tmp/{job_id}" | ||
@app.get("/help") | ||
def help(): | ||
try: | ||
results = subprocess.run( | ||
["mmseqs", "-h"], | ||
capture_output=True, | ||
text=True, | ||
) | ||
return {"help": results.stdout} | ||
except subprocess.CalledProcessError as e: | ||
raise HTTPException(status_code=400, detail=f"Command failed {e.stderr}") | ||
|
||
# Prepare the output directory | ||
os.makedirs(output_dir, exist_ok=True) | ||
@app.post("/easycluster") | ||
async def easycluster(request: Request): | ||
data = await request.json() | ||
logger.info(f"Received request data: {data}") | ||
|
||
BASE_DIR = "/app" | ||
query_filename = os.path.join(BASE_DIR, "in.fasta") | ||
result_filename = os.path.join(BASE_DIR, "output") | ||
tmp_dir = os.path.join(BASE_DIR, "tmp") | ||
|
||
# Prepare paths | ||
result_m8_path = os.path.join(output_dir, "result.m8") | ||
result_tsv_path = os.path.join(output_dir, "result.tsv") | ||
os.makedirs(tmp_dir, exist_ok=True) | ||
open(result_filename, 'w').close() # Clear or create result file | ||
|
||
# Create the FASTA file | ||
path_query = os.path.join(output_dir, "query.fasta") | ||
path_queryDB = path_query.replace('fasta', '') + ".db" | ||
create_fastas_file_from_seq(params.query, path_query) | ||
create_queryDB_from_seq(path_query) | ||
# Create the FASTA file from the query string | ||
create_fastas_file_from_seq(data['query'], query_filename) | ||
|
||
# Run the mmseqs2 search command | ||
# Run the mmseqs2 command | ||
command = [ | ||
"mmseqs", "search", | ||
path_queryDB, | ||
params.database, | ||
os.path.join(output_dir, "result"), | ||
output_dir, | ||
"--threads", str(params.threads), | ||
"--sensitivity", str(params.sensitivity) | ||
] | ||
|
||
"mmseqs", | ||
"easy-cluster", | ||
query_filename, | ||
result_filename, | ||
'--min-seq-id', str(data['min_seq_id']), | ||
'-c', str(data['coverage']), | ||
'--cov-mode', str(data['cov_mode']), | ||
tmp_dir] | ||
logger.info(f"Running command: {' '.join(command)}") | ||
|
||
try: | ||
# Execute mmseqs search | ||
subprocess.run(command, check=True) | ||
|
||
# Convert the results to BLAST+ format if requested | ||
if params.blast_format: | ||
# mmseqs convertalis queryDB targetDB resultDB resultDB.m8 | ||
# Convert to BLAST tabular format (BLAST m8 format) | ||
convert_command = [ | ||
"mmseqs", "convertalis", | ||
params.query, | ||
params.database, | ||
os.path.join(output_dir, "result"), | ||
result_m8_path, | ||
] | ||
subprocess.run(convert_command, check=True) | ||
|
||
# Store the result path for m8 format | ||
job_results[job_id] = { | ||
"status": "completed", | ||
"result_path": result_m8_path | ||
} | ||
else: | ||
# Store the result path for standard mmseqs2 output (TSV format) | ||
job_results[job_id] = { | ||
"status": "completed", | ||
"result_path": result_tsv_path | ||
} | ||
|
||
return {"job_id": job_id} | ||
except subprocess.CalledProcessError as e: | ||
raise HTTPException(status_code=500, detail=f"mmseqs2 failed: {str(e)}") | ||
|
||
@app.get("/results/{job_id}") | ||
async def get_results(job_id: str): | ||
# Check if the job exists | ||
if job_id not in job_results: | ||
raise HTTPException(status_code=404, detail="Job not found") | ||
result = subprocess.run(command, capture_output=True, text=True, check=True) | ||
logger.info(f"Command output: {result.stdout}") | ||
|
||
# Get the result path | ||
result = job_results[job_id] | ||
|
||
# Read and return the result (assuming it's a text file you want to read and return) | ||
result_file = result["result_path"] | ||
if os.path.exists(result_file): | ||
with open(result_file, "r") as file: | ||
data = file.read() | ||
return {"status": result["status"], "results": data} | ||
else: | ||
raise HTTPException(status_code=404, detail="Result file not found") | ||
except subprocess.CalledProcessError as e: | ||
logger.error(f"Command failed with return code {e.returncode}") | ||
logger.error(f"STDOUT: {e.stdout}") | ||
logger.error(f"STDERR: {e.stderr}") | ||
raise HTTPException(status_code=500, detail=f"Command failed: {e.stderr}") | ||
|
||
with open("/app/output_all_seqs.fasta", 'r') as file: | ||
logger.info(f"Reading result file: /app/output_all_seqs.fasta") | ||
result = file.read() | ||
|
||
return result | ||
|
||
if __name__ == '__main__': | ||
import uvicorn | ||
|
||
uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True) | ||
uvicorn.run("app:app", host="0.0.0.0", port=8001, reload=True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
sudo docker stop mmseq_docker | ||
sudo docker remove mmseq_docker | ||
sudo docker build --no-cache -t mmseq_docker . | ||
sudo docker run --name mmseq_docker --volume /mnt/databases:/app -p 8000:8000 mmseq_docker | ||
sudo docker stop mmseqs | ||
sudo docker remove mmseqs | ||
sudo docker build --no-cache -t mmseqs_docker . | ||
sudo docker run -d --name mmseqs -p 8001:8001 mmseqs_docker |