Skip to content

Commit

Permalink
Merge pull request #5 from PyEED/mmseqs
Browse files Browse the repository at this point in the history
Mmseqs
  • Loading branch information
NiklasAbraham authored Jan 16, 2025
2 parents f533666 + 2241ccb commit 2b0d3af
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 117 deletions.
7 changes: 5 additions & 2 deletions mmseqs2/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,13 @@ RUN apt-get update && apt-get install -y python3 python3-pip
RUN pip3 install fastapi uvicorn

# Copy the FastAPI app to the container
COPY app.py app.py
COPY app.py /usr/local/bin/app.py

# Set the working directory
WORKDIR /usr/local/bin

# Expose the port on which FastAPI will run
EXPOSE 8000
EXPOSE 8001

# Start the FastAPI server when the container starts
CMD ["python3", "app.py"]
217 changes: 106 additions & 111 deletions mmseqs2/app.py
Original file line number Diff line number Diff line change
@@ -1,133 +1,128 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from fastapi import FastAPI, HTTPException, Request
from starlette.responses import FileResponse
import logging

import subprocess
import os
from uuid import uuid4
import shutil

app = FastAPI()

# Define a model for the input parameters
class MMSeqsParams(BaseModel):
query: str # The query sequence
database: str
output: str # The output directory
sensitivity: float = 7.5 # Sensitivity parameter for mmseqs2
threads: int = 4 # Number of threads to use
blast_format: bool = True # Option to convert to BLAST+ format

# Dictionary to keep track of running jobs and results
job_results = {}

def create_fastas_file_from_seq(seq, filename):
with open(filename, 'w') as file:
file.write(f">seq\n{seq}\n")

def create_queryDB_from_seq(filename):
# this will create a db from a single sequence file
# the command is mmseqs createdb <input> <output>
# the output should be a file with the same name as the input but with the extension .db

command = [
"mmseqs", "createdb",
filename,
filename.replace('fasta', '') + ".db"
]

try:
subprocess.run(command, check=True)

except subprocess.CalledProcessError as e:
raise HTTPException(status_code=600, detail=str(e))
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info("FastAPI server is running...")


def create_fastas_file_from_seq(query_string, filename):
"""
Creates a FASTA file from a single string containing FASTA-formatted sequences.
Args:
query_string (str): String containing FASTA-formatted sequences.
filename (str): Path to the output FASTA file.
Raises:
ValueError: If any sequence contains invalid characters.
"""
def validate_sequence(sequence: str) -> bool:
"""Validate that a sequence contains only valid amino acid characters."""
valid_chars = set("ACDEFGHIKLMNPQRSTVWY*X") # Allow amino acids + stop codon (*), unknown (X)
sequence = sequence.upper().strip().replace("\n", "") # Remove whitespace and newlines
return all(char in valid_chars for char in sequence)

# Split query string into lines
lines = query_string.strip().split("\n")

# Parse headers and sequences
multifasta = []
current_header = None
current_sequence = []

for line in lines:
if line.startswith(">"): # Header line
if current_header: # Save the previous sequence
sequence = "".join(current_sequence)
if not validate_sequence(sequence):
raise ValueError(f"Invalid characters in sequence under {current_header}")
multifasta.append(f"{current_header}\n{sequence}")
current_header = line.strip() # Update header
current_sequence = [] # Reset sequence buffer
else: # Sequence line
current_sequence.append(line.strip())

# Add the last sequence
if current_header and current_sequence:
sequence = "".join(current_sequence)
if not validate_sequence(sequence):
raise ValueError(f"Invalid characters in sequence under {current_header}")
multifasta.append(f"{current_header}\n{sequence}")

# Write to file
with open(filename, 'w', encoding='utf-8') as f:
f.write("\n".join(multifasta) + "\n") # Ensure newline at end of file

print(f"FASTA file created: {filename}")


@app.get("/")
async def read_root():
return {"message": "Welcome to the MMSeqs2 API!"}

@app.post("/run_mmseqs")
async def run_mmseqs(params: MMSeqsParams):
# Create a unique job id
job_id = str(uuid4())
output_dir = f"/tmp/{job_id}"
@app.get("/help")
def help():
try:
results = subprocess.run(
["mmseqs", "-h"],
capture_output=True,
text=True,
)
return {"help": results.stdout}
except subprocess.CalledProcessError as e:
raise HTTPException(status_code=400, detail=f"Command failed {e.stderr}")

# Prepare the output directory
os.makedirs(output_dir, exist_ok=True)
@app.post("/easycluster")
async def easycluster(request: Request):
data = await request.json()
logger.info(f"Received request data: {data}")

BASE_DIR = "/app"
query_filename = os.path.join(BASE_DIR, "in.fasta")
result_filename = os.path.join(BASE_DIR, "output")
tmp_dir = os.path.join(BASE_DIR, "tmp")

# Prepare paths
result_m8_path = os.path.join(output_dir, "result.m8")
result_tsv_path = os.path.join(output_dir, "result.tsv")
os.makedirs(tmp_dir, exist_ok=True)
open(result_filename, 'w').close() # Clear or create result file

# Create the FASTA file
path_query = os.path.join(output_dir, "query.fasta")
path_queryDB = path_query.replace('fasta', '') + ".db"
create_fastas_file_from_seq(params.query, path_query)
create_queryDB_from_seq(path_query)
# Create the FASTA file from the query string
create_fastas_file_from_seq(data['query'], query_filename)

# Run the mmseqs2 search command
# Run the mmseqs2 command
command = [
"mmseqs", "search",
path_queryDB,
params.database,
os.path.join(output_dir, "result"),
output_dir,
"--threads", str(params.threads),
"--sensitivity", str(params.sensitivity)
]

"mmseqs",
"easy-cluster",
query_filename,
result_filename,
'--min-seq-id', str(data['min_seq_id']),
'-c', str(data['coverage']),
'--cov-mode', str(data['cov_mode']),
tmp_dir]
logger.info(f"Running command: {' '.join(command)}")

try:
# Execute mmseqs search
subprocess.run(command, check=True)

# Convert the results to BLAST+ format if requested
if params.blast_format:
# mmseqs convertalis queryDB targetDB resultDB resultDB.m8
# Convert to BLAST tabular format (BLAST m8 format)
convert_command = [
"mmseqs", "convertalis",
params.query,
params.database,
os.path.join(output_dir, "result"),
result_m8_path,
]
subprocess.run(convert_command, check=True)

# Store the result path for m8 format
job_results[job_id] = {
"status": "completed",
"result_path": result_m8_path
}
else:
# Store the result path for standard mmseqs2 output (TSV format)
job_results[job_id] = {
"status": "completed",
"result_path": result_tsv_path
}

return {"job_id": job_id}
except subprocess.CalledProcessError as e:
raise HTTPException(status_code=500, detail=f"mmseqs2 failed: {str(e)}")

@app.get("/results/{job_id}")
async def get_results(job_id: str):
# Check if the job exists
if job_id not in job_results:
raise HTTPException(status_code=404, detail="Job not found")
result = subprocess.run(command, capture_output=True, text=True, check=True)
logger.info(f"Command output: {result.stdout}")

# Get the result path
result = job_results[job_id]

# Read and return the result (assuming it's a text file you want to read and return)
result_file = result["result_path"]
if os.path.exists(result_file):
with open(result_file, "r") as file:
data = file.read()
return {"status": result["status"], "results": data}
else:
raise HTTPException(status_code=404, detail="Result file not found")
except subprocess.CalledProcessError as e:
logger.error(f"Command failed with return code {e.returncode}")
logger.error(f"STDOUT: {e.stdout}")
logger.error(f"STDERR: {e.stderr}")
raise HTTPException(status_code=500, detail=f"Command failed: {e.stderr}")

with open("/app/output_all_seqs.fasta", 'r') as file:
logger.info(f"Reading result file: /app/output_all_seqs.fasta")
result = file.read()

return result

if __name__ == '__main__':
import uvicorn

uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)
uvicorn.run("app:app", host="0.0.0.0", port=8001, reload=True)
8 changes: 4 additions & 4 deletions mmseqs2/reload_development.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
sudo docker stop mmseq_docker
sudo docker remove mmseq_docker
sudo docker build --no-cache -t mmseq_docker .
sudo docker run --name mmseq_docker --volume /mnt/databases:/app -p 8000:8000 mmseq_docker
sudo docker stop mmseqs
sudo docker remove mmseqs
sudo docker build --no-cache -t mmseqs_docker .
sudo docker run -d --name mmseqs -p 8001:8001 mmseqs_docker

0 comments on commit 2b0d3af

Please sign in to comment.