From bb27c146edc8b7584d627a27396662c4b7ef98ed Mon Sep 17 00:00:00 2001 From: alacheim Date: Fri, 3 Jan 2025 12:00:54 +0000 Subject: [PATCH 1/4] changes commands in Dockerfile and app.py --- mmseqs2/Dockerfile | 2 +- mmseqs2/app.py | 93 +++++++++++++++++++++++++++++----------------- 2 files changed, 59 insertions(+), 36 deletions(-) diff --git a/mmseqs2/Dockerfile b/mmseqs2/Dockerfile index f747526..c628220 100644 --- a/mmseqs2/Dockerfile +++ b/mmseqs2/Dockerfile @@ -13,7 +13,7 @@ RUN pip3 install fastapi uvicorn COPY app.py app.py # Expose the port on which FastAPI will run -EXPOSE 8000 +EXPOSE 8001 # Start the FastAPI server when the container starts CMD ["python3", "app.py"] diff --git a/mmseqs2/app.py b/mmseqs2/app.py index 98b1586..8a4ee8e 100644 --- a/mmseqs2/app.py +++ b/mmseqs2/app.py @@ -1,4 +1,4 @@ -from fastapi import FastAPI, HTTPException +from fastapi import FastAPI, HTTPException, Request from pydantic import BaseModel import subprocess import os @@ -8,16 +8,16 @@ app = FastAPI() # Define a model for the input parameters -class MMSeqsParams(BaseModel): - query: str # The query sequence - database: str - output: str # The output directory - sensitivity: float = 7.5 # Sensitivity parameter for mmseqs2 - threads: int = 4 # Number of threads to use - blast_format: bool = True # Option to convert to BLAST+ format +# class MMSeqsParams(BaseModel): +# query: str # The query sequence +# database: str +# output: str # The output directory +# sensitivity: float = 7.5 # Sensitivity parameter for mmseqs2 +# threads: int = 4 # Number of threads to use +# blast_format: bool = True # Option to convert to BLAST+ format -# Dictionary to keep track of running jobs and results -job_results = {} +# # Dictionary to keep track of running jobs and results +# job_results = {} def create_fastas_file_from_seq(seq, filename): with open(filename, 'w') as file: @@ -45,34 +45,58 @@ def create_queryDB_from_seq(filename): async def read_root(): return {"message": "Welcome to the MMSeqs2 API!"} +@app.get("/help") +def help(): + try: + results = subprocess.run( + ["mmseqs", "--help"], + capture_output=True, + text=True, + ) + return {"help": results.stdout} + except subprocess.CalledProcessError as e: + raise HTTPException(status_code=400, detail=f"Command failed {e.stderr}") + @app.post("/run_mmseqs") -async def run_mmseqs(params: MMSeqsParams): +async def run_mmseqs(request: Request): + + data = await request.json() + + print(f" Received request to run blastp with data: {data}")) + + query_filename =f"in.fasta" + result_filename = f"out.out" + + # Clear or create result file + open(result_filename, 'w').close() + + # Create the fasta file from the query string + create_fastas_file_from_seq(data['query'], query_filename) # Create a unique job id - job_id = str(uuid4()) - output_dir = f"/tmp/{job_id}" + # job_id = str(uuid4()) + # output_dir = f"/tmp/{job_id}" - # Prepare the output directory - os.makedirs(output_dir, exist_ok=True) + # # Prepare the output directory + # os.makedirs(output_dir, exist_ok=True) - # Prepare paths - result_m8_path = os.path.join(output_dir, "result.m8") - result_tsv_path = os.path.join(output_dir, "result.tsv") + # # Prepare paths + # result_m8_path = os.path.join(output_dir, "result.m8") + # result_tsv_path = os.path.join(output_dir, "result.tsv") - # Create the FASTA file - path_query = os.path.join(output_dir, "query.fasta") - path_queryDB = path_query.replace('fasta', '') + ".db" - create_fastas_file_from_seq(params.query, path_query) - create_queryDB_from_seq(path_query) + # # Create the FASTA file + # path_query = os.path.join(output_dir, "query.fasta") + # path_queryDB = path_query.replace('fasta', '') + ".db" + # create_fastas_file_from_seq(params.query, path_query) + # create_queryDB_from_seq(path_query) # Run the mmseqs2 search command command = [ "mmseqs", "search", - path_queryDB, - params.database, - os.path.join(output_dir, "result"), - output_dir, - "--threads", str(params.threads), - "--sensitivity", str(params.sensitivity) + query_filename, + data['db'], + result_filename, + "--threads", str(data['threads']), + "--sensitivity", str(data['sensitivity']) ] try: @@ -80,15 +104,14 @@ async def run_mmseqs(params: MMSeqsParams): subprocess.run(command, check=True) # Convert the results to BLAST+ format if requested - if params.blast_format: + if data['blast_format']: # mmseqs convertalis queryDB targetDB resultDB resultDB.m8 # Convert to BLAST tabular format (BLAST m8 format) convert_command = [ "mmseqs", "convertalis", - params.query, - params.database, - os.path.join(output_dir, "result"), - result_m8_path, + data['query'], + data['database'], + result_filename, ] subprocess.run(convert_command, check=True) @@ -130,4 +153,4 @@ async def get_results(job_id: str): if __name__ == '__main__': import uvicorn - uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True) \ No newline at end of file + uvicorn.run("app:app", host="0.0.0.0", port=8001, reload=True) \ No newline at end of file From 3e6eecd1cf55911c7bd376e7443c5662446f6c78 Mon Sep 17 00:00:00 2001 From: alacheim Date: Sun, 5 Jan 2025 12:48:56 +0000 Subject: [PATCH 2/4] finished implementation of app.py- still not working --- mmseqs2/Dockerfile | 5 +- mmseqs2/app.py | 131 +++++++++++----------------------- mmseqs2/reload_development.sh | 8 +-- 3 files changed, 49 insertions(+), 95 deletions(-) diff --git a/mmseqs2/Dockerfile b/mmseqs2/Dockerfile index c628220..ef882dd 100644 --- a/mmseqs2/Dockerfile +++ b/mmseqs2/Dockerfile @@ -10,7 +10,10 @@ RUN apt-get update && apt-get install -y python3 python3-pip RUN pip3 install fastapi uvicorn # Copy the FastAPI app to the container -COPY app.py app.py +COPY app.py /usr/local/bin/app.py + +# Set the working directory +WORKDIR /usr/local/bin # Expose the port on which FastAPI will run EXPOSE 8001 diff --git a/mmseqs2/app.py b/mmseqs2/app.py index 8a4ee8e..24fc887 100644 --- a/mmseqs2/app.py +++ b/mmseqs2/app.py @@ -1,32 +1,22 @@ from fastapi import FastAPI, HTTPException, Request -from pydantic import BaseModel +import logging + import subprocess import os -from uuid import uuid4 -import shutil app = FastAPI() +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) +logger.info("FastAPI server is running...") -# Define a model for the input parameters -# class MMSeqsParams(BaseModel): -# query: str # The query sequence -# database: str -# output: str # The output directory -# sensitivity: float = 7.5 # Sensitivity parameter for mmseqs2 -# threads: int = 4 # Number of threads to use -# blast_format: bool = True # Option to convert to BLAST+ format - -# # Dictionary to keep track of running jobs and results -# job_results = {} -def create_fastas_file_from_seq(seq, filename): - with open(filename, 'w') as file: - file.write(f">seq\n{seq}\n") +def create_fastas_file_from_seq(queries, filename): + with open(filename, 'w') as f: + for idx, query in enumerate(queries): + f.write(f">seq{idx}\n{query}\n") + print(f"FASTA file created: {filename}") def create_queryDB_from_seq(filename): - # this will create a db from a single sequence file - # the command is mmseqs createdb - # the output should be a file with the same name as the input but with the extension .db command = [ "mmseqs", "createdb", @@ -49,7 +39,7 @@ async def read_root(): def help(): try: results = subprocess.run( - ["mmseqs", "--help"], + ["mmseqs", "-h"], capture_output=True, text=True, ) @@ -57,12 +47,17 @@ def help(): except subprocess.CalledProcessError as e: raise HTTPException(status_code=400, detail=f"Command failed {e.stderr}") -@app.post("/run_mmseqs") -async def run_mmseqs(request: Request): + +@app.post("/easycluster") +async def easycluster(request: Request): + + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__name__) data = await request.json() + logger.info(f"Received request data: {data}") - print(f" Received request to run blastp with data: {data}")) + print(f" Received request to run mmseqs with data: {data}") query_filename =f"in.fasta" result_filename = f"out.out" @@ -72,82 +67,38 @@ async def run_mmseqs(request: Request): # Create the fasta file from the query string create_fastas_file_from_seq(data['query'], query_filename) - # Create a unique job id - # job_id = str(uuid4()) - # output_dir = f"/tmp/{job_id}" - - # # Prepare the output directory - # os.makedirs(output_dir, exist_ok=True) - - # # Prepare paths - # result_m8_path = os.path.join(output_dir, "result.m8") - # result_tsv_path = os.path.join(output_dir, "result.tsv") - - # # Create the FASTA file - # path_query = os.path.join(output_dir, "query.fasta") - # path_queryDB = path_query.replace('fasta', '') + ".db" - # create_fastas_file_from_seq(params.query, path_query) - # create_queryDB_from_seq(path_query) # Run the mmseqs2 search command + # command = [ + # "mmseqs", "easy-cluster", + # query_filename, + # result_filename, + # "--min-seq-id", request['min_seq_id'], + # "-c", request['coverage'], + # "--cov-mode", request['cov_mode'], + # "tmp" + # ] command = [ - "mmseqs", "search", + "mmseqs", "easy-cluster", query_filename, - data['db'], result_filename, - "--threads", str(data['threads']), - "--sensitivity", str(data['sensitivity']) + "--min-seq-id", str(data['min_seq_id']), + "-c", str(data['coverage']), + "--cov-mode", str(data['cov_mode']), + "tmp" ] - + logger.info(f"Running command: {' '.join(command)}") + try: - # Execute mmseqs search subprocess.run(command, check=True) - - # Convert the results to BLAST+ format if requested - if data['blast_format']: - # mmseqs convertalis queryDB targetDB resultDB resultDB.m8 - # Convert to BLAST tabular format (BLAST m8 format) - convert_command = [ - "mmseqs", "convertalis", - data['query'], - data['database'], - result_filename, - ] - subprocess.run(convert_command, check=True) - - # Store the result path for m8 format - job_results[job_id] = { - "status": "completed", - "result_path": result_m8_path - } - else: - # Store the result path for standard mmseqs2 output (TSV format) - job_results[job_id] = { - "status": "completed", - "result_path": result_tsv_path - } - - return {"job_id": job_id} except subprocess.CalledProcessError as e: - raise HTTPException(status_code=500, detail=f"mmseqs2 failed: {str(e)}") - -@app.get("/results/{job_id}") -async def get_results(job_id: str): - # Check if the job exists - if job_id not in job_results: - raise HTTPException(status_code=404, detail="Job not found") - - # Get the result path - result = job_results[job_id] + logger.error(f"MMSeqs command failed: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + with open(result_filename, 'r') as file: + result = file.read() - # Read and return the result (assuming it's a text file you want to read and return) - result_file = result["result_path"] - if os.path.exists(result_file): - with open(result_file, "r") as file: - data = file.read() - return {"status": result["status"], "results": data} - else: - raise HTTPException(status_code=404, detail="Result file not found") + return result if __name__ == '__main__': diff --git a/mmseqs2/reload_development.sh b/mmseqs2/reload_development.sh index d074569..9099a5a 100644 --- a/mmseqs2/reload_development.sh +++ b/mmseqs2/reload_development.sh @@ -1,4 +1,4 @@ -sudo docker stop mmseq_docker -sudo docker remove mmseq_docker -sudo docker build --no-cache -t mmseq_docker . -sudo docker run --name mmseq_docker --volume /mnt/databases:/app -p 8000:8000 mmseq_docker \ No newline at end of file +sudo docker stop mmseqs +sudo docker remove mmseqs +sudo docker build --no-cache -t mmseqs_docker . +sudo docker run -d --name mmseqs -p 8001:8001 mmseqs_docker \ No newline at end of file From 8dd6b62ad1ca5234868fbf8f05fd705204ab8b89 Mon Sep 17 00:00:00 2001 From: alacheim Date: Sun, 12 Jan 2025 12:34:56 +0000 Subject: [PATCH 3/4] clustering works but it does not return the correct file --- mmseqs2/app.py | 157 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 117 insertions(+), 40 deletions(-) diff --git a/mmseqs2/app.py b/mmseqs2/app.py index 24fc887..6967aab 100644 --- a/mmseqs2/app.py +++ b/mmseqs2/app.py @@ -10,12 +10,56 @@ logger.info("FastAPI server is running...") -def create_fastas_file_from_seq(queries, filename): - with open(filename, 'w') as f: - for idx, query in enumerate(queries): - f.write(f">seq{idx}\n{query}\n") - print(f"FASTA file created: {filename}") +def create_fastas_file_from_seq(query_string, filename): + """ + Creates a FASTA file from a single string containing FASTA-formatted sequences. + + Args: + query_string (str): String containing FASTA-formatted sequences. + filename (str): Path to the output FASTA file. + + Raises: + ValueError: If any sequence contains invalid characters. + """ + def validate_sequence(sequence: str) -> bool: + """Validate that a sequence contains only valid amino acid characters.""" + valid_chars = set("ACDEFGHIKLMNPQRSTVWY*X") # Allow amino acids + stop codon (*), unknown (X) + sequence = sequence.upper().strip().replace("\n", "") # Remove whitespace and newlines + return all(char in valid_chars for char in sequence) + + # Split query string into lines + lines = query_string.strip().split("\n") + + # Parse headers and sequences + multifasta = [] + current_header = None + current_sequence = [] + + for line in lines: + if line.startswith(">"): # Header line + if current_header: # Save the previous sequence + sequence = "".join(current_sequence) + if not validate_sequence(sequence): + raise ValueError(f"Invalid characters in sequence under {current_header}") + multifasta.append(f"{current_header}\n{sequence}") + current_header = line.strip() # Update header + current_sequence = [] # Reset sequence buffer + else: # Sequence line + current_sequence.append(line.strip()) + + # Add the last sequence + if current_header and current_sequence: + sequence = "".join(current_sequence) + if not validate_sequence(sequence): + raise ValueError(f"Invalid characters in sequence under {current_header}") + multifasta.append(f"{current_header}\n{sequence}") + + # Write to file + with open(filename, 'w', encoding='utf-8') as f: + f.write("\n".join(multifasta) + "\n") # Ensure newline at end of file + print(f"FASTA file created: {filename}") + def create_queryDB_from_seq(filename): command = [ @@ -48,58 +92,91 @@ def help(): raise HTTPException(status_code=400, detail=f"Command failed {e.stderr}") -@app.post("/easycluster") -async def easycluster(request: Request): +# @app.post("/easycluster") +# async def easycluster(request: Request): - logging.basicConfig(level=logging.INFO) - logger = logging.getLogger(__name__) +# logging.basicConfig(level=logging.INFO) +# logger = logging.getLogger(__name__) - data = await request.json() - logger.info(f"Received request data: {data}") +# data = await request.json() +# logger.info(f"Received request data: {data}") + +# print(f" Received request to run mmseqs with data: {data}") + +# query_filename =f"in.fasta" +# result_filename = f"out" - print(f" Received request to run mmseqs with data: {data}") +# # Clear or create result file +# open(result_filename, 'w').close() - query_filename =f"in.fasta" - result_filename = f"out.out" +# # Create the fasta file from the query string +# create_fastas_file_from_seq(data['query'], query_filename) + +# # Run the mmseqs2 search command +# # command = [ +# # "mmseqs", "easy-cluster", +# # query_filename, +# # result_filename, +# # "--min-seq-id", request['min_seq_id'], +# # "-c", request['coverage'], +# # "--cov-mode", request['cov_mode'], +# # "tmp" +# # ] +# command = ["mmseqs", "easy-cluster", query_filename, result_filename, "tmp"] + +# logger.info(f"Running command: {' '.join(command)}") - # Clear or create result file - open(result_filename, 'w').close() +# try: +# subprocess.run(command, check=True) +# except subprocess.CalledProcessError as e: +# logger.error(f"MMSeqs command failed: {e}") +# raise HTTPException(status_code=500, detail=str(e)) + +# with open(result_filename, 'r') as file: +# result = file.read() + +# return result + +@app.post("/easycluster") +async def easycluster(request: Request): + data = await request.json() + logger.info(f"Received request data: {data}") - # Create the fasta file from the query string + BASE_DIR = "/app" + query_filename = os.path.join(BASE_DIR, "in.fasta") + result_filename = os.path.join(BASE_DIR, "output.out") + tmp_dir = os.path.join(BASE_DIR, "tmp") + + os.makedirs(tmp_dir, exist_ok=True) + open(result_filename, 'w').close() # Clear or create result file + + # Create the FASTA file from the query string create_fastas_file_from_seq(data['query'], query_filename) - # Run the mmseqs2 search command - # command = [ - # "mmseqs", "easy-cluster", - # query_filename, - # result_filename, - # "--min-seq-id", request['min_seq_id'], - # "-c", request['coverage'], - # "--cov-mode", request['cov_mode'], - # "tmp" - # ] - command = [ - "mmseqs", "easy-cluster", - query_filename, - result_filename, - "--min-seq-id", str(data['min_seq_id']), - "-c", str(data['coverage']), - "--cov-mode", str(data['cov_mode']), - "tmp" - ] + # Run the mmseqs2 command + command = ["mmseqs", "easy-cluster", query_filename, result_filename, tmp_dir] logger.info(f"Running command: {' '.join(command)}") try: - subprocess.run(command, check=True) + results = subprocess.run(command, capture_output=True, text=True, check=True) + logger.info(f"Command output: {results.stdout}") + except subprocess.CalledProcessError as e: - logger.error(f"MMSeqs command failed: {e}") - raise HTTPException(status_code=500, detail=str(e)) + logger.error(f"Command failed with return code {e.returncode}") + logger.error(f"STDOUT: {e.stdout}") + logger.error(f"STDERR: {e.stderr}") + raise HTTPException(status_code=500, detail=f"Command failed: {e.stderr}") + + # with open(f"/app/{result_filename}.out_all_seqs.fasta", 'r') as file: + # result = file.read() with open(result_filename, 'r') as file: + print(f"Reading result file: {result_filename}") result = file.read() - return result + return {"result": result} +# add easy search if __name__ == '__main__': import uvicorn From 2241ccb076d0870e9f61d4dae0a13252f1cea998 Mon Sep 17 00:00:00 2001 From: alacheim Date: Tue, 14 Jan 2025 08:02:56 +0000 Subject: [PATCH 4/4] fixed errors- functions for easy clustering --- mmseqs2/app.py | 94 ++++++++++---------------------------------------- 1 file changed, 19 insertions(+), 75 deletions(-) diff --git a/mmseqs2/app.py b/mmseqs2/app.py index 6967aab..5cafb98 100644 --- a/mmseqs2/app.py +++ b/mmseqs2/app.py @@ -1,8 +1,10 @@ from fastapi import FastAPI, HTTPException, Request +from starlette.responses import FileResponse import logging import subprocess import os +import shutil app = FastAPI() logging.basicConfig(level=logging.INFO) @@ -60,21 +62,6 @@ def validate_sequence(sequence: str) -> bool: print(f"FASTA file created: {filename}") -def create_queryDB_from_seq(filename): - - command = [ - "mmseqs", "createdb", - filename, - filename.replace('fasta', '') + ".db" - ] - - try: - subprocess.run(command, check=True) - - except subprocess.CalledProcessError as e: - raise HTTPException(status_code=600, detail=str(e)) - - @app.get("/") async def read_root(): return {"message": "Welcome to the MMSeqs2 API!"} @@ -91,52 +78,6 @@ def help(): except subprocess.CalledProcessError as e: raise HTTPException(status_code=400, detail=f"Command failed {e.stderr}") - -# @app.post("/easycluster") -# async def easycluster(request: Request): - -# logging.basicConfig(level=logging.INFO) -# logger = logging.getLogger(__name__) - -# data = await request.json() -# logger.info(f"Received request data: {data}") - -# print(f" Received request to run mmseqs with data: {data}") - -# query_filename =f"in.fasta" -# result_filename = f"out" - -# # Clear or create result file -# open(result_filename, 'w').close() - -# # Create the fasta file from the query string -# create_fastas_file_from_seq(data['query'], query_filename) - -# # Run the mmseqs2 search command -# # command = [ -# # "mmseqs", "easy-cluster", -# # query_filename, -# # result_filename, -# # "--min-seq-id", request['min_seq_id'], -# # "-c", request['coverage'], -# # "--cov-mode", request['cov_mode'], -# # "tmp" -# # ] -# command = ["mmseqs", "easy-cluster", query_filename, result_filename, "tmp"] - -# logger.info(f"Running command: {' '.join(command)}") - -# try: -# subprocess.run(command, check=True) -# except subprocess.CalledProcessError as e: -# logger.error(f"MMSeqs command failed: {e}") -# raise HTTPException(status_code=500, detail=str(e)) - -# with open(result_filename, 'r') as file: -# result = file.read() - -# return result - @app.post("/easycluster") async def easycluster(request: Request): data = await request.json() @@ -144,7 +85,7 @@ async def easycluster(request: Request): BASE_DIR = "/app" query_filename = os.path.join(BASE_DIR, "in.fasta") - result_filename = os.path.join(BASE_DIR, "output.out") + result_filename = os.path.join(BASE_DIR, "output") tmp_dir = os.path.join(BASE_DIR, "tmp") os.makedirs(tmp_dir, exist_ok=True) @@ -154,29 +95,32 @@ async def easycluster(request: Request): create_fastas_file_from_seq(data['query'], query_filename) # Run the mmseqs2 command - command = ["mmseqs", "easy-cluster", query_filename, result_filename, tmp_dir] + command = [ + "mmseqs", + "easy-cluster", + query_filename, + result_filename, + '--min-seq-id', str(data['min_seq_id']), + '-c', str(data['coverage']), + '--cov-mode', str(data['cov_mode']), + tmp_dir] logger.info(f"Running command: {' '.join(command)}") try: - results = subprocess.run(command, capture_output=True, text=True, check=True) - logger.info(f"Command output: {results.stdout}") + result = subprocess.run(command, capture_output=True, text=True, check=True) + logger.info(f"Command output: {result.stdout}") except subprocess.CalledProcessError as e: logger.error(f"Command failed with return code {e.returncode}") logger.error(f"STDOUT: {e.stdout}") logger.error(f"STDERR: {e.stderr}") raise HTTPException(status_code=500, detail=f"Command failed: {e.stderr}") - - # with open(f"/app/{result_filename}.out_all_seqs.fasta", 'r') as file: - # result = file.read() - - with open(result_filename, 'r') as file: - print(f"Reading result file: {result_filename}") - result = file.read() - - return {"result": result} -# add easy search + with open("/app/output_all_seqs.fasta", 'r') as file: + logger.info(f"Reading result file: /app/output_all_seqs.fasta") + result = file.read() + + return result if __name__ == '__main__': import uvicorn