initial code upload

HKU-BAL · Oct 19, 2024 · 327fce8 · 327fce8
1 parent 3675a99
commit 327fce8
Show file tree

Hide file tree

Showing 9 changed files with 1,393 additions and 1 deletion.
diff --git a/AutoPM3_main.py b/AutoPM3_main.py
diff --git a/PM3-Bench/README.md b/PM3-Bench/README.md
diff --git a/README.md b/README.md
@@ -1 +1,112 @@
-# AutoPM3
+# AutoPM3: Enhancing Variant Interpretation via LLM-driven PM3 Evidence Extraction from Scientific Literature
+
+[![License](https://img.shields.io/badge/license-MIT-blue)](https://opensource.org/license/mit/) 
+
+
+Contact: Ruibang Luo, Shumin Li
+
+Email: [email protected], [email protected]
+
+
+## Introduction
+We introduce AutoPM3, a method for automating the extraction of ACMG/AMP PM3 evidence from scientific literature using open-source LLMs. It combines an optimized RAG system for text comprehension and a TableLLM equipped with Text2SQL for data extraction. We evaluated AutoPM3 using our collected PM3-Bench, a dataset from ClinGen with 1,027 variant-publication pairs. AutoPM3 significantly outperformed other methods in variant hit and in trans variant identification, thanks to the four key modules. Additionally, we wrapped AutoPM3 with a user-friendly interface to enhance its accessibility. This study presents a powerful tool to improve rare disease diagnosis workflows by facilitating PM3-relevant evidence extraction from scientific literature.
+
+![](./images/img1.png)
+---
+
+## Contents
+
+- [Latest Updates](#latest-updates)
+- [Installation](#installation)
+    - [Dependency Installation](#dependency-installation)
+    - [Ollama Setup](#using-ollama-to-host-llms)
+- [Usage](#usage)
+    - [Quick Start](#quick-start)
+    - [Advanced Usage](#advanced-usage-of-the-python-script)
+- [TODO](#todo)
+---
+
+## Latest Updates
+* v0.1 (Oct, 2024): Initial release.
+---
+
+## Installation
+### Dependency Installation
+```bash
+conda create -n AutoPM3 python=3.10
+conda activate AutoPM3
+pip3 install -r requirements.txt
+```
+
+### Using Ollama to host LLMs
+1. Download Ollama [Guidance](https://github.com/ollama/ollama)  
+2. Change the directory of Ollama models:
+```bash
+# please change the target folder as you prefer
+mkdir ollama_models
+export OLLAMA_MODELS=./ollama_models
+```
+
+3. Launch Ollama server:
+
+```bash
+
+ollama serve
+
+```
+
+4. Download the sqlcoder-mistral-7B model and the fine-tuned Llama3:
+```bash
+cd $OLLAMA_MODELS
+wget https://huggingface.co/MaziyarPanahi/sqlcoder-7b-Mistral-7B-Instruct-v0.2-slerp-GGUF/resolve/main/sqlcoder-7b-Mistral-7B-Instruct-v0.2-slerp.Q8_0.gguf?download=true
+mv 'sqlcoder-7b-Mistral-7B-Instruct-v0.2-slerp.Q8_0.gguf?download=true' 'sqlcoder-7b-Mistral-7B-Instruct-v0.2-slerp.Q8_0.gguf'
+echo "FROM ./sqlcoder-7b-Mistral-7B-Instruct-v0.2-slerp.Q8_0.gguf" >Modelfile1
+ollama create sqlcoder-7b-Mistral-7B-Instruct-v0.2-slerp.Q8_0 -f Modelfile1
+
+wget http://bio8.cs.hku.hk/AutoPM3/llama3_loraFT-8b-f16.gguf
+echo "FROM ./llama3_loraFT-8b-f16.gguf" >Modelfile2
+ollama create llama3_loraFT-8b-f16 -f Modelfile2
+
+```
+
+5. Check the created models:
+
+```bash
+
+ollama list
+
+```
+
+6. (Optional) Download other models as the backend of the RAG system:
+```
+# e.g. download Llama3:70B
+ollama pull llama3:70B
+
+```
+
+## Usage
+
+### Quick start
+
+* Step 1. Launch the local web-server:
+```bash
+streamlit run lit.py
+```
+* Step 2. Open `http://localhost:8501` in your browser to start using AutoPM3.
+
+### Advanced usage of the python script
+
+* Check the help of AutoPM3_main.py
+```bash
+python AutoPM3_main.py -h
+```
+* Example of running the Python script:
+```bash
+# --query_variant:   query variant in HGVS format
+# --paper_path:      path to the paper XML
+# --model_name_text: backend model of the RAG system; change to llama3:70b or another hosted model as you prefer (note that you need to pull the model in Ollama in advance)
+python AutoPM3_main.py \
+    --query_variant "NM_004004.5:c.516G>C" \
+    --paper_path ./xml_papers/20201936.xml \
+    --model_name_text llama3_loraFT-8b-f16
+```
+
+## TODO
+* A fast setup for AutoPM3.
diff --git a/images/img1.png b/images/img1.png
diff --git a/lit.py b/lit.py
@@ -0,0 +1,66 @@
+import streamlit as st
+import os
+import requests
+
+from AutoPM3_main import query_variant_in_paper_xml
+
+
+# Function to load a XML file from a URL
+def load_xml(url,pmid):
+
+    temp_paper_file_root = "./xml_papers"
+    if(not os.path.exists(temp_paper_file_root)):
+        os.mkdir(temp_paper_file_root)
+    fn = str(pmid)+".xml"
+    xml_path = os.path.join(temp_paper_file_root,fn)
+    if(os.path.exists(xml_path)):
+        return xml_path
+
+    headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
+    }
+    response = requests.get(url, headers=headers)
+    if response.status_code == 200 and response.headers['Content-type'] == 'text/xml':
+        xml_data = response.content
+        # save it to the temp dir
+        with open(xml_path, 'wb') as f:
+            f.write(response.content)
+        return xml_path
+    else:
+        raise Exception('Invalid PMID. Make sure the publication has OpenAccess.')
+        return None
+
+
+# Function to display model results
+def display_results(model, data):
+    # Assuming 'model' is your trained model and 'data' is the input to the model
+    results = model.predict(data)
+    st.write(results)
+
+# Main
+st.title('AutoPM3')
+
+variant_name = st.text_input('Step 1. Enter the variant (HGVS notation)')
+
+# Get the URL of the XML from the user
+paper_url = ''
+pmid = st.text_input('Step 2. Enter the PMID of the paper')
+if pmid:
+    try:
+        pmid = int(pmid)
+        paper_url = f'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/{pmid}/unicode'
+    except ValueError:
+        st.write('Invalid PMID.')
+
+if st.button('Run', type='primary'):
+    summarized_results = ""
+    if paper_url and variant_name:
+        try:
+            # Load and display the XML
+            xml_path = load_xml(paper_url,pmid)
+            summarized_results = query_variant_in_paper_xml(variant_name, xml_path, 'sqlcoder-7b-Mistral-7B-Instruct-v0.2-slerp.Q8_0', 'llama3_loraFT-8b-f16')
+            # Display the summarized results
+            st.write(summarized_results)
+        except Exception as e:
+            st.write('An error has occurred.')
+            st.write(str(e))
diff --git a/protein.txt b/protein.txt
@@ -0,0 +1,21 @@
+Ala	A
+Arg	R
+Asn	N
+Asp	D
+Cys	C
+Gln	Q
+Glu	E
+Gly	G
+His	H
+Ile	I
+Leu	L
+Lys	K
+Met	M
+Phe	F
+Pro	P
+Ser	S
+Ter	X
+Thr	T
+Trp	W
+Tyr	Y
+Val	V
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,85 @@
+aiohappyeyeballs==2.4.3
+aiohttp==3.10.10
+aiosignal==1.3.1
+altair==5.4.1
+annotated-types==0.7.0
+anyio==4.6.2.post1
+async-timeout==4.0.3
+attrs==24.2.0
+beautifulsoup4==4.12.2
+bioc==2.1
+blinker==1.8.2
+cachetools==5.5.0
+certifi==2024.8.30
+charset-normalizer==3.4.0
+click==8.1.7
+dataclasses-json==0.6.7
+docopt==0.6.2
+exceptiongroup==1.2.2
+frozenlist==1.4.1
+func_timeout==4.3.5
+gitdb==4.0.11
+GitPython==3.1.43
+greenlet==3.1.1
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+idna==3.10
+intervaltree==3.1.0
+Jinja2==3.1.4
+jsonlines==4.0.0
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+langchain==0.2.16
+langchain-community==0.2.6
+langchain-core==0.2.41
+langchain-experimental==0.0.63
+langchain-text-splitters==0.2.4
+langsmith==0.1.136
+lxml==5.2.2
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+marshmallow==3.23.0
+mdurl==0.1.2
+multidict==6.1.0
+mypy-extensions==1.0.0
+narwhals==1.9.4
+numpy==1.26.4
+orjson==3.10.7
+packaging==24.1
+pandas==2.2.2
+pillow==10.4.0
+propcache==0.2.0
+protobuf==5.28.2
+pyarrow==17.0.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydeck==0.9.1
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+pytz==2024.2
+PyYAML==6.0.2
+referencing==0.35.1
+requests==2.32.3
+requests-toolbelt==1.0.0
+rich==13.9.2
+rpds-py==0.20.0
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+SQLAlchemy==2.0.36
+streamlit==1.39.0
+tenacity==8.5.0
+toml==0.10.2
+tornado==6.4.1
+tqdm==4.66.5
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+watchdog==5.0.3
+yarl==1.15.5
-Original file line number
+Diff line change
@@ -0,0 +1,21 @@
+    Ala	A
+    Arg	R
+    Asn	N
+    Asp	D
+    Cys	C
+    Gln	Q
+    Glu	E
+    Gly	G
+    His	H
+    Ile	I
+    Leu	L
+    Lys	K
+    Met	M
+    Phe	F
+    Pro	P
+    Ser	S
+    Ter	X
+    Thr	T
+    Trp	W
+    Tyr	Y
+    Val	V