This repository has been archived by the owner on Oct 26, 2022. It is now read-only.

Allow .h5ad, .h5 and mtx as input and fix requirements (#264)
* Update utils.py to include `_read_mtx`, `_read_h5ad` and `_read_h5`
* Update `method_launcher.py`
* Update `cpdb_statistical_analysis_helper.py` and `cpdb_statistical_analysis_complex_method.py`
* Requirements: drop pandas version pins (fixes compatibility across Python versions); add numpy, h5py and anndata; for now, scikit-learn (a geosketch dependency) remains pinned at 0.22 until further testing of geosketch v1.2
* Update `requirements.txt` and `win-requirements.txt`, and make `setup.py` pick up requirements from `requirements.txt`


Co-authored-by: @prete
zktuong authored Feb 23, 2021
1 parent 4eb22a2 commit 85b337c
Showing 13 changed files with 124 additions and 76 deletions.
49 changes: 25 additions & 24 deletions README.md
@@ -14,52 +14,53 @@ To start using CellPhoneDB, you can use our interactive web application ([cellph


### Installing CellPhoneDB
NOTE: Works with Python v3.5 or greater. If your default Python interpreter is for `v2.x` (you can check it with `python --version`), calls to `python`/`pip` should be substituted by `python3`/`pip3`.
NOTE: Works with Python v3.6 or greater. If your default Python interpreter is for `v2.x` (you can check it with `python --version`), calls to `python`/`pip` should be substituted by `python3`/`pip3`.

We highly recommend using a python virtual environment (as described in steps 1 and 2) but you could of course omit these steps and install via `pip` immediately.
We highly recommend using an isolated python environment (as described in steps 1 and 2) using [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or [virtualenv](https://docs.python.org/3/library/venv.html) but you could of course omit these steps and install via `pip` immediately.

1. Create python>3.5 virtualenv
```shell
python -m venv cpdb-venv
```
1. Create a python>=3.6 environment
- Using conda: `conda create -n cpdb python=3.7`
- Using virtualenv: `python -m venv cpdb`

2. Activate virtualenv
```shell
source cpdb-venv/bin/activate
```
2. Activate environment
- Using conda: `source activate cpdb`
- Using virtualenv: `source cpdb/bin/activate`

3. Install CellPhoneDB
```shell
pip install cellphonedb
```
3. Install CellPhoneDB: `pip install cellphonedb`


## Running CellPhoneDB Methods

Please, run step 0 if you didn't activate the virtualenv previously

0. Activate virtualenv
```shell
source cpdb-venv/bin/activate
```
Please activate your environment if you didn't do so previously
- Using conda: `source activate cpdb`
- Using virtualenv: `source cpdb/bin/activate`

To use the example data, please [download meta/counts test data](https://github.com/Teichlab/cellphonedb/blob/master/in/example_data/cellphonedb_example_data.zip?raw=true). i.e.
```shell
curl https://raw.githubusercontent.com/Teichlab/cellphonedb/master/in/example_data/test_counts.txt --output test_counts.txt
curl https://raw.githubusercontent.com/Teichlab/cellphonedb/master/in/example_data/test_meta.txt --output test_meta.txt
```

Note: the counts file can be a text file, `h5ad` (recommended), `h5`, or a path to a folder containing `mtx/barcodes/features` files.
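To unpack the note above: an AnnData `.h5ad` stores its matrix as cells × genes (obs × var), while CellPhoneDB's counts table puts genes on rows and cell barcodes on columns — which is why the new reader transposes (`adata.to_df().T`). A minimal pandas-only sketch of that orientation (barcodes and gene names are illustrative):

```python
import pandas as pd

# AnnData's .to_df() yields a cells x genes (obs x var) frame ...
cells_by_genes = pd.DataFrame(
    [[0.0, 1.2], [3.4, 0.0]],
    index=["AAACCTG-1", "AAACGGG-1"],   # cell barcodes (obs)
    columns=["CD4", "CD8A"],            # gene names (var)
)

# ... while CellPhoneDB counts tables put genes on rows and cells on
# columns, hence the transpose performed by the h5ad reader.
counts = cells_by_genes.T
```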


#### Example with running the statistical method
```shell
cellphonedb method statistical_analysis test_meta.txt test_counts.txt
cellphonedb method statistical_analysis test_meta.txt test_counts.txt
```


#### Example without using the statistical method
**Using text files**
```shell
cellphonedb method analysis test_meta.txt test_counts.txt
```

**Using h5ad count file**
```shell
cellphonedb method analysis test_meta.txt test_counts.h5ad
```

Please check the [results documentation](Docs/RESULTS-DOCUMENTATION.md) in order to understand the results.

### Optional Parameters
@@ -86,7 +87,7 @@ Please check the [results documentation](Docs/RESULTS-DOCUMENTATION.md) in order
- `--pvalues-result-name`: P-values result filename [pvalues]
- `--pvalue`: P-value threshold [0.05]
- `--debug-seed`: Debug random seed -1. To disable it please use a value >=0 [-1]
- `--threads`: Number of threads to use. >=1 [-1]
- `--threads`: Number of threads to use. >=1 [4]

### Usage Examples

@@ -126,7 +127,7 @@ cellphonedb plot dot_plot
cellphonedb plot heatmap_plot yourmeta.txt
```

### `dot_pot`
### `dot_plot`
This plot type requires `ggplot2` R package installed and working

You can tweak the options for the plot with these arguments:
@@ -223,7 +224,7 @@ Generate specific parameters:

- `--user-protein`: Protein input file
- `--user-gene`: Gene input file
- `--user-complex`: Compex input file
- `--user-complex`: Complex input file
- `--user-interactions`: Interactions input file
- `--fetch`: Some lists can be downloaded from original sources while creating the database, eg: uniprot, ensembl. By default, the snapshots included in the CellPhoneDB package will be used; to enable a fresh copy `--fetch` must be appended to the command
- `--result-path`: Output folder
18 changes: 9 additions & 9 deletions cellphonedb/src/core/generators/complex_generator.py
@@ -1,5 +1,5 @@
import pandas as pd

import numpy as np
from cellphonedb.src.app.app_logger import app_logger
from cellphonedb.src.core.generators.generator_helper import set_defaults
from cellphonedb.src.exceptions.MissingRequiredColumns import MissingRequiredColumns
@@ -20,22 +20,22 @@ def _merge_complex(base_complex: pd.DataFrame, additional: pd.DataFrame, log_fil
additional = additional.copy()

defaults = {
'uniprot_3': pd.np.nan,
'uniprot_4': pd.np.nan,
'uniprot_3': np.nan,
'uniprot_4': np.nan,
'receptor': False,
'integrin': False,
'other': False,
'other_desc': pd.np.nan,
'other_desc': np.nan,
'peripheral': False,
'receptor_desc': pd.np.nan,
'secreted_desc': pd.np.nan,
'receptor_desc': np.nan,
'secreted_desc': np.nan,
'secreted_highlight': False,
'secreted': False,
'transmembrane': False,
'pdb_structure': False,
'pdb_id': pd.np.nan,
'stoichiometry': pd.np.nan,
'comments_complex': pd.np.nan
'pdb_id': np.nan,
'stoichiometry': np.nan,
'comments_complex': np.nan
}

default_types = {
3 changes: 2 additions & 1 deletion cellphonedb/src/core/generators/generator_helper.py
@@ -1,4 +1,5 @@
import pandas as pd
import numpy as np


def set_defaults(df: pd.DataFrame, defaults: dict, quiet=False) -> pd.DataFrame:
@@ -10,6 +11,6 @@ def set_defaults(df: pd.DataFrame, defaults: dict, quiet=False) -> pd.DataFrame:
df[column_name] = default_value
continue

df[column_name].replace({pd.np.nan: default_value}, inplace=True)
df[column_name].replace({np.nan: default_value}, inplace=True)

return df
4 changes: 2 additions & 2 deletions cellphonedb/src/core/methods/cpdb_analysis_complex_method.py
@@ -1,5 +1,5 @@
import pandas as pd

import numpy as np
from cellphonedb.src.core.core_logger import core_logger
from cellphonedb.src.core.exceptions.AllCountsFilteredException import AllCountsFilteredException
from cellphonedb.src.core.exceptions.NoInteractionsFound import NoInteractionsFound
@@ -228,7 +228,7 @@ def deconvolute_interaction_component(interactions, suffix, counts_data):
deconvoluted_result[['protein_name', 'gene_name', 'name', 'is_complex', 'id_cp_interaction', 'receptor']] = \
interactions[['protein_name{}'.format(suffix), 'gene_name{}'.format(suffix), 'name{}'.format(suffix),
'is_complex{}'.format(suffix), 'id_cp_interaction', 'receptor{}'.format(suffix)]]
deconvoluted_result['complex_name'] = pd.np.nan
deconvoluted_result['complex_name'] = np.nan

return deconvoluted_result

4 changes: 2 additions & 2 deletions cellphonedb/src/core/methods/cpdb_analysis_helper.py
@@ -1,5 +1,5 @@
import pandas as pd

import numpy as np

def percent_analysis(clusters: dict,
threshold: float,
@@ -64,7 +64,7 @@ def get_significant_means(mean_analysis: pd.DataFrame,
for index, mean_analysis in mean_analysis.iterrows():
for cluster_interaction in list(result_percent.columns):
if not result_percent.at[index, cluster_interaction]:
significant_means.at[index, cluster_interaction] = pd.np.nan
significant_means.at[index, cluster_interaction] = np.nan
return significant_means


@@ -1,6 +1,7 @@
from functools import partial

import pandas as pd
import numpy as np

from cellphonedb.src.core.core_logger import core_logger
from cellphonedb.src.core.exceptions.AllCountsFilteredException import AllCountsFilteredException
@@ -31,7 +32,7 @@ def call(meta: pd.DataFrame,
threads,
result_precision))
if debug_seed >= 0:
pd.np.random.seed(debug_seed)
np.random.seed(debug_seed)
core_logger.warning('Debug random seed enabled. Set to {}'.format(debug_seed))
cells_names = sorted(counts.columns)

@@ -254,7 +255,7 @@ def deconvoluted_complex_result_build(clusters_means: pd.DataFrame,

deconvoluted_result = deconvoluted_result[deconvoluted_columns]
deconvoluted_result.rename({'name': 'uniprot'}, axis=1, inplace=True)
deconvoluted_result = pd.concat([deconvoluted_result, clusters_means], axis=1, join='inner', sort=False)
deconvoluted_result = pd.concat([deconvoluted_result, clusters_means.reindex(deconvoluted_result.index)], axis=1, join='inner', sort=False)
deconvoluted_result.set_index('gene', inplace=True, drop=True)
deconvoluted_result.drop_duplicates(inplace=True)
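The `reindex` added in this hunk aligns the `clusters_means` rows to `deconvoluted_result`'s index before the inner-join concat, so the mean columns line up row-for-row. A toy pandas sketch of the alignment (frame names and values are illustrative):

```python
import pandas as pd

deconv = pd.DataFrame({"uniprot": ["P01", "P02"]}, index=[10, 20])
cluster_means = pd.DataFrame({"cluster_a": [0.5, 1.5, 2.5]}, index=[20, 10, 30])

# Align cluster means to the deconvoluted rows before concatenating;
# rows present only in cluster_means (index 30) are dropped.
aligned = pd.concat(
    [deconv, cluster_means.reindex(deconv.index)],
    axis=1, join="inner", sort=False,
)
```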

@@ -272,7 +273,7 @@ def deconvolute_interaction_component(interactions, suffix, counts_data):
['multidata{}_id'.format(suffix), 'protein_name{}'.format(suffix), 'gene_name{}'.format(suffix),
'name{}'.format(suffix),
'is_complex{}'.format(suffix), 'id_cp_interaction', 'receptor{}'.format(suffix)]]
deconvoluted_result['complex_name'] = pd.np.nan
deconvoluted_result['complex_name'] = np.nan

return deconvoluted_result

10 changes: 6 additions & 4 deletions cellphonedb/src/core/methods/cpdb_statistical_analysis_helper.py
@@ -3,7 +3,7 @@
from multiprocessing.pool import Pool

import pandas as pd

import numpy as np
from cellphonedb.src.core.core_logger import core_logger
from cellphonedb.src.core.models.complex import complex_helper

@@ -43,7 +43,7 @@ def get_significant_means(real_mean_analysis: pd.DataFrame,
for index, mean_analysis in real_mean_analysis.iterrows():
for cluster_interaction in list(result_percent.columns):
if result_percent.at[index, cluster_interaction] > min_significant_mean:
significant_means.at[index, cluster_interaction] = pd.np.nan
significant_means.at[index, cluster_interaction] = np.nan
return significant_means


@@ -52,7 +52,9 @@ def shuffle_meta(meta: pd.DataFrame) -> pd.DataFrame:
Randomly permutes the meta values, generating a new meta file
"""
meta_copy = meta.copy()
pd.np.random.shuffle(meta_copy['cell_type'])
tmp = np.array(meta_copy['cell_type'])
np.random.shuffle(tmp)
meta_copy['cell_type'] = tmp

return meta_copy
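The change above sidesteps shuffling a pandas column in place (which relied on the removed `pd.np` alias): the labels are copied into a NumPy array, permuted there, and written back, so the frame itself is never mutated mid-shuffle. A minimal sketch of the pattern:

```python
import numpy as np
import pandas as pd

meta = pd.DataFrame({"cell_type": ["T", "B", "NK", "T"]},
                    index=["c1", "c2", "c3", "c4"])

meta_copy = meta.copy()
tmp = np.array(meta_copy["cell_type"])  # detached copy: shuffling it cannot touch the frame
np.random.shuffle(tmp)                  # in-place permutation of the copy
meta_copy["cell_type"] = tmp            # write the permuted labels back

# The label multiset is preserved; only the assignment to cells changes.
```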

@@ -83,7 +85,7 @@ def build_clusters(meta: pd.DataFrame, counts: pd.DataFrame, complex_composition
for cluster_name in cluster_names:
for complex_multidata_id in complex_multidata_ids:
complex_components = complex_composition[
complex_composition['complex_multidata_id'] == complex_multidata_id]
complex_composition['complex_multidata_id'] == complex_multidata_id].copy()
complex_components['mean'] = complex_components['protein_multidata_id'].apply(
lambda protein: clusters['means'].at[protein, cluster_name])
min_component_mean_id = complex_components['mean'].idxmin()
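The `.copy()` added to the filtered slice above matters because assigning the new `'mean'` column to a boolean-indexed view of `complex_composition` can trigger pandas' `SettingWithCopyWarning` and, depending on the version, silently fail to write. A toy sketch of the pattern (ids and values are illustrative):

```python
import pandas as pd

complex_composition = pd.DataFrame({
    "complex_multidata_id": [1, 1, 2],
    "protein_multidata_id": [101, 102, 103],
})

# Take an explicit copy of the filtered rows so the new column is
# written to an independent frame, not a view of the original.
components = complex_composition[
    complex_composition["complex_multidata_id"] == 1].copy()
components["mean"] = [0.4, 0.9]
```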
3 changes: 2 additions & 1 deletion cellphonedb/src/core/methods/method_launcher.py
@@ -1,4 +1,5 @@
import pandas as pd
import numpy as np

from cellphonedb.src.core.core_logger import core_logger
from cellphonedb.src.core.database import DatabaseManager
@@ -119,7 +120,7 @@ def _counts_validations(counts: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFram
if not len(counts.columns):
raise ParseCountsException('Counts values are not decimal values', 'Incorrect file format')
try:
counts = counts.astype(pd.np.float) # type: pd.DataFrame
counts = counts.astype(np.float) # type: pd.DataFrame
except:
raise ParseCountsException
meta.index = meta.index.astype(str)
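One caveat on the `astype(np.float)` cast kept in this hunk: `np.float` was only ever an alias for the builtin `float`, was deprecated in NumPy 1.20, and was later removed, so an explicit dtype is the durable spelling. A minimal sketch of the equivalent cast (column and gene names are illustrative):

```python
import numpy as np
import pandas as pd

counts = pd.DataFrame({"cell_1": ["1", "2"], "cell_2": ["0", "3"]},
                      index=["CD4", "CD8A"])

# Equivalent to the commit's astype(np.float); np.float is removed in
# NumPy >= 1.24, so spell the dtype explicitly.
counts = counts.astype(np.float64)
```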
37 changes: 37 additions & 0 deletions cellphonedb/utils/utils.py
@@ -3,6 +3,9 @@
import pickle
from typing import TextIO, Optional

import csv
import scipy.io
from anndata import read_h5ad
import pandas as pd
from werkzeug.datastructures import FileStorage

@@ -16,8 +19,16 @@

def read_data_table_from_file(file: str, index_column_first: bool = False, separator: str = '',
dtype=None, na_values=None, compression=None) -> pd.DataFrame:
if os.path.isdir(file):
return _read_mtx(file)

filename, file_extension = os.path.splitext(file)

if file_extension == '.h5ad':
return _read_h5ad(file)
if file_extension == '.h5':
return _read_h5(file)

if file_extension == '.pickle':
try:
with open(file, 'rb') as f:
@@ -86,6 +97,32 @@ def write_to_file(df: pd.DataFrame, filename: str, output_path: str, output_form
df.to_csv('{}/{}'.format(output_path, filename), sep=separator, index=False)


def _read_mtx(path: str) -> pd.DataFrame:

mtx_path = os.path.join(path, 'matrix.mtx')
bc_path = os.path.join(path, 'barcodes.tsv')
feature_path = os.path.join(path, 'features.tsv')

df = pd.DataFrame(scipy.io.mmread(mtx_path).toarray())
with open(bc_path) as bc_file:
df.columns = [bc[0].strip() for bc in list(csv.reader(bc_file, delimiter="\t"))]
with open(feature_path) as feature_file:
df.index = [feat[0].strip() for feat in list(csv.reader(feature_file, delimiter="\t"))]
df.index.name = 'Gene'

return df
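`_read_mtx` above expects the standard 10x-style triplet in the input folder (`matrix.mtx`, `barcodes.tsv`, `features.tsv`), keeping only the first tab-separated column of each annotation file. A self-contained sketch of the barcode/feature parsing, with the matrix replaced by a small dense stand-in (since `scipy.io.mmread` needs a real `.mtx` file on disk):

```python
import csv
import io
import pandas as pd

# Stand-ins for barcodes.tsv / features.tsv contents (tab-separated;
# only the first column is used, as in _read_mtx).
barcodes_tsv = "AAACCTG-1\textra\nAAACGGG-1\textra\n"
features_tsv = "ENSG000001\tCD4\nENSG000002\tCD8A\n"

dense = [[0, 5], [2, 0]]  # stand-in for scipy.io.mmread(mtx_path).toarray()

df = pd.DataFrame(dense)
df.columns = [row[0].strip()
              for row in csv.reader(io.StringIO(barcodes_tsv), delimiter="\t")]
df.index = [row[0].strip()
            for row in csv.reader(io.StringIO(features_tsv), delimiter="\t")]
df.index.name = "Gene"
```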

def _read_h5ad(path: str) -> pd.DataFrame:
adata = read_h5ad(path)
df = adata.to_df().T
return df


def _read_h5(path: str) -> pd.DataFrame:
df = pd.read_hdf(path)
return df


def _read_data(file_stream: TextIO, separator: str, index_column_first: bool, dtype=None,
na_values=None, compression=None) -> pd.DataFrame:
return pd.read_csv(file_stream, sep=separator, index_col=0 if index_column_first else None, dtype=dtype,
17 changes: 12 additions & 5 deletions requirements.txt
@@ -1,11 +1,18 @@
scikit-learn==0.22
click>=6.7,<6.7.99
pandas>=0.23,<0.25.99
pandas
numpy
scikit-learn==0.22
flask>=1.0,<1.0.99
Flask-RESTful>=0.3,<0.3.99
Flask-Testing>=0.7,<0.7.99
SQLAlchemy>=1.3,<1.3.99
PyYAML>=5.1,<5.1.99
requests>=2.19,<2.19.99
requests
pika>=0.12,<0.12.99
boto3>=1.7,<1.7.99
geosketch==0.3
rpy2>=3.0.4,<3.0.99
tqdm>=4.32,<4.32.99
cython>=0.29,<0.29.99
Werkzeug>=1.0.1,<1.0.99
cython>=0.29,<0.29.9
h5py<3.0.0
anndata>=0.7,<=0.75