From 23369b2055407a63ca89049c8d8023ed164c6dc6 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Thu, 4 Jun 2015 15:55:21 -0600 Subject: [PATCH 1/5] ENH: avoid full dense matrix for parallel beta --- qiime/beta_diversity.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/qiime/beta_diversity.py b/qiime/beta_diversity.py index dcd723b95d..0e2a36ba55 100644 --- a/qiime/beta_diversity.py +++ b/qiime/beta_diversity.py @@ -34,7 +34,7 @@ import warnings warnings.filterwarnings('ignore', 'Not using MPI as mpi4py not found') -from numpy import asarray +from numpy import vstack import cogent.maths.distance_transform as distance_transform from biom import load_table @@ -144,8 +144,6 @@ def single_file_beta(input_path, metrics, tree_path, output_dir, otu_table = load_table(input_path) - otumtx = asarray([v for v in otu_table.iter_data(axis='sample')]) - if tree_path: tree = parse_newick(open(tree_path, 'U'), PhyloNode) @@ -173,6 +171,7 @@ def single_file_beta(input_path, metrics, tree_path, output_dir, % (metric, ', '.join(list_known_metrics()))) exit(1) if rowids is None: + otumtx = otu_table.matrix_data.T.toarray() # standard, full way if is_phylogenetic: dissims = metric_f(otumtx, otu_table.ids(axis='observation'), @@ -198,6 +197,7 @@ def single_file_beta(input_path, metrics, tree_path, output_dir, metric_f.__name__ == 'binary_dist_chisq': warnings.warn('dissimilarity ' + metric_f.__name__ + ' is not parallelized, calculating the whole matrix...') + otumtx = otu_table.matrix_data.T.toarray() row_dissims.append(metric_f(otumtx)[rowidx]) else: try: @@ -208,17 +208,23 @@ def single_file_beta(input_path, metrics, tree_path, output_dir, sample_ids = otu_table.ids() observation_ids = otu_table.ids(axis='observation') for i in range(len(sample_ids)): + samp_a = otu_table.data(sample_ids[rowidx]) + samp_b = otu_table.data(sample_ids[i]) + samp_data = vstack([samp_a, samp_b]) + if is_phylogenetic: + dissim = metric_f( - otumtx[[rowidx, i], :], observation_ids, + samp_data, observation_ids, tree, [sample_ids[rowidx], sample_ids[i]], make_subtree=(not full_tree))[0, 1] else: - dissim = metric_f(otumtx[[rowidx, i], :])[0, 1] + dissim = metric_f(samp_data)[0, 1] dissims.append(dissim) row_dissims.append(dissims) else: # do whole row at once + otumtx = otu_table.matrix_data.T.toarray() dissims = row_metric(otumtx, otu_table.ids(axis='observation'), tree, otu_table.ids(), rowid, From 590c6ae6015465765fe165563db1e2bbf5179a53 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Thu, 23 Jul 2015 12:25:27 -0600 Subject: [PATCH 2/5] ENH/BUG? --- qiime/beta_metrics.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/qiime/beta_metrics.py b/qiime/beta_metrics.py index 95eb4e3d29..3d3b563517 100644 --- a/qiime/beta_metrics.py +++ b/qiime/beta_metrics.py @@ -53,12 +53,8 @@ def result(data, taxon_names, tree, sample_names, **kwargs): # length is present in one sample but not (both samples OR NEITHER # SAMPLE). Divide by total branch length of full tree. # G is asymmetric unifrac -dist_unweighted_unifrac = make_unifrac_metric(False, fast_tree.unifrac, True) -dist_unifrac = dist_unweighted_unifrac # default unifrac is just unifrac dist_unweighted_unifrac_full_tree = make_unifrac_metric(False, fast_tree.unnormalized_unifrac, True) -dist_weighted_unifrac = make_unifrac_metric(True, - fast_tree.weighted_unifrac, True) dist_weighted_normalized_unifrac = make_unifrac_metric('correct', fast_tree.weighted_unifrac, True) dist_unifrac_g = make_unifrac_metric(False, fast_tree.G, False) @@ -105,6 +101,7 @@ def result(data, taxon_names, tree, sample_names, False, fast_tree.unifrac, True) + # default unifrac is just unifrac one_sample_unifrac = one_sample_unweighted_unifrac one_sample_unweighted_unifrac_full_tree = make_unifrac_row_metric(False, @@ -117,6 +114,8 @@ def result(data, taxon_names, tree, sample_names, one_sample_unifrac_g_full_tree = make_unifrac_row_metric(False, fast_tree.unnormalized_G, False) +dist_unweighted_unifrac = one_sample_unweighted_unifrac +dist_weighted_unifrac = one_sample_weighted_unifrac def _reorder_unifrac_res(unifrac_res, sample_names_in_desired_order): """ reorder unifrac result From 584af9be775d7fd32e3ccf6512090a2b9a9d02f7 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Thu, 23 Jul 2015 12:34:30 -0600 Subject: [PATCH 3/5] Revert "ENH/BUG?" This reverts commit 590c6ae6015465765fe165563db1e2bbf5179a53. --- qiime/beta_metrics.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/qiime/beta_metrics.py b/qiime/beta_metrics.py index 3d3b563517..95eb4e3d29 100644 --- a/qiime/beta_metrics.py +++ b/qiime/beta_metrics.py @@ -53,8 +53,12 @@ def result(data, taxon_names, tree, sample_names, **kwargs): # length is present in one sample but not (both samples OR NEITHER # SAMPLE). Divide by total branch length of full tree. # G is asymmetric unifrac +dist_unweighted_unifrac = make_unifrac_metric(False, fast_tree.unifrac, True) +dist_unifrac = dist_unweighted_unifrac # default unifrac is just unifrac dist_unweighted_unifrac_full_tree = make_unifrac_metric(False, fast_tree.unnormalized_unifrac, True) +dist_weighted_unifrac = make_unifrac_metric(True, + fast_tree.weighted_unifrac, True) dist_weighted_normalized_unifrac = make_unifrac_metric('correct', fast_tree.weighted_unifrac, True) dist_unifrac_g = make_unifrac_metric(False, fast_tree.G, False) @@ -101,7 +105,6 @@ def result(data, taxon_names, tree, sample_names, False, fast_tree.unifrac, True) - # default unifrac is just unifrac one_sample_unifrac = one_sample_unweighted_unifrac one_sample_unweighted_unifrac_full_tree = make_unifrac_row_metric(False, @@ -114,8 +117,6 @@ def result(data, taxon_names, tree, sample_names, one_sample_unifrac_g_full_tree = make_unifrac_row_metric(False, fast_tree.unnormalized_G, False) -dist_unweighted_unifrac = one_sample_unweighted_unifrac -dist_weighted_unifrac = one_sample_weighted_unifrac def _reorder_unifrac_res(unifrac_res, sample_names_in_desired_order): """ reorder unifrac result From 636891f943e3e5f6c7eb7c063977589ca1b57cd4 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Thu, 23 Jul 2015 12:41:26 -0600 Subject: [PATCH 4/5] Remove confusing possibly unnecessary logic --- qiime/beta_diversity.py | 49 ++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/qiime/beta_diversity.py b/qiime/beta_diversity.py index 0e2a36ba55..f2bcdaa46a 100644 --- a/qiime/beta_diversity.py +++ b/qiime/beta_diversity.py @@ -200,36 +200,25 @@ def single_file_beta(input_path, metrics, tree_path, output_dir, otumtx = otu_table.matrix_data.T.toarray() row_dissims.append(metric_f(otumtx)[rowidx]) else: - try: - row_metric = get_phylogenetic_row_metric(metric) - except AttributeError: - # do element by element - dissims = [] - sample_ids = otu_table.ids() - observation_ids = otu_table.ids(axis='observation') - for i in range(len(sample_ids)): - samp_a = otu_table.data(sample_ids[rowidx]) - samp_b = otu_table.data(sample_ids[i]) - samp_data = vstack([samp_a, samp_b]) - - if is_phylogenetic: - - dissim = metric_f( - samp_data, observation_ids, - tree, [sample_ids[rowidx], sample_ids[i]], - make_subtree=(not full_tree))[0, 1] - else: - dissim = metric_f(samp_data)[0, 1] - dissims.append(dissim) - row_dissims.append(dissims) - else: - # do whole row at once - otumtx = otu_table.matrix_data.T.toarray() - dissims = row_metric(otumtx, - otu_table.ids(axis='observation'), - tree, otu_table.ids(), rowid, - make_subtree=(not full_tree)) - row_dissims.append(dissims) + # do element by element + dissims = [] + sample_ids = otu_table.ids() + observation_ids = otu_table.ids(axis='observation') + for i in range(len(sample_ids)): + samp_a = otu_table.data(sample_ids[rowidx]) + samp_b = otu_table.data(sample_ids[i]) + samp_data = vstack([samp_a, samp_b]) + + if is_phylogenetic: + + dissim = metric_f( + samp_data, observation_ids, + tree, [sample_ids[rowidx], sample_ids[i]], + make_subtree=(not full_tree))[0, 1] + else: + dissim = metric_f(samp_data)[0, 1] + dissims.append(dissim) + row_dissims.append(dissims) with open(outfilepath, 'w') as f: f.write(format_matrix(row_dissims, rowids_list, From b959ecb34ee46752dd8beeeda52914cce7dd20f2 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 29 Jul 2015 17:28:00 -0600 Subject: [PATCH 5/5] Allow fast unifrac setup to indirectly source from a sparse matrix --- qiime/beta_diversity.py | 48 +++++++++++++++++++++++++---------------- qiime/parse.py | 21 +++++++++++++----- 2 files changed, 45 insertions(+), 24 deletions(-) diff --git a/qiime/beta_diversity.py b/qiime/beta_diversity.py index f2bcdaa46a..5db621475b 100644 --- a/qiime/beta_diversity.py +++ b/qiime/beta_diversity.py @@ -200,25 +200,35 @@ def single_file_beta(input_path, metrics, tree_path, output_dir, otumtx = otu_table.matrix_data.T.toarray() row_dissims.append(metric_f(otumtx)[rowidx]) else: - # do element by element - dissims = [] - sample_ids = otu_table.ids() - observation_ids = otu_table.ids(axis='observation') - for i in range(len(sample_ids)): - samp_a = otu_table.data(sample_ids[rowidx]) - samp_b = otu_table.data(sample_ids[i]) - samp_data = vstack([samp_a, samp_b]) - - if is_phylogenetic: - - dissim = metric_f( - samp_data, observation_ids, - tree, [sample_ids[rowidx], sample_ids[i]], - make_subtree=(not full_tree))[0, 1] - else: - dissim = metric_f(samp_data)[0, 1] - dissims.append(dissim) - row_dissims.append(dissims) + try: + row_metric = get_phylogenetic_row_metric(metric) + except AttributeError: + # do element by element + dissims = [] + sample_ids = otu_table.ids() + observation_ids = otu_table.ids(axis='observation') + for i in range(len(sample_ids)): + samp_a = otu_table.data(sample_ids[rowidx]) + samp_b = otu_table.data(sample_ids[i]) + samp_data = vstack([samp_a, samp_b]) + + if is_phylogenetic: + + dissim = metric_f( + samp_data, observation_ids, + tree, [sample_ids[rowidx], sample_ids[i]], + make_subtree=(not full_tree))[0, 1] + else: + dissim = metric_f(samp_data)[0, 1] + dissims.append(dissim) + row_dissims.append(dissims) + else: + # do whole row at once + dissims = row_metric(otu_table, + otu_table.ids(axis='observation'), + tree, otu_table.ids(), rowid, + make_subtree=(not full_tree)) + row_dissims.append(dissims) with open(outfilepath, 'w') as f: f.write(format_matrix(row_dissims, rowids_list, diff --git a/qiime/parse.py b/qiime/parse.py index 2a537f7509..298e73b628 100644 --- a/qiime/parse.py +++ b/qiime/parse.py @@ -22,6 +22,7 @@ from numpy import concatenate, repeat, zeros, nan, asarray from numpy.random import permutation +import biom from skbio.stats.ordination import OrdinationResults from skbio.parse.record_finder import LabeledRecordFinder from cogent.parse.tree import DndParser @@ -566,18 +567,28 @@ def make_envs_dict(abund_mtx, sample_names, taxon_names): sample_names is a list, length = num rows taxon_names is a list, length = num columns """ - num_samples, num_seqs = abund_mtx.shape + if isinstance(abund_mtx, biom.Table): + num_seqs, num_samples = abund_mtx.shape + else: + num_samples, num_seqs = abund_mtx.shape + if (num_samples, num_seqs) != (len(sample_names), len(taxon_names)): raise ValueError( "Shape of matrix %s doesn't match # samples and # taxa (%s and %s)" % (abund_mtx.shape, num_samples, num_seqs)) envs_dict = {} sample_names = asarray(sample_names) - for i, taxon in enumerate(abund_mtx.T): - nonzeros = taxon.nonzero() # this removes zero values to reduce memory - envs_dict[taxon_names[i]] = dict(zip(sample_names[nonzeros], - taxon[nonzeros])) + if isinstance(abund_mtx, biom.Table): + for i, v in enumerate(abund_mtx.matrix_data): + envs_dict[taxon_names[i]] = dict(zip(sample_names[v.indices], + v.data)) + else: + for i, taxon in enumerate(abund_mtx.T): + + nonzeros = taxon.nonzero() # this removes zero values to reduce memory + envs_dict[taxon_names[i]] = dict(zip(sample_names[nonzeros], + taxon[nonzeros])) return envs_dict