adding version 0.0.35
MaksimEkin committed Jan 27, 2025
1 parent ce122e0 commit c339049
Showing 74 changed files with 894 additions and 190 deletions.
4 changes: 2 additions & 2 deletions CITATION.cff
@@ -1,4 +1,4 @@
cff-version: 1.2.0
-version: 0.0.34
+version: 0.0.35
message: "If you use this software, please cite it as below."
authors:
- family-names: Eren
@@ -20,7 +20,7 @@ authors:
- family-names: Alexandrov
given-names: Boian
title: "Tensor Extraction of Latent Features (T-ELF)"
-version: 0.0.34
+version: 0.0.35
url: https://github.com/lanl/T-ELF
doi: 10.5281/zenodo.10257897
date-released: 2023-12-04
17 changes: 16 additions & 1 deletion README.md
@@ -97,20 +97,35 @@ python post_install.py # use the following, for example, for GPU system: <python
| WNMFk | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | NMFk with weighting - used for recommendation system | [Link](examples/WNMFk/WNMFk.ipynb) | :white_check_mark: |
| HNMFk | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | Hierarchical NMFk | [Link](examples/HNMFk/HNMFk.ipynb) | :white_check_mark: |
| BNMFk | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | Boolean NMFk | [Link](examples/BNMFk/BNMFk.ipynb) | :white_check_mark: |
| LMF | :heavy_check_mark: | | :heavy_check_mark: | :heavy_check_mark: | | | Logistic Matrix Factorization | [Link](examples/LMF/LMF.ipynb) | :white_check_mark: |
| SPLIT NMFk | | | | | | | Joint NMFk factorization of multiple data via SPLIT | | :soon: |
| SPLIT Transfer Classifier | | | | | | | Supervised transfer learning method via SPLIT and NMFk | | :soon: |

### TELF.pre_processing

| **Method** | **Multiprocessing** | **HPC** | **Description** | **Example** | **Release Status** |
|:----------:|:-------------------:|:-------------------:|:------------------------------------------------------------------:|:-----------:|:------------------:|
| Vulture | :heavy_check_mark: | :heavy_check_mark: | Advanced text processing tool for cleaning and NLP | [Link](examples/Vulture) | :white_check_mark: |
| Beaver | :heavy_check_mark: | :heavy_check_mark: | Fast matrix and tensor building tool for text mining | [Link](examples/Beaver) | :white_check_mark: |
| iPenguin | | | Online Semantic Scholar information retrieval tool | | :soon: |
| Orca | | | Duplicate author detector for text mining and information retrieval | | :soon: |

### TELF.post_processing

| **Method** | **Description** | **Example** | **Release Status** |
|:----------:|:----------------------------------------------------------:|:-----------:|:------------------:|
| Peacock | Data visualization and generation of actionable statistics | | :soon: |
| Wolf | Graph centrality and ranking tool | | :soon: |
| Fox | Report generation tool for text data | | :soon: |
| SeaLion | Generic report generation tool | | :soon: |

### TELF.applications

| **Method** | **Description** | **Example** | **Release Status** |
|:----------:|:--------------------------------------------------------------------:|:-----------:|:------------------:|
| Cheetah | Fast search by keywords and phrases | [Link](examples/Cheetah) | :white_check_mark: |
| Bunny | Dataset generation tool for documents and their citations/references | | :soon: |
| Termite | Knowledge graph building tool | | :soon: |


## How to Cite T-ELF?
10 changes: 9 additions & 1 deletion TELF/applications/Cheetah/cheetah.py
@@ -833,7 +833,15 @@ def _index_affiliation_country(self, data:dict) -> tuple:

for affil_id, affil_info_dict in curr_info_dict.items():
affil_id = str(affil_id).strip().lower()
-            country = affil_info_dict["country"].strip().lower()
+
+            # Check type and country of affiliation
+            if not isinstance(affil_info_dict, dict):
+                continue
+            country = affil_info_dict.get("country")
+            if country:
+                country = country.strip().lower()
+            else:
+                country = ''

# affiliation
if str(affil_id) in affiliation_index_tmp:
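As an aside, the added guard can be collapsed into a single expression; the following is an equivalent sketch, not code from the commit:

# Equivalent compact form (illustrative only): a missing or None country becomes ''
if not isinstance(affil_info_dict, dict):
    continue
country = (affil_info_dict.get("country") or '').strip().lower()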
242 changes: 242 additions & 0 deletions TELF/factorization/decompositions/lmf.py
@@ -0,0 +1,242 @@
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np

try:
import cupy as cp
except Exception:
cp = None

class LogisticMatrixFactorization:
    def __init__(self, k=30, l2_p=1e-6, epochs=1000, learning_rate=0.001, tolerance=1e-4, device="cpu", random_state=None):
        """
        Logistic Matrix Factorization with a mask.
        Parameters:
        - k: Number of latent factors.
        - l2_p: Regularization parameter (L2 penalty).
        - epochs: Maximum number of training epochs.
        - learning_rate: Learning rate for gradient descent.
        - tolerance: Early stopping criterion based on loss change.
        - device: "cpu", "gpu" (CUDA:0), or a GPU index between 0 and N-1.
        - random_state: Seed for reproducible initialization.
        """
self.k = k
self.l2_p = l2_p
self.epochs = epochs
self.learning_rate = learning_rate
self.tolerance = tolerance
self.np = np
self.random_state = random_state

if device == "cpu":
self.device = device
elif device == "gpu":
self.device = 0
elif isinstance(device, int) and device >= 0:
self.device = device
else:
raise Exception("Device should be 'cpu', 'gpu' (CUDA:0), or a GPU number between 0 and N-1 where N is the number of GPUs.")

if self.device != "cpu" and cp is None:
print("No CUDA found! Using CPU!")
self.device = "cpu"

def fit(self, Xtrain, MASK, plot_loss=True):
"""
Train the logistic matrix factorization model.
Parameters:
- Xtrain: Training interaction matrix (m x n).
        - MASK: Binary mask matrix with 1s for observed entries in Xtrain.
        - plot_loss: If True, plot the training loss curve after fitting.
        Returns:
        - W: Learned row (user) latent feature matrix (m x k).
        - H: Learned column (item) latent feature matrix (k x n).
        - row_bias: Learned row bias vector (m x 1).
        - col_bias: Learned column bias vector (1 x n).
        - losses: Array of per-epoch training loss values.
"""
if self.device != "cpu":
self.np = cp

m, n = Xtrain.shape
W, H, row_bias, col_bias = self._initialize_embeddings(m, n)

if self.device != "cpu":
with cp.cuda.Device(self.device):
losses = cp.zeros(self.epochs)
MASK = cp.array(MASK)
Xtrain = cp.array(Xtrain)
W, H, row_bias, col_bias, losses = self._factorization_routine(W, H, row_bias, col_bias, MASK, Xtrain, losses)

# to CPU
W = cp.asnumpy(W)
H = cp.asnumpy(H)
row_bias = cp.asnumpy(row_bias)
col_bias = cp.asnumpy(col_bias)
MASK = cp.asnumpy(MASK)
Xtrain = cp.asnumpy(Xtrain)
losses = cp.asnumpy(losses)
self.np = np
else:
losses = np.zeros(self.epochs)
W, H, row_bias, col_bias, losses = self._factorization_routine(W, H, row_bias, col_bias, MASK, Xtrain, losses)

# Plot loss
if plot_loss:
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.show()

return W, H, row_bias, col_bias, losses

def predict(self, W, H, row_bias, col_bias):
"""
Predict all entries in the matrix.
Parameters:
- W: Learned row latent feature matrix (m x k).
- H: Learned column latent feature matrix (k x n).
- row_bias: Learned row bias vector (m x 1).
- col_bias: Learned column bias vector (1 x n).
Returns:
- Xtilda: Predicted matrix of interaction probabilities.
"""
        # W @ H gives the (m x n) logits; row_bias (m, 1) and col_bias (1, n)
        # broadcast across columns and rows before the sigmoid maps to (0, 1).
        return self._sigmoid(self.np.dot(W, H) + row_bias + col_bias)

def map_probabilities_to_binary(self, Xtilda, threshold=0.5):
"""
Map probabilities to binary values (0 or 1) using a threshold.
Parameters:
- Xtilda: numpy array, predicted probabilities (values in [0, 1]).
- threshold: float, the cutoff for mapping probabilities to 0 or 1.
Returns:
- Xtilda_binary: numpy array, binary Xtilda (0s and 1s).
"""
return (Xtilda >= threshold).astype(int)

def _initialize_embeddings(self, m, n):
"""
Initialize embeddings (W and H) and biases for rows (users) and columns (items).
"""
np.random.seed(self.random_state)

W = np.random.normal(scale=0.1, size=(m, self.k))
H = np.random.normal(scale=0.1, size=(self.k, n))
row_bias = np.random.normal(scale=0.1, size=(m, 1))
col_bias = np.random.normal(scale=0.1, size=(1, n))

if self.device != "cpu":
with cp.cuda.Device(self.device):
W, H, row_bias, col_bias = cp.array(W), cp.array(H), cp.array(row_bias), cp.array(col_bias)

return W, H, row_bias, col_bias

    def _sigmoid(self, x):
        # np.exp(-x) can overflow (with a RuntimeWarning) for large negative x,
        # but the result then saturates to 0.0, so optimization still proceeds.
        return 1 / (1 + self.np.exp(-x))

def _compute_loss(self, X_train, Xtilda, MASK, W, H):
"""
Compute binary cross-entropy loss.
Parameters:
- X_train: Training interaction matrix.
- Xtilda: Predicted matrix.
- MASK: Binary mask matrix.
Returns:
- loss: Binary cross-entropy loss.
"""
        # The 1e-8 epsilon keeps the logs finite when predictions saturate at 0 or 1.
        loss = -self.np.sum(
            MASK * (X_train * self.np.log(Xtilda + 1e-8) + (1 - X_train) * self.np.log(1 - Xtilda + 1e-8))
        )
loss += self.l2_p * (self.np.sum(W ** 2) + self.np.sum(H ** 2))
return loss


def _factorization_routine(self, W, H, row_bias, col_bias, MASK, Xtrain, losses):
"""
        Performs matrix factorization using full-batch gradient descent (every observed entry contributes to each update) with L2 regularization and optional early stopping.
        This function iteratively optimizes the latent factor matrices (`W` and `H`), row biases, and column biases
        to minimize the reconstruction error between the observed entries in the input matrix (`Xtrain`) and the
        predicted matrix (`Xtilda`). Early stopping triggers when the loss improvement between consecutive epochs
        falls below the configured tolerance.
Parameters:
W (numpy.ndarray):
A matrix of shape `(num_rows, latent_factors)` representing the initial latent factors for rows.
H (numpy.ndarray):
A matrix of shape `(latent_factors, num_columns)` representing the initial latent factors for columns.
row_bias (numpy.ndarray):
A vector of shape `(num_rows, 1)` representing the row-wise biases.
col_bias (numpy.ndarray):
A vector of shape `(1, num_columns)` representing the column-wise biases.
MASK (numpy.ndarray):
A binary mask matrix of the same shape as `Xtrain`, where 1 indicates an observed entry and 0 indicates missing.
Xtrain (numpy.ndarray):
The observed training data matrix of shape `(num_rows, num_columns)`.
losses (list or numpy.ndarray):
A pre-allocated container to store the loss values at each epoch.
Returns:
W (numpy.ndarray):
The updated latent factor matrix for rows after optimization.
H (numpy.ndarray):
The updated latent factor matrix for columns after optimization.
row_bias (numpy.ndarray):
The updated row-wise biases.
col_bias (numpy.ndarray):
The updated column-wise biases.
losses (list or numpy.ndarray):
The updated list or array containing the training loss at each epoch.
Steps:
1. **Prediction**: The predicted matrix (`Xtilda`) is computed using the current `W`, `H`, `row_bias`, and `col_bias`.
2. **Error Calculation**: The reconstruction error is calculated only for observed entries using the binary mask (`MASK`).
3. **Gradient Calculation**: Gradients for `W`, `H`, `row_bias`, and `col_bias` are computed using the observed errors
and L2 regularization.
4. **Parameter Updates**: The latent factor matrices (`W`, `H`) and biases (`row_bias`, `col_bias`) are updated using
the gradients and a specified learning rate.
5. **Loss Calculation**: The reconstruction loss is computed for the current epoch and stored in the `losses` array.
6. **Early Stopping**: If the loss improvement between consecutive epochs falls below a predefined tolerance, the
optimization process terminates early.
Notes:
        - The `_compute_loss` helper combines the masked reconstruction error with the L2 penalty on `W` and `H`.
- Early stopping can significantly reduce computation time when the optimization converges quickly.
- The function updates the input parameters in-place, and the returned values reflect the final state after optimization.
"""
for epoch in tqdm(range(self.epochs)):
# Compute Xtilda (predictions)
Xtilda = self.predict(W, H, row_bias=row_bias, col_bias=col_bias)

# Compute errors for observed entries
errors = MASK * (Xtilda - Xtrain)

# Gradients
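            # For a sigmoid output trained with cross-entropy, d(loss)/d(logits)
            # reduces to (Xtilda - Xtrain), so `errors` above is the masked logit
            # gradient. The l2_p terms below apply weight decay; note the biases
            # are decayed here even though _compute_loss only reports the W and H penalty.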
grad_W = self.np.dot(errors, H.T) + self.l2_p * W
grad_H = self.np.dot(W.T, errors) + self.l2_p * H
grad_row_bias = self.np.sum(errors, axis=1, keepdims=True) + self.l2_p * row_bias
grad_col_bias = self.np.sum(errors, axis=0, keepdims=True) + self.l2_p * col_bias

# Update embeddings and biases
W -= self.learning_rate * grad_W
H -= self.learning_rate * grad_H
row_bias -= self.learning_rate * grad_row_bias
col_bias -= self.learning_rate * grad_col_bias

# Compute training loss
loss = self._compute_loss(Xtrain, Xtilda, MASK, W, H)
losses[epoch] = loss

# Early stopping based on tolerance
if self.tolerance is not None and (epoch > 0 and abs(losses[epoch] - losses[epoch-1]) < self.tolerance):
print(f"Early stopping at epoch {epoch + 1}. Loss change below tolerance.")
break

        # Trim unused entries in case early stopping fired before the final epoch.
        return W, H, row_bias, col_bias, losses[:epoch + 1]
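Taken together, the new module can be exercised end to end. Below is a minimal usage sketch on synthetic data; the import path follows the file location above, while the matrix sizes, mask density, and hyperparameter values are illustrative assumptions rather than settings taken from this commit:

import numpy as np
from TELF.factorization.decompositions.lmf import LogisticMatrixFactorization

rng = np.random.default_rng(42)
X = (rng.random((100, 80)) < 0.3).astype(float)    # synthetic binary interactions
MASK = (rng.random(X.shape) < 0.8).astype(float)   # observe roughly 80% of entries

model = LogisticMatrixFactorization(k=10, epochs=500, learning_rate=0.01,
                                    device="cpu", random_state=42)
W, H, row_bias, col_bias, losses = model.fit(X, MASK, plot_loss=False)

Xtilda = model.predict(W, H, row_bias, col_bias)              # probabilities in (0, 1)
Xbinary = model.map_probabilities_to_binary(Xtilda, threshold=0.5)

held_out = MASK == 0                                          # entries hidden from training
print(f"held-out accuracy: {(Xbinary[held_out] == X[held_out]).mean():.3f}")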

2 changes: 1 addition & 1 deletion TELF/version.py
@@ -1 +1 @@
-__version__ = '0.0.34'
+__version__ = "0.0.35"
6 changes: 3 additions & 3 deletions docs/Beaver.html
@@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

-<title>TELF.pre_processing.Beaver: Fast matrix and tensor building tool &#8212; TELF 0.0.34 documentation</title>
+<title>TELF.pre_processing.Beaver: Fast matrix and tensor building tool &#8212; TELF 0.0.35 documentation</title>



@@ -40,7 +40,7 @@
<link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />

-<script src="_static/documentation_options.js?v=d15f0e27"></script>
+<script src="_static/documentation_options.js?v=6aa38c3a"></script>
<script src="_static/doctools.js?v=9bcbadda"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -126,7 +126,7 @@



-<p class="title logo__title">TELF 0.0.34 documentation</p>
+<p class="title logo__title">TELF 0.0.35 documentation</p>

</a></div>
<div class="sidebar-primary-item">
6 changes: 3 additions & 3 deletions docs/Cheetah.html
@@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

-<title>TELF.applications.Cheetah: Advanced search by keywords and phrases &#8212; TELF 0.0.34 documentation</title>
+<title>TELF.applications.Cheetah: Advanced search by keywords and phrases &#8212; TELF 0.0.35 documentation</title>



@@ -40,7 +40,7 @@
<link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />

-<script src="_static/documentation_options.js?v=d15f0e27"></script>
+<script src="_static/documentation_options.js?v=6aa38c3a"></script>
<script src="_static/doctools.js?v=9bcbadda"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -126,7 +126,7 @@



-<p class="title logo__title">TELF 0.0.34 documentation</p>
+<p class="title logo__title">TELF 0.0.35 documentation</p>

</a></div>
<div class="sidebar-primary-item">
6 changes: 3 additions & 3 deletions docs/HNMFk.html
@@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

-<title>TELF.factorization.HNMFk: Hierarchical Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.34 documentation</title>
+<title>TELF.factorization.HNMFk: Hierarchical Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.35 documentation</title>



@@ -40,7 +40,7 @@
<link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />

-<script src="_static/documentation_options.js?v=d15f0e27"></script>
+<script src="_static/documentation_options.js?v=6aa38c3a"></script>
<script src="_static/doctools.js?v=9bcbadda"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -126,7 +126,7 @@



-<p class="title logo__title">TELF 0.0.34 documentation</p>
+<p class="title logo__title">TELF 0.0.35 documentation</p>

</a></div>
<div class="sidebar-primary-item">
… (diffs for the remaining files in this commit are not shown)
