@Proceedings{ENLSP-2024,
booktitle = {Proceedings of The 4th NeurIPS Efficient Natural Language and Speech Processing Workshop},
name = {NeurIPS Efficient Natural Language and Speech Processing Workshop},
shortname = {ENLSP-IV 2024},
sections = {Training|Model Design \& Architecture|Model Efficiency \& Compression|Inference|Benchmark \& Evaluation|Applications},
editor = {Rezagholizadeh, Mehdi and Passban, Peyman and Samiee, Soheila and Partovi Nia, Vahid and Cheng, Yu and Deng, Yue and Liu, Qun and Chen, Boxing},
volume = {262},
year = {2024},
start = {2024-12-14},
end = {2024-12-14},
published = {2024-12-10},
conference_url = {https://neurips2024-enlsp.github.io/},
address = {Vancouver, British Columbia, Canada}
}
% Training
@InProceedings{samragh2024scaling,
title = {Scaling Smart: Accelerating Large Language Model Pre-Training with Small Model Initialization},
section = {Training},
author = {Samragh, Mohammad and Mirzadeh, Seyed Iman and Alizadeh-Vahid, Keivan and Faghri, Fartash and Cho, Minsik and Nabi, Moin and Naik, Devang and Farajtabar, Mehrdad},
pages = {1-13},
abstract = {The pre-training phase of language models often begins with randomly initialized parameters. With the current trends in scaling models, training their large number of parameters can be extremely slow and costly. In contrast, small language models are less expensive to train, but they often cannot achieve the accuracy of large models. In this paper, we explore an intriguing idea to connect these two different regimes: Can we develop a method to initialize large language models using smaller pre-trained models? Will such initialization bring any benefits in terms of training time and final accuracy? In this paper, we introduce HyperCloning, a method that can expand the parameters of a pre-trained language model to those of a larger model with increased hidden dimensions. Our method ensures that the larger model retains the functionality of the smaller model. As a result, the larger model already inherits the predictive power and accuracy of the smaller model before the training starts. We demonstrate that training such an initialized model results in significant savings in terms of GPU hours required for pre-training large language models. Implementation of HyperCloning is available at https://github.com/apple/ml-hypercloning/tree/main.}
}
@InProceedings{ashkboos2024computational,
title = {Computational Bottlenecks of Training Small-scale Large Language Models},
section = {Training},
author = {Ashkboos, Saleh and Iman Mirzadeh, Seyed and Alizadeh-Vahid, Keivan and Hossein Sekhavat, Mohammad and Nabi, Moin and Farajtabar, Mehrdad and Faghri, Fartash},
pages = {14-21},
abstract = {While large language models (LLMs) dominate the AI landscape, Small-scale large Language Models (SLMs) are gaining attention due to cost and efficiency demands from consumers. However, there is limited research on the training behavior and computational requirements of SLMs. In this study, we explore the computational bottlenecks of training SLMs (up to 2B parameters) by examining the effects of various hyperparameters and configurations, including GPU type, batch size, model size, communication protocol, attention type, and the number of GPUs. We assess these factors on popular cloud services using metrics such as loss per dollar and tokens per second. Our findings aim to support the broader adoption and optimization of language model training for low-resource AI research institutes.}
}
@InProceedings{lawton2024quailora,
title = {{QuAILoRA}: Quantization-Aware Initialization for {LoRA}},
section = {Training},
author = {G Lawton, Neal and Padmakumar, Aishwarya and Gaspers, Judith and FitzGerald, Jack and Kumar, Anoop and Ver Steeg, Greg and Galstyan, Aram},
pages = {22-33},
abstract = {QLoRA reduces the memory-cost of fine-tuning a large language model (LLM) with LoRA by quantizing the base LLM. However, quantization introduces quantization errors that negatively impact model performance after fine-tuning. In this paper we introduce QuAILoRA, a quantization-aware initialization for LoRA that mitigates this negative impact by decreasing quantization errors at initialization. Our method spends a small amount of computational overhead to compute this quantization-aware initialization, without increasing the memory-cost of fine-tuning. We evaluate our method on several causal language modeling and downstream evaluation tasks using several different model sizes and families. We observe that almost all LLMs fine-tuned with QuAILoRA achieve better validation perplexity. When evaluated on downstream tasks, we find that QuAILoRA yields improvements proportional to the negative effect of quantization error. On average, applying QuAILoRA to 4-bit QLoRA models yields 75\% of the validation perplexity decrease and 86\% of the downstream task accuracy increase as doubling the quantization precision to 8-bit, without increasing GPU memory utilization during fine-tuning.}
}
@InProceedings{javaheri2024superpos,
title = {{SuperPos-Prompt}: Enhancing Soft Prompt Tuning of Language Models with Superposition of Multi Token Embeddings},
section = {Training},
author = {Ali Sadraei Javaheri, Mohammad and Asgari, Ehsaneddin and C. McHardy, Alice and R. Rabiee, Hamid},
pages = {34-46},
abstract = {Soft prompt tuning techniques have recently gained traction as an effective strategy for the parameter-efficient tuning of pre-trained language models, particularly minimizing the required adjustment of model parameters. Despite their growing use, achieving optimal tuning with soft prompts, especially with smaller datasets, remains a substantial challenge. This study makes two contributions in this domain: (i) we introduce SuperPos-Prompt, a new reparameterization technique employing the superposition of multiple pre-trained vocabulary embeddings to improve the learning of soft prompts. Our experiments across several GLUE and SuperGLUE benchmarks consistently highlight SuperPos-Prompt's superiority over Residual Prompt tuning, exhibiting an average score increase of +6.4 in T5-Small and +5.0 in T5-Base along with a faster convergence. Remarkably, SuperPos-Prompt occasionally outperforms even full fine-tuning methods. (ii) Additionally, we demonstrate enhanced performance and rapid convergence by omitting dropouts from the frozen network, yielding consistent improvements across various scenarios and tuning methods.}
}
@InProceedings{pasand2024rgp,
title = {{RGP}: Achieving Memory-Efficient Model Fine-tuning Via Randomized Gradient Projection},
section = {Training},
author = {Saheb Pasand, Ali and Bashivan, Pouya},
pages = {47-54},
abstract = {Training and fine-tuning Large Language Models (LLMs) require significant memory due to the substantial growth in the size of weight parameters and optimizer states. While methods like low-rank adaptation (LoRA), which introduce low-rank trainable modules in parallel to frozen pre-trained weights, effectively reduce memory usage, they often fail to preserve the optimization trajectory and are generally less effective for pre-training models. On the other hand, approaches, such as GaLore, that project gradients onto lower-dimensional spaces maintain the training trajectory and perform well in pre-training but suffer from high computational complexity, as they require repeated singular value decomposition on large matrices. In this work, we propose Randomized Gradient Projection (RGP), which outperforms GaLore, the current state-of-the-art in efficient fine-tuning, on the GLUE task suite, while being 74\% faster on average and requiring similar memory.}
}
@InProceedings{khera2024efficient,
title = {Efficient Alignment of Large Language Models via Data Sampling},
section = {Training},
author = {Khera, Amrit and Ghosh, Rajat and Dutta, Debojyoti},
pages = {55-72},
abstract = {Despite the capabilities of Large Language Models (LLMs), the output is not always safe or desirable. Aligning the models to human values is a critical step for the safe adoption of these models. Aligning LLMs employs huge amounts of data, computation, and time. Moreover, curating data with human feedback is expensive and takes time. Recent research depicts the benefit of data engineering in the fine-tuning and pre-training paradigms to bring down such costs. However, alignment differs from the aforementioned paradigms and it is unclear if data efficient alignment is feasible. In this work, we first aim to understand how the performance of LLM alignment scales with data. We find that LLM alignment performance follows an exponential plateau pattern which tapers off after a rapid initial increase. We identify data subsampling as a viable method to reduce resources required for alignment. Further, we propose a methodology for efficient alignment by identifying a small high quality subset thereby reducing the computation and time required by alignment. We evaluate the proposed methodology over multiple datasets and compare the results. We find that the model aligned using our proposed methodology outperforms other sampling methods and performs comparably to the model aligned with the full dataset while using a fraction of the resources.}
}
@InProceedings{azimi2024kd-lora,
title = {{KD-LoRA}: A Hybrid Approach to Efficient Fine-Tuning with LoRA and Knowledge Distillation},
section = {Training},
author = {Azimi, Rambod and Rishav, Rishav and Teichmann, Marek and Ebrahimi Kahou, Samira},
pages = {73-80},
abstract = {Large language models (LLMs) have demonstrated remarkable performance across various downstream tasks. However, the high computational and memory requirements of LLMs are a major bottleneck. To address this, parameter-efficient fine-tuning (PEFT) methods such as low-rank adaptation (LoRA) have been proposed to reduce computational costs while ensuring minimal loss in performance. Additionally, knowledge distillation (KD) has been a popular choice for obtaining compact student models from teacher models. In this work, we present KD-LoRA, a novel fine-tuning method that combines LoRA with KD. Our results demonstrate that KD-LoRA achieves performance comparable to full fine-tuning (FFT) and LoRA while significantly reducing resource requirements. Specifically, KD-LoRA retains 98\% of LoRA’s performance on the GLUE benchmark, while being 40\% more compact. Additionally, KD-LoRA reduces GPU memory usage by 30\% compared to LoRA, while decreasing inference time by 30\% compared to both FFT and LoRA. We evaluate KD-LoRA across three encoder-only models: BERT, RoBERTa, and DeBERTaV3. Code is available at https://github.com/rambodazimi/KD-LoRA.}
}
% Model Design & Architecture
@InProceedings{panda2024dense,
title = {Dense Backpropagation Improves Routing for Sparsely-Gated Mixture-of-Experts},
section = {Model Design \& Architecture},
author = {Panda, Ashwinee and Baherwani, Vatsal and Sarwar, Zain and Therien, Benjamin and Sahu, Sambit and Rawls, Stephen and Chakraborty, Supriyo and Goldstein, Tom},
pages = {81-101},
abstract = {Sparsely-gated Mixture-of-Experts (MoEs) such as Gemini have proven to be more efficient than dense Transformers because they can dynamically activate a subset of their overall parameters by \emph{routing} tokens to selected ``experts'', allowing practitioners to scale up model parameter counts without significantly increasing total compute.
However, current MoE training approaches only update the router with a sparse gradient and suffer from issues such as load imbalance. We propose a new router that can receive a dense gradient update from a sparse forward pass. Our method adds minimal overhead, but improves on the common Top-K routing in both performance and load balance.}
}
@InProceedings{qiao2024vl-mamba,
title = {{VL-Mamba}: Exploring State Space Models for Multimodal Learning},
section = {Model Design \& Architecture},
author = {Qiao, Yanyuan and Yu, Zheng and Zhao, Zijia and Chen, Sihan and Sun, Mingzhen and Guo, Longteng and Wu, Qi and Liu, Jing},
pages = {102-113},
abstract = {Multimodal large language models (MLLMs) have gained considerable attention due to their ability to integrate visual and textual information, enhancing understanding and providing context for complex tasks. While Transformer-based architectures have been the dominant framework for MLLMs, recent studies suggest that state space models (SSMs) like Mamba can achieve competitive or even superior performance. However, no prior research has investigated the potential of SSMs to replace Transformers in multimodal tasks, which are inherently more challenging due to the heterogeneity of visual and language data and the complexities of aligning these modalities. In this paper, we introduce VL-Mamba, the first study to explore the application of state space models in multimodal learning tasks. VL-Mamba leverages a pretrained Mamba language model as its core, and we propose a novel MultiModal Connector (MMC) that incorporates a Vision Selective Scan (VSS) module to improve visual sequence modeling. We empirically explore how to effectively apply the 2D vision selective scan mechanism for multimodal learning and the combinations of different vision encoders and variants of pretrained Mamba language models. Our experiments across multiple multimodal benchmarks demonstrate that VL-Mamba achieves competitive performance against small MLLMs of similar size, and in some cases, surpasses larger models such as the 7B and 13B versions of LLaVA-1.5. These results suggest that state space models have the potential to serve as an alternative to Transformers in multimodal learning tasks.}
}
@InProceedings{liu2024misdmoe,
title = {{MisD-MoE}: A Multimodal Misinformation Detection Framework with Adaptive Feature Selection},
section = {Model Design \& Architecture},
author = {Liu, Moyang and Yan, Kaiying and Liu, Yukun and Fu, Ruibo and Wen, Zhengqi and Liu, Xuefei and Li, Chenxing},
pages = {114-122},
abstract = {The rapid growth of social media has led to the widespread dissemination of misinformation across multiple content forms, including text, images, audio, and video. Compared to unimodal misinformation detection, multimodal misinformation detection benefits from the increased availability of information across multiple modalities. However, these additional features may introduce redundancy, where overlapping or irrelevant information is included, potentially disrupting the feature space and consequently impairing the model's performance. To address the issue, we propose a novel framework, Misinformation Detection Mixture of Experts (MisD-MoE), which employs distinct expert models for each modality and incorporates an adaptive feature selection mechanism using top-k gating and Gumbel-Sigmoid. This approach dynamically filters relevant features, reducing redundancy and improving detection accuracy. Extensive experiments on the FakeSV and FVC-2018 datasets demonstrate that MisD-MoE significantly outperforms state-of-the-art methods, with accuracy improvements of 3.45\% and 3.71\% on the respective datasets compared to baseline models.}
}
@InProceedings{zayats2024zipper,
title = {Zipper: A Multi-Tower Decoder Architecture for Fusing Modalities},
section = {Model Design \& Architecture},
author = {Zayats, Vicky and Chen, Peter and Ferrari, Melissa and Padfield, Dirk},
pages = {123-135},
abstract = {Integrating multiple generative foundation models, especially those trained on different modalities, into something greater than the sum of its parts poses significant challenges. Two key hurdles are the availability of aligned data (concepts that contain similar meaning but are expressed differently in different modalities), and effectively leveraging unimodal representations in cross-domain generative tasks, without compromising their original unimodal capabilities.
We propose Zipper, a multi-tower decoder architecture that addresses these concerns by using cross-attention to flexibly compose multimodal generative models from independently pre-trained unimodal decoders. In our experiments fusing speech and text modalities, we show the proposed architecture performs very competitively in scenarios with limited aligned text-speech data. We also showcase the flexibility of our model to selectively maintain unimodal (e.g., text-to-text generation) generation performance by freezing the corresponding modal tower (e.g. text). In cross-modal tasks such as automatic speech recognition (ASR) where the output modality is text, we show that freezing the text backbone results in negligible performance degradation. In cross-modal tasks such as text-to-speech generation (TTS) where the output modality is speech, we show that using a pre-trained speech backbone results in superior performance to the baseline.}
}
@InProceedings{hajimolahoseini2024is,
title = {Is {3D} Convolution with {5D} Tensors Really Necessary for Video Analysis?},
section = {Model Design \& Architecture},
author = {Hajimolahoseini, Habib and Ahmed, Walid and Wen, Shuangyue and Liu, Yang},
pages = {136-144},
abstract = {In this paper, we present a comprehensive study and propose several novel techniques for implementing 3D convolutional blocks using 2D and/or 1D convolutions with only 4D and/or 3D tensors. Our motivation is that 3D convolutions with 5D tensors are computationally very expensive and they may not be supported by some of the edge devices used in real-time applications such as robots. The existing approaches mitigate this by splitting the 3D kernels into spatial and temporal domains, but they still use 3D convolutions with 5D tensors in their implementations. We resolve this issue by introducing some appropriate 4D/3D tensor reshaping as well as new combination techniques for spatial and temporal splits. The proposed implementation methods show significant improvement both in terms of efficiency and accuracy. The experimental results confirm that the proposed spatio-temporal processing
structure outperforms the original model in terms of speed and accuracy using only 4D tensors with fewer parameters.}
}
@InProceedings{chung2024beyond,
title = {Beyond Parameter Count: Implicit Bias in Soft Mixture of Experts},
section = {Model Design \& Architecture},
author = {Chung, Youngseog and Malik, Dhruv and Schneider, Jeff and Li, Yuanzhi and Singh, Aarti},
pages = {145-164},
abstract = {The traditional viewpoint on Sparse Mixture of Experts (MoE) models is that instead of training a single large expert, which is computationally expensive, we can train many small experts. The hope is that if the total parameter count of the small experts equals that of the singular large expert, then we retain the representation power of the large expert while gaining computational tractability and promoting expert specialization. The recently introduced Soft MoE replaces the Sparse MoE's discrete routing mechanism with a differentiable gating function that smoothly mixes tokens. While this smooth gating function successfully mitigates the various training instabilities associated with Sparse MoE, it is unclear whether it induces implicit biases that affect Soft MoE's representation power or potential for expert specialization. We prove that Soft MoE with a single arbitrarily powerful expert cannot represent simple convex functions. This justifies that Soft MoE's success cannot be explained by the traditional viewpoint of many small experts collectively mimicking the representation power of a single large expert, and that multiple experts are actually necessary to achieve good representation power (even for a fixed total parameter count). Continuing along this line of investigation, we introduce a notion of expert specialization for Soft MoE, and while varying the number of experts yet fixing the total parameter count, we consider the following (computationally intractable) task. Given any input, how can we discover the expert subset that is specialized to predict this input's label? We empirically show that when there are many small experts, the architecture is implicitly biased in a fashion that allows us to efficiently approximate the specialized expert subset. Our method can be easily implemented to potentially reduce computation during inference.}
}
@InProceedings{sarkar2024revisiting,
title = {Revisiting {SMoE} Language Models by Evaluating Inefficiencies with Task Specific Expert Pruning},
section = {Model Design \& Architecture},
author = {Sarkar, Soumajyoti and Lausen, Leonard and Cevher, Volkan and Brox, Thomas and Zha, Sheng and Karypis, George},
pages = {165-181},
abstract = {Sparse Mixture of Expert (SMoE) models have emerged as a scalable alternative to dense models in language modeling. These models use conditionally activated feedforward subnetworks in transformer blocks, allowing for a separation between total model parameters and per-example computation. However, large token-routed SMoE models face a significant challenge: during inference, the entire model must be used for a sequence or a batch, resulting in high latencies in a distributed setting that offsets the advantages of per-token sparse activation.
Our research explores task-specific model pruning to inform decisions about designing SMoE architectures, mainly modulating the choice of expert counts in pretraining. We investigate whether such pruned models offer advantages over smaller SMoE models trained from scratch, when evaluating and comparing them individually on tasks. To that end, we introduce an adaptive task-aware pruning technique {\tt UNCURL} to reduce the number of experts per MoE layer in an offline manner post-training.
Our findings reveal a threshold pruning factor for the reduction that depends on the number of experts used in pretraining, above which, the reduction starts to degrade model performance. These insights contribute to our understanding of model design choices when pretraining with SMoE architectures, particularly useful when considering task-specific inference optimization for later stages.}
}
@InProceedings{sarwar2024structmoe,
title = {{StructMoE}: Structured Mixture of Experts Using Low Rank Experts},
section = {Model Design \& Architecture},
author = {Sarwar, Zain and Panda, Ashwinee and Th\'erien, Benjamin and Rawls, Stephen and Das, Anirban and Balasubramaniam, Kartik and Kapusuzoglu, Berkcan and Zhang, Shixiong and Sahu, Sambit and Naphade, Milind and Chakraborty, Supriyo},
pages = {182-193},
abstract = {We introduce StructMoE, a method to scale MoE architectures by augmenting experts with dynamic capacity using structured matrices we call Low Rank Experts (LoRE). These LoREs are selected on a per-expert and per-token basis using a secondary router specific to every expert and are entangled with the main expert in the up-projection phase of the expert before the activation function. Empirically, we find this approach to outperform an MoE baseline in terms of loss on a held out validation set.}
}
@InProceedings{doubov2024sparse,
title = {Sparse Upcycling: Inference Inefficient Finetuning},
section = {Model Design \& Architecture},
author = {Doubov, Sasha and Sardana, Nikhil and Chiley, Vitaliy},
pages = {194-205},
abstract = {Small, highly trained, open-source LLMs are widely used due to their inference efficiency, but further improving their quality remains a challenge. Sparse upcycling is a promising approach that transforms a pretrained dense model into a Mixture-of-Experts (MoE) architecture, increasing the model’s parameter count and potential quality. In this work, we compare the effectiveness of sparse upcycling against continued pretraining (CPT) across different model sizes, FLOP budgets, and pretraining durations. Our experiments show that sparse upcycling can achieve better quality, with improvements of over 20\% relative to CPT in certain scenarios. However, this comes with a significant inference cost, leading to 40\% slowdowns in high-demand inference settings for larger models. These results highlight the trade-off between model quality and inference efficiency, offering insights for practitioners seeking to balance performance with practical deployment costs.}
}
% Model Efficiency & Compression
@InProceedings{chua2024post-training,
title = {Post-Training Statistical Calibration for Higher Activation Sparsity},
section = {Model Efficiency \& Compression},
author = {Seng Chua, Vui and Pan, Yujie and Jain, Nilesh},
pages = {206-221},
abstract = {We present Statistical Calibrated Activation Pruning (SCAP), a post-training activation pruning framework that (1) generalizes sparsification by input activations of Fully-Connected layers for generic and flexible application across Transformers, and (2) features a simple Mode-Centering technique to pre-calibrate activation distributions for maximizing post-training sparsity. Our results demonstrate robust Pareto efficiency compared to prior methods, translating to a 1.5× additional LLM decoding speedup against CATS[12] at iso model quality. SCAP effectiveness is empirically verified across a wide range of models, including recent Transformer Decoders, MoE, Mamba2, Encoding Transformer, and pre-quantized models, highlighting its practicality and scalability. The code is available at https://github.com/IntelLabs/SCAP.}
}
@InProceedings{hajimolahoseini2024accelerating,
title = {Accelerating the Low-Rank Decomposed Models},
section = {Model Efficiency \& Compression},
author = {Hajimolahoseini, Habib and Ahmed, Walid and Wen, Shuangyue and Liu, Yang},
pages = {222-231},
abstract = {Tensor decomposition is a mathematically supported technique for data compression. It consists of applying some form of low-rank decomposition to the tensors or matrices in order to reduce the redundancy of the data.
However, it is not a popular technique for compressing AI models due to the high number of new layers added to the architecture after decomposition. Although the number of parameters could shrink significantly, it could result in the model being more than twice as deep, which could add some latency to training or inference. In this paper, we present a comprehensive study of how to modify the low-rank decomposition technique in AI models so that we benefit from both high accuracy and low memory consumption as well as faster training and inference.}
}
@InProceedings{vasudev2024the,
title = {The {EarlyBird} Gets the {WORM}: Heuristically Accelerating {EarlyBird} Convergence},
section = {Model Efficiency \& Compression},
author = {G Vasudev, Adithya},
pages = {232-240},
abstract = {The Lottery Ticket hypothesis proposes that ideal, sparse subnetworks, called lottery tickets, exist in untrained dense neural networks. The Early Bird hypothesis proposes an efficient algorithm to find these winning lottery tickets in convolutional neural networks, using the novel concept of distance between subnetworks to detect convergence in the subnetworks of a model. However, this approach overlooks unchanging groups of unimportant neurons near the search's end. We propose WORM, a method that exploits these static groups by truncating their gradients, forcing the model to rely on other neurons. Experiments show WORM achieves faster ticket identification during training on convolutional neural networks, despite the additional computational overhead, when compared to EarlyBird Search. Additionally, WORM-pruned models lose less accuracy during pruning and recover accuracy faster, improving the robustness of a given model. Furthermore, WORM is also able to generalize the Early Bird hypothesis reasonably well to larger models, such as transformers, displaying its flexibility to adapt to more complex architectures.}
}
@InProceedings{sharify2024post,
title = {Post Training Quantization of Large Language Models with Microscaling Formats},
section = {Model Efficiency \& Compression},
author = {Sharify, Sayeh and Saxena, Utkarsh and Xu, Zifei and Yazar, Wanzin and Soloveychik, Ilya and Wang, Xin},
pages = {241-258},
abstract = {Large Language Models (LLMs) have distinguished themselves with outstanding performance in complex language modeling tasks, yet they come with significant computational and storage challenges. This paper explores the potential of quantization to mitigate these challenges. We systematically study the combined application of three well-known post-training techniques, SmoothQuant, AWQ, and GPTQ, and provide a comprehensive analysis of their interactions and implications for advancing LLM quantization. We enhance the versatility of these methods by enabling quantization to microscaling (MX) formats, extending the applicability of these PTQ algorithms beyond their original fixed-point format targets. We show that combining different PTQ methods enables us to quantize models to 4-bit weights and 8-bit activations using the MXINT format with negligible accuracy loss compared to the uncompressed baseline. }
}
@InProceedings{rajabzadeh2024echoatt,
title = {{EchoAtt}: Attend, Copy, then Adjust for More Efficient Large Language Models},
section = {Model Efficiency \& Compression},
author = {Rajabzadeh, Hossein and Jafari, Aref and Sharma, Aman and Jami, Benyamin and Ju Hj Kwon, Hyock and Ghodsi, Ali and Chen, Boxing and Rezagholizadeh, Mehdi},
pages = {259-269},
abstract = {Large Language Models (LLMs), with their increasing depth and number of parameters, have demonstrated outstanding performance across a variety of natural language processing tasks. However, this growth in scale leads to increased computational demands, particularly during inference and fine-tuning. To address these challenges, we introduce \textbf{EchoAtt}, a novel framework aimed at optimizing transformer-based models by analyzing and leveraging the similarity of attention patterns across layers. Our analysis reveals that many inner layers in LLMs, especially larger ones, exhibit highly similar attention matrices. By exploiting this similarity, \textbf{EchoAtt} enables the sharing of attention matrices in less critical layers, significantly reducing computational requirements without compromising performance. We incorporate this approach within a knowledge distillation setup, where a pre-trained teacher model guides the training of a smaller student model. The student model selectively shares attention matrices in layers with high similarity while inheriting key parameters from the teacher. Our best results with TinyLLaMA-1.1B demonstrate that \textbf{EchoAtt} improves inference speed by 15\%, training speed by 25\%, and reduces the number of parameters by approximately 4\%, all while improving zero-shot performance. These findings highlight the potential of attention matrix sharing to enhance the efficiency of LLMs, making them more practical for real-time and resource-limited applications.}
}
@InProceedings{xu2024scaling,
title = {Scaling laws for post-training quantized large language models},
section = {Model Efficiency \& Compression},
author = {Xu, Zifei and Y Lan, Alexander and Yazar, Wanzin and Webb, Tristan and Sharify, Sayeh and Wang, Xin},
pages = {270-285},
abstract = {Generalization abilities of well-trained large language models (LLMs) are known to scale predictably as a function of model size. In contrast to the existence of practical scaling laws governing pre-training, the quality of LLMs after post-training compression remains highly unpredictable, often requiring case-by-case validation in practice. In this work, we attempted to close this gap for post-training weight quantization of LLMs by conducting a systematic empirical study on multiple LLM families quantized to numerous low-precision tensor data types using popular weight quantization techniques. We identified key scaling factors pertaining to characteristics of the local loss landscape, based on which the performance of quantized LLMs can be reasonably well predicted by a statistical model. }
}
@InProceedings{yang2024partially,
title = {Partially Shared Query-Key for Lightweight Language Models},
section = {Model Efficiency \& Compression},
author = {Yang, Kai and Partovi Nia, Vahid and Chen, Boxing and Asgharian, Masoud},
pages = {286-291},
abstract = {Lightweight language models, such as TinyBERT 14.5M, have emerged as a critical area of research because of their implementation on resource-constrained hardware. These transformer models have a significantly smaller parameter count and reduced memory and computational requirements. These features make such models highly suitable for deployment on small devices. We explore the concept of parameter sharing between the key and query weight matrices of a transformer model. Full query-key sharing, which has already been proposed in the literature, introduces a fully-quadratic attention matrix, oversimplifies directional dependencies, and degrades pre-training loss. In contrast, partial parameter sharing balances complexity reduction and performance retention. Partial parameter sharing effectively addresses over-fitting while maintaining strong performance even with a high degree of shared parameters up to 95\%. This provides a promising strategy for enhancing language models, specifically targeting small models.}
}
% Inference
@InProceedings{wu2024snakes,
title = {Snakes and Ladders: Accelerating {SSM} Inference with Speculative Decoding},
section = {Inference},
author = {Wu, Yangchao and Dukler, Yonatan and Trager, Matthew and Achille, Alessandro and Xia, Wei and Soatto, Stefano},
pages = {292-304},
abstract = {Speculative decoding is a method for accelerating inference in large language models (LLMs) by predicting multiple tokens using a smaller `draft model' and validating them against the larger `base model.' If a draft token is inconsistent with what the base model would have generated, speculative decoding `backtracks' to the last consistent token before resuming generation. This is straightforward in autoregressive Transformer architectures since their state is a sliding window of past tokens. However, their baseline inference complexity is quadratic in the number of input tokens. State Space Models (SSMs) have linear inference complexity, but they maintain a separate Markov state that makes backtracking non-trivial. We propose two methods to perform speculative decoding in SSMs: ``Joint Attainment and Advancement'' and ``Activation Replay.'' Both methods utilize idle computational resources to speculate and verify multiple tokens, allowing us to produce 6 tokens for 1.47$\times$ the cost of one, corresponding to an average 1.82$\times$ wall-clock speed-up on three different benchmarks using a simple $n$-gram for drafting. Furthermore, as model size increases, relative overhead of speculation and verification decreases: Scaling from 1.3B parameters to 13B reduces relative overhead from 1.98$\times$ to 1.22$\times$. Unlike Transformers, speculative decoding in SSMs can be easily applied to batches of sequences, allowing dynamic allocation of resources to fill gaps in compute utilization and thereby improving efficiency and throughput with variable inference traffic.}
}
@InProceedings{kang2024gear,
title = {{GEAR}: An Efficient Error Reduction Framework for {KV} Cache Compression in {LLM} Inference},
section = {Inference},
author = {Kang, Hao and Zhang, Qingru and Kundu, Souvik and Jeong, Geonhwa and Liu, Zaoxing and Krishna, Tushar and Zhao, Tuo},
pages = {305-321},
abstract = {Key-value (KV) caching has become the de-facto technique to accelerate generation speed for large language model (LLM) inference. However, the growing cache demand with increasing sequence length has transformed LLM inference into a memory-bound problem, significantly constraining the system throughput. Existing methods rely on dropping unimportant tokens or quantizing entries group-wise. Such methods, however, often incur high approximation errors to represent the compressed matrices. The autoregressive decoding process further compounds the error of each step, resulting in critical deviation in model generation and deterioration of performance. To tackle this challenge, we propose GEAR, an efficient error reduction framework that augments a quantization scheme with two error reduction components and achieves near-lossless performance at high compression ratios. GEAR first applies quantization to the majority of entries of similar magnitudes to ultra-low precision. It then employs a low-rank matrix to approximate the quantization error, and a sparse matrix to remedy individual errors from outlier entries. By adeptly integrating three techniques, GEAR is able to fully exploit their synergistic potentials. Our experiments show that GEAR can maintain similar accuracy to that of FP16 cache with improvements of up to 24.42\% over the SOTA baselines at 2-bit compression. Additionally, compared to LLM inference with FP16 KV cache, GEAR can reduce peak memory by up to $2.39\times$, bringing $2.1\times\sim 5.07\times$ throughput improvement. Our code will be publicly available.}
}
@InProceedings{stewart2024the,
title = {The {N-Grammys}: Accelerating Autoregressive Inference with Learning-Free Batched Speculation},
section = {Inference},
author = {Stewart, Lawrence and Trager, Matthew and Gonugondla, Sujan and Soatto, Stefano},
pages = {322-335},
abstract = {Speculative decoding aims to speed up autoregressive generation of a language model by verifying in parallel the tokens generated by a smaller draft model.
In this work, we explore the effectiveness of learning-free, negligible-cost draft strategies, namely $N$-grams obtained from the model weights and the context. While the predicted next token of the base model is rarely the top prediction of these simple strategies, we observe that it is often within their top-$k$ predictions for small $k$. Based on this, we show that combinations of simple strategies can achieve significant inference speedups over different tasks. The overall performance is comparable to more complex methods, yet does not require expensive preprocessing or modification of the base model, and allows for seamless `plug-and-play' integration into pipelines.}
}
@InProceedings{timor2024distributed,
title = {Distributed Speculative Inference of Large Language Models is Provably Faster},
section = {Inference},
author = {Timor, Nadav and Mamou, Jonathan and Pereg, Oren and Berchansky, Moshe and Korat, Daniel and Wasserblat, Moshe and Galanti, Tomer and Gordon, Michal and Harel, David},
pages = {336-354},
abstract = {Accelerating the inference of large language models (LLMs) is an important challenge in artificial intelligence. This paper introduces Distributed Speculative Inference (DSI), a novel distributed inference algorithm that is provably faster than speculative inference (SI) [leviathan2023fast, chen2023accelerating, miao2023specinfer] and traditional autoregressive inference (non-SI). Like other SI algorithms, DSI works on frozen LLMs, requiring no training or architectural modifications, and it preserves the target distribution. Prior studies on SI have demonstrated empirical speedups (compared to non-SI) but require fast and accurate drafters, which are often unavailable in practice. We identify a gap where SI can be slower than non-SI given slower or less accurate drafters. We close this gap by proving that DSI is faster than both SI and non-SI—given any drafters. DSI introduces a novel type of task parallelism called Speculation Parallelism (SP), which orchestrates target and drafter instances to overlap in time, creating a new foundational tradeoff between computational resources and latency. DSI is not only faster than SI but also supports LLMs that cannot be accelerated with SI. Our simulations show speedups of off-the-shelf LLMs in realistic single-node settings where DSI is 1.29-1.92x faster than SI. Our code is open-sourced: github.com/keyboardAnt/distributed-speculative-inference}
}
@InProceedings{agrawal2024adaedl,
title = {{AdaEDL}: Early Draft Stopping for Speculative Decoding of Large Language Models via an Entropy-based Lower Bound on Token Acceptance Probability},
section = {Inference},
author = {Agrawal, Sudhanshu and Jeon, Wonseok and Lee, Mingu},
pages = {355-369},
abstract = {Speculative decoding is a powerful technique that attempts to circumvent the autoregressive constraint of modern Large Language Models (LLMs). The aim of speculative decoding techniques is to improve the average inference time of a large, target model without sacrificing its accuracy, by using a more efficient draft model to propose draft tokens which are then verified in parallel. The number of draft tokens produced in each drafting round is referred to as the draft length and is often a static hyperparameter chosen based on the acceptance rate statistics of the draft tokens. However, setting a static draft length can negatively impact performance, especially in scenarios where drafting is expensive and there is a high variance in the number of tokens accepted. Adaptive Entropy-based Draft Length (AdaEDL) is a simple, training- and parameter-free criterion that allows for early stopping of the token drafting process by approximating a lower bound on the expected acceptance probability of the drafted token based on the currently observed entropy of the drafted logits. We show that AdaEDL consistently outperforms static draft-length speculative decoding by 10\%-57\% as well as other training-free draft-stopping techniques by up to 10\% in a variety of settings and datasets. At the same time, we show that AdaEDL is more robust than these techniques and preserves performance in high-sampling-temperature scenarios. Since it is training-free, in contrast to techniques that rely on the training of dataset-specific draft-stopping predictors, AdaEDL can seamlessly be integrated into a variety of pre-existing LLM systems.}
}
@InProceedings{rajput2024inference-friendly,
title = {Inference-Friendly Models With {MixAttention}},
section = {Inference},
author = {Rajput, Shashank and Sheng, Ying and Owen, Sean and Chiley, Vitaliy},
pages = {370-381},
abstract = {The size of the key-value (KV) cache plays a critical role in determining both the maximum context length and the number of concurrent requests supported during inference in modern language models. The KV cache size grows proportionally with the number of attention heads and the tokens processed, leading to increased memory consumption and slower inference for long inputs. In this work, we explore the use of MixAttention, a model architecture modification closely related to a blog published by Character.AI. MixAttention combines sliding window attention, where only a small subset of recent tokens is stored in the KV cache, with KV cache sharing across layers. Our experiments demonstrate that MixAttention significantly reduces memory usage and improves inference speed without sacrificing model performance in both short and long-context tasks. We also explore various configurations of this architecture, identifying those that maintain quality across evaluation metrics while optimizing resource efficiency.}
}
@InProceedings{lu2024improving,
title = {Improving Multi-candidate Speculative Decoding},
section = {Inference},
author = {Lu, XiaoFan and Zeng, Yixiao and Levorato, Marco and Ma, FeiYang and Yu, ZiXu},
pages = {382-394},
abstract = {Speculative Decoding (SD) is a technique to accelerate the inference of Large Language Models (LLMs) by using a lower complexity draft model to propose candidate tokens verified by a larger target model. To further improve efficiency, Multi-Candidate Speculative Decoding (MCSD) improves upon this by sampling multiple candidate tokens from the draft model at each step and verifying them in parallel, thus increasing the chances of accepting a token and reducing generation time. Existing MCSD methods rely on the draft model to initialize the multi-candidate sequences and use static length and tree attention structure for draft generation. However, such an approach suffers from the draft and target model's output distribution differences, especially in a dynamic generation context. In this work, we introduce a new version of MCSD that includes a target model initialized multi-candidate generation, a dynamic sliced topology-aware causal mask for dynamic length adjustment, and decision models to optimize early stopping. We experimented with our method on Llama 2-7B and its variants and observed a maximum 27.5\% speedup compared to our MCSD baseline across three benchmarks with Llama 2-7B as the target model and JackFram 68M as the draft model. Additionally, we evaluate the effects of using the target model initialized multi-candidate process with different draft models on output quality.}
}
@InProceedings{bhendawade2024speculative,
title = {Speculative Streaming: Fast {LLM} Inference without Auxiliary Models},
section = {Inference},
author = {Bhendawade, Nikhil and Belousova, Irina and Fu, Qichen and Mason, Henry and Rastegari, Mohammad and Najibi, Mahyar},
pages = {395-413},
abstract = {Speculative decoding is a prominent technique to accelerate large language model inference by leveraging predictions from an auxiliary draft model. While effective, in application-specific settings, it often involves fine-tuning both draft and target models to achieve high acceptance rates. As the number of downstream tasks grows, draft models add significant complexity to inference systems. Recently, several single-model architectures, viz. Medusa, have been proposed to speculate tokens in a non-autoregressive manner; however, their effectiveness is limited due to the lack of dependency between speculated tokens. We introduce a novel speculative decoding method that integrates drafting within the target model by using Multi-stream attention and incorporates future token planning into the supervised fine-tuning objective. To the best of our knowledge, it is the first parameter-efficient approach that scales well with the number of downstream tasks while improving downstream metrics. Speculative Streaming speeds up decoding by 1.9 - 3X in a diverse set of tasks, such as Summarization, Structured Queries, and Meaning Representation, while improving generation quality and using 10000X fewer extra parameters than alternative architectures, making it ideal for resource-constrained devices. Our approach can also be effectively deployed in lossless settings for generic chatbot applications that do not necessitate fine-tuning. In such setups, we achieve 2.9 - 3.2X speedup while maintaining the integrity of the base model's output.}
}
@InProceedings{kimhi2024hysteresis,
title = {Hysteresis Activation Function for Efficient Inference},
section = {Inference},
author = {Kimhi, Moshe and Kashani, Idan and Baskin, Chaim and Mendelson, Avi},
pages = {414-422},
abstract = {The widely used ReLU is favored for its hardware efficiency yet suffers from issues such as the ``dying ReLU'' problem, where during training, neurons fail to activate and constantly remain at zero, as highlighted by Lu et al.~\citep{lu2018collapse}. Traditional approaches to mitigate this issue often introduce more complex and less hardware-friendly activation functions. In this work, we propose a Hysteresis Rectified Linear Unit (HeLU), an efficient activation function designed to address the ``dying ReLU'' problem with minimal complexity. Unlike traditional activation functions with fixed thresholds for training and inference, HeLU employs a variable threshold that refines the backpropagation. This refined mechanism allows simpler activation functions to achieve competitive performance comparable to their more complex counterparts without introducing unnecessary complexity or requiring inductive biases. Empirical evaluations demonstrate that HeLU enhances model generalization across diverse datasets, offering a promising solution for efficient and effective inference suitable for a wide range of neural network architectures.}
}
@InProceedings{sharma2024efficiently,
title = {Efficiently Dispatching Flash Attention For Partially Filled Attention Masks},
section = {Inference},
author = {Sharma, Agniv and A. Geiping, Jonas},
pages = {423-442},
abstract = {Transformers are widely used across various applications, many of which yield sparse or partially filled attention matrices. Examples include attention masks designed to reduce the quadratic complexity of attention, sequence packing techniques, and recent innovations like tree masking for fast validation in MEDUSA. Despite the inherent sparsity in these matrices, the state-of-the-art algorithm Flash Attention still processes them with quadratic complexity as though they were dense. In this paper, we introduce \textbf{Binary Block Masking}, a highly efficient modification that enhances Flash Attention by making it mask-aware. We further propose two optimizations: one tailored for masks with contiguous non-zero patterns and another for extremely sparse masks. Our experiments on attention masks derived from real-world scenarios demonstrate up to a 9x runtime improvement. The implementation will be publicly released to foster further research and application.}
}
@InProceedings{alizadeh-vahid2024duo-llm,
title = {{Duo-LLM}: A Framework for Studying Adaptive Computation in Large Language Models},
section = {Inference},
author = {Alizadeh-Vahid, Keivan and Iman Mirzadeh, Seyed and Shahrkokhi, Hooman and Belenko, Dmitry and Sun, Frank and Cho, Minsik and Hossein Sekhavat, Mohammad and Nabi, Moin and Farajtabar, Mehrdad},
pages = {443-455},
abstract = {Large Language Models (LLMs) typically generate outputs token by token using a fixed compute budget, leading to inefficient resource utilization. To address this shortcoming, recent advancements in mixture of expert (MoE) models, speculative decoding, and early exit strategies leverage the insight that computational demands can vary significantly based on the complexity and nature of the input. However, identifying optimal routing patterns for dynamic execution remains an open challenge, limiting the full potential of these adaptive methods. To address this need, we study adaptive computation in LLMs more systematically. We propose a novel framework that integrates smaller auxiliary modules within each Feed-Forward Network layer of the LLM. This design enables dynamic routing of tokens based on task complexity: tokens can be processed by either the small or big modules at each layer, or even bypass certain layers entirely. This allows us to introduce a novel notion of a token's difficulty, defined by its potential to benefit from additional computational resources. Importantly, by employing oracles to identify optimal patterns of adaptive computations, we gain valuable insights into the internal workings of LLMs and the routing processes in a simplified heterogeneous MoE setup. We show that trained routers operate differently from oracles and often yield suboptimal solutions. Notably, activating a large module in just one layer outperforms models that use large modules across all layers, underscoring the gap between practical implementations of routing in MoE models and theoretical optima for adaptive computation.}
}
@InProceedings{mamou2024dynamic,
title = {Dynamic Speculation Lookahead Accelerates Speculative Decoding of Large Language Models},
section = {Inference},
author = {Mamou, Jonathan and Pereg, Oren and Korat, Daniel and Berchansky, Moshe and Timor, Nadav and Wasserblat, Moshe and Schwartz, Roy},
pages = {456-467},
abstract = {Speculative decoding is commonly used for reducing the inference latency of large language models. Its effectiveness depends highly on the speculation lookahead (SL): the number of tokens generated by the draft model at each iteration. In this work we show that the common practice of using the same SL for all iterations (static SL) is suboptimal. We introduce DISCO (DynamIc SpeCulation lookahead Optimization), a novel method for dynamically selecting the SL. Our experiments with four datasets show that DISCO reaches an average speedup of 10\% compared to the best static SL baseline, while generating the exact same text.}
}
@InProceedings{wang2024cskv,
title = {{CSKV}: Training-Efficient Channel Shrinking for {KV} Cache in Long-Context Scenarios},
section = {Inference},
author = {Wang, Luning and Li, Shiyao and Ning, Xuefei and Yuan, Zhihang and Yan, Shengen and Dai, Guohao and Wang, Yu},
pages = {468-484},
abstract = {Large Language Models (LLMs) have been widely adopted to process long-context tasks. However, the large memory overhead of the key-value (KV) cache poses significant challenges in long-context scenarios. Existing training-free KV cache compression methods typically focus on quantization and token pruning, which have compression limits, and excessive sparsity can lead to severe performance degradation. Other methods design new architectures with less KV overhead but require significant training overhead. To address the above two drawbacks, we further explore the redundancy in the channel dimension and apply an architecture-level design with minor training costs. Therefore, we introduce CSKV, a training-efficient Channel Shrinking technique for KV cache compression: (1) We first analyze the singular value distribution of the KV cache, revealing significant redundancy and compression potential along the channel dimension. Based on this observation, we propose using low-rank decomposition for key and value layers and storing the low-dimension features. (2) To preserve model performance, we introduce a bi-branch KV cache, including a window-based full-precision KV cache and a low-precision compressed KV cache. (3) To reduce the training costs, we minimize the layer-wise reconstruction loss for the compressed KV cache instead of retraining the entire LLMs. Extensive experiments show that CSKV can reduce the memory overhead of the KV cache by 80\% while maintaining the model's long-context capability. Moreover, we show that our method can be seamlessly combined with quantization to further reduce the memory overhead, achieving a compression ratio of up to 95\%. Code is available at https://github.com/wln20/CSKV.}
}
@InProceedings{kumar2024residual,
title = {Residual vector quantization for {KV} cache compression in large language model},
section = {Inference},
author = {Kumar, Ankur},
pages = {485-490},
abstract = {KV cache compression methods have mainly relied on scalar quantization techniques to reduce the memory requirements during decoding. In this work, we apply residual vector quantization, which has been widely used for high fidelity audio compression, to compress the KV cache in large language models (LLMs). We adapt the standard recipe with minimal changes to compress the output of any key or value projection matrix in a pretrained LLM: we scale the vector by its standard deviation, divide channels into groups and then quantize each group with the same residual vector quantizer. We learn the codebook using an exponential moving average, and there are no other learnable parameters, including the input and output projections normally used in a vector quantization setup. We find that a residual depth of 8 recovers most of the performance of the unquantized model. We also find that grouping non-contiguous channels together works better than grouping contiguous channels for compressing the key matrix, and the method further benefits from lightweight finetuning of the LLM together with the quantization. Overall, the proposed technique is competitive with existing quantization methods while being much simpler and results in ~5.5x compression compared to half precision.}
}
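% A short numpy sketch of residual vector quantization applied to one key/value
% vector, following the recipe described in the abstract (scale by the standard
% deviation, then quantize the residual at each depth). Codebook learning via
% exponential moving average and the channel grouping are omitted; codebooks are
% assumed to be given.
%
% import numpy as np
%
% def rvq_encode(x, codebooks):
%     # x: (d,) vector; codebooks: list of (codebook_size, d) arrays (depth 8 in the paper)
%     scale = x.std() + 1e-6
%     residual = x / scale
%     codes = []
%     for cb in codebooks:
%         idx = int(np.argmin(((residual[None, :] - cb) ** 2).sum(axis=1)))
%         codes.append(idx)
%         residual = residual - cb[idx]   # quantize what the previous depths missed
%     return codes, scale
%
% def rvq_decode(codes, codebooks, scale):
%     return sum(cb[i] for cb, i in zip(codebooks, codes)) * scale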
% Benchmark & Evaluation
@InProceedings{pieler2024rephrasing,
title = {Rephrasing natural text data with different languages and quality levels for Large Language Model pre-training},
section = {Benchmark \& Evaluation},
author = {Pieler, Michael and Bellagente, Marco and Teufel, Hannah and Phung, Duy and Cooper, Nathan and Tow, Jonathan and Rocha, Paulo and Adithyan, Reshinth and Alyafeai, Zaid and Pinnaparaju, Nikhil and Zhuravinskyi, Maksym and Riquelme, Carlos},
pages = {491-511},
abstract = {Recently published work on rephrasing natural text data for pre-training LLMs has shown promising results when combining the original dataset with the synthetically rephrased data. We build upon previous work by replicating existing results on C4 and extending them with our optimized rephrasing pipeline to the English, German, Italian, and Spanish Oscar subsets of CulturaX. Our pipeline leads to increased performance on standard evaluation benchmarks in both the mono- and multilingual setups. In addition, we provide a detailed study of our pipeline, investigating the choice of the base dataset and LLM for the rephrasing, as well as the relationship between the model size and the performance after pre-training. By exploring data with different perceived quality levels, we show that gains decrease with higher quality. Furthermore, we find the difference in performance between model families to be larger than between different model sizes. This highlights the necessity for detailed tests before choosing an LLM to rephrase large amounts of data. Moreover, we investigate the effect of pre-training with synthetic data on supervised fine-tuning. Here, we find gains that are inconclusive and depend strongly on the benchmark used. These results (again) highlight the need for better benchmarking setups. In summary, we show that rephrasing multilingual and low-quality data is a very promising direction to extend LLM pre-training data.}
}
@InProceedings{kasmaee2024chemteb,
title = {{ChemTEB}: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
section = {Benchmark \& Evaluation},
author = {Shiraee Kasmaee, Ali and Khodadad, Mohammad and Arshi Saloot, Mohammad and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
pages = {512-531},
abstract = {Recent advancements in language models have started a new era of superior information retrieval and content generation, with embedding models playing an important role in optimizing data representation efficiency and performance. While benchmarks like the Massive Text Embedding Benchmark (MTEB) have standardized the evaluation of general domain embedding models, a gap remains in specialized fields such as chemistry, which require tailored approaches due to domain-specific challenges.
This paper introduces a novel benchmark, the Chemical Text Embedding Benchmark (ChemTEB), designed specifically for the chemical sciences. ChemTEB addresses the unique linguistic and semantic complexities of chemical literature and data, offering a comprehensive suite of tasks on chemical domain data.
Through the evaluation of 34 open-source and proprietary models using this benchmark, we illuminate the strengths and weaknesses of current methodologies in processing and understanding chemical information. Our work aims to equip the research community with a standardized, domain-specific evaluation framework, promoting the development of more precise and efficient NLP models for chemistry-related applications. Furthermore, it provides insights into the performance of generic models in a domain-specific context.
ChemTEB comes with open-source code and data, contributing further to its accessibility and utility.}
}
@InProceedings{thielmann2024on,
title = {On the Efficiency of {NLP}-Inspired Methods for Tabular Deep Learning},
section = {Benchmark \& Evaluation},
author = {F Thielmann, Anton and Samiee, Soheila},
pages = {532-539},
abstract = {Recent advancements in tabular deep learning (DL) have led to substantial performance improvements, surpassing the capabilities of traditional models.
With the adoption of techniques from natural language processing (NLP), such as language model-based approaches, DL models for tabular data have also grown in complexity and size.
Although tabular datasets do not typically pose scalability issues, the escalating size of these models has raised efficiency concerns. Despite its importance, efficiency has been relatively underexplored in tabular DL research. This paper critically examines the latest innovations in tabular DL, with a dual focus on performance and computational efficiency.
The source code is available at https://github.com/basf/mamba-tabular.}
}
% Applications
@InProceedings{ardestani2024text,
title = {Text Summarization With Graph Attention Networks},
section = {Applications},
author = {Ardestani, Mohammadreza and Chali, Yllias},
pages = {540-553},
abstract = {This study aimed to leverage graph information, particularly Rhetorical Structure Theory (RST) and Co-reference (Coref) graphs, to enhance the performance of our baseline summarization models. Specifically, we experimented with a Graph Attention Network architecture to incorporate graph information. However, this architecture did not improve performance. Subsequently, we used a simple Multi-layer Perceptron architecture, which improved the results of our proposed model on our primary dataset, CNN/DM. Additionally, we annotated the XSum dataset with RST graph information, establishing a benchmark for future graph-based summarization models. This secondary dataset posed multiple challenges, revealing both the merits and limitations of our models.}
}
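% A compact numpy sketch of a single graph-attention layer (in the style of
% Velickovic et al.) over sentence-node features, with edges taken from an RST or
% coreference graph. It is a generic GAT layer, not the architecture evaluated in
% the paper.
%
% import numpy as np
%
% def gat_layer(H, A, W, a, negative_slope=0.2):
%     # H: (n, d_in) node features; A: (n, n) adjacency (1 where an edge exists,
%     # self-loops included); W: (d_in, d_out); a: (2 * d_out,) attention vector
%     Z = H @ W
%     n = Z.shape[0]
%     e = np.array([[np.concatenate([Z[i], Z[j]]) @ a for j in range(n)] for i in range(n)])
%     e = np.where(e > 0, e, negative_slope * e)          # LeakyReLU
%     e = np.where(A > 0, e, -1e9)                        # keep only graph edges
%     alpha = np.exp(e - e.max(axis=1, keepdims=True))
%     alpha = alpha / alpha.sum(axis=1, keepdims=True)    # softmax over neighbours
%     return alpha @ Z                                     # (n, d_out) updated nodes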
@InProceedings{rajasekhar2024less,
title = {Less is Enough: Adapting Pre-trained Vision Transformers for Audio-Visual Speaker Verification},
section = {Applications},
author = {Praveen Rajasekhar, Gnana and Alam, Jahangir},
pages = {554-563},
abstract = {Speaker verification has achieved significant improvements in performance using sophisticated deep learning architectures specialized for speech signals, as well as robust loss functions. Recently, the fusion of faces and voices has received a lot of attention, as the two modalities are complementary to each other and have the potential to outperform systems based on speech signals alone. Inspired by the massive success of Vision Transformers (ViTs) in computer vision, ViTs have also been explored for multimodal learning. In this work, we have investigated the potential of ViTs, pre-trained on visual data, for audio-visual speaker verification. To cope with the challenges of large-scale training, we introduce the Latent Audio-Visual Vision Transformer (LAVViT) adapters, where we exploit existing models pre-trained on visual data by training only the parameters of the LAVViT adapters, without fine-tuning the original parameters of the pre-trained models. The LAVViT adapters are injected into every layer of the ViT architecture to effectively fuse the audio and visual modalities using a small set of latent tokens, thereby avoiding the quadratic computational cost of cross-attention across the modalities. The proposed approach has been evaluated on the VoxCeleb1 dataset and shows promising performance using only a few trainable parameters.}
}
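% A PyTorch sketch of the general idea of fusing two token streams through a
% small set of learnable latent tokens, so the attention cost is linear in the
% number of audio/visual tokens rather than quadratic cross-attention between
% them. This is an illustrative adapter module, not the LAVViT implementation;
% all names and shapes are assumptions.
%
% import torch
% import torch.nn as nn
%
% class LatentFusionAdapter(nn.Module):
%     def __init__(self, dim=768, n_latents=8, n_heads=8):
%         super().__init__()
%         self.latents = nn.Parameter(torch.randn(n_latents, dim) * 0.02)
%         self.attn = nn.MultiheadAttention(dim, n_heads, batch_first=True)
%         self.proj = nn.Linear(dim, dim)
%
%     def forward(self, audio_tokens, visual_tokens):
%         # audio_tokens: (B, Na, dim); visual_tokens: (B, Nv, dim)
%         B = audio_tokens.size(0)
%         q = self.latents.unsqueeze(0).expand(B, -1, -1)       # (B, n_latents, dim)
%         kv = torch.cat([audio_tokens, visual_tokens], dim=1)  # both modalities
%         fused, _ = self.attn(q, kv, kv)                       # latents gather info
%         return self.proj(fused)                               # (B, n_latents, dim)
%
% # In an adapter setup, only such modules would be trained; the pre-trained ViT
% # weights stay frozen.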
@InProceedings{fathan2024enhanced,
title = {Enhanced label noise robustness through early adaptive filtering for the self-supervised speaker verification task},
section = {Applications},
author = {Fathan, Abderrahim and Zhu, Xiaolin and Alam, Jahangir},
pages = {564-575},
abstract = {Using clustering-driven annotations to train a neural network can be a tricky task because of label noise. In this paper, we propose a dynamic and adaptive label noise filtering method, called AdaptiveDrop, which performs label noise cleansing and correction simultaneously in cascade to combine their advantages. Contrary to other label noise filtering approaches, our method filters noisy samples on the fly from an early stage of training. We also provide a variant that incorporates sub-centers for each class for enhanced robustness to label noise by continuously tracking the dominant sub-centers via a dictionary table. AdaptiveDrop is a simple, general-purpose method that is performed end-to-end in a single stage of training, can be integrated with any loss function, and does not require training from scratch on the cleansed dataset. We show through extensive ablation studies on the self-supervised speaker verification task that our method is effective, benefits from long epochs of iterative filtering, and provides consistent performance gains across various loss functions and real-world pseudo-labels.}
}
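% A generic small-loss filtering heuristic, given only to illustrate on-the-fly
% noisy-label filtering during training; it is not the paper's AdaptiveDrop,
% which additionally adapts the filtering over training and corrects labels via
% per-class sub-centers tracked in a dictionary table.
%
% import numpy as np
%
% def keep_clean_indices(per_sample_losses, drop_rate):
%     # Treat the highest-loss fraction of the batch as likely pseudo-label noise
%     # and exclude it from the loss at this training step.
%     k = int(len(per_sample_losses) * (1.0 - drop_rate))
%     return np.argsort(per_sample_losses)[:k]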
@InProceedings{chaparala2024mai,
title = {{Mai Ho`omāuna i ka `Ai}: Language Models Improve Automatic Speech Recognition in Hawaiian},
section = {Applications},
author = {D Chaparala, Kaavya and Zarrella, Guido and Torres Fischer, Bruce and Kimura, Larry and Parker Jones, Oiwi},
pages = {576-583},
abstract = {In this paper, we address the challenge of improving Automatic Speech Recognition (ASR) for a low-resource language, Hawaiian, by incorporating large amounts of independent text data into an ASR foundation model, Whisper. To do this, we train an external language model (LM) on approximately 1.5M words of Hawaiian text. We then use the LM to rescore Whisper and compute word error rates (WERs) on a manually curated test set of labeled Hawaiian data. As a baseline, we use Whisper without an external LM. Experimental results reveal a small but significant improvement in WER when ASR outputs are rescored with a Hawaiian LM. The results support leveraging all available data in the development of ASR systems for underrepresented languages.}
}
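% A small sketch of n-best rescoring with an external language model, the general
% mechanism described in the abstract; the interpolation weight, the Whisper
% n-best interface, and `lm_logprob` are assumptions, not the authors' code.
%
% def rescore_nbest(hypotheses, lm_logprob, lm_weight=0.5):
%     # hypotheses: list of (text, asr_logprob) pairs from the ASR decoder
%     # lm_logprob: callable returning the external LM's log-probability of a text
%     def combined(h):
%         text, asr_lp = h
%         return asr_lp + lm_weight * lm_logprob(text)
%     best_text, _ = max(hypotheses, key=combined)
%     return best_text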
@InProceedings{shinde2024lightweight,
title = {Lightweight Neural Networks for Speech Emotion Recognition using Layer-wise Adaptive Quantization},
section = {Applications},
author = {Shinde, Tushar and Jain, Ritika and Kumar Sharma, Avinash},
pages = {584-595},
abstract = {Speech Emotion Recognition (SER) systems are essential in advancing human-machine interaction. While deep learning models have shown substantial success in SER by eliminating the need for handcrafted features, their high computational and memory requirements, alongside intensive hyper-parameter optimization, limit their deployment on resource-constrained edge devices. To address these challenges, we introduce an optimized and computationally efficient Multilayer Perceptron (MLP)-based classifier within a custom SER framework. We further propose a novel, layer-wise adaptive quantization scheme that compresses the model by adjusting bit-width precision according to layer importance. This layer importance is calculated based on statistical measures such as parameter proportion, entropy, and weight variance within each layer. Our approach achieves an optimal balance between model size reduction and performance retention, ensuring that the quantized model maintains accuracy within acceptable limits. Traditional fixed-precision methods, while computationally simple, are less effective at reducing model size without compromising performance. In contrast, our scheme provides a more interpretable and computationally efficient solution. We evaluate the proposed model on standard SER datasets using features such as Mel-Frequency Cepstral Coefficients (MFCC), Chroma, and Mel-spectrogram. Experimental results demonstrate that our adaptive quantization method achieves performance competitive with state-of-the-art models while significantly reducing model size, making it highly suitable for deployment on edge devices.}
}
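% A numpy sketch of layer-wise adaptive bit allocation driven by simple layer
% statistics (parameter proportion, weight-histogram entropy, variance), in the
% spirit of the abstract; the actual importance weighting and bit mapping in the
% paper may differ.
%
% import numpy as np
%
% def layer_importance(w, total_params):
%     prop = w.size / total_params
%     hist, _ = np.histogram(w, bins=64)
%     p = hist / max(hist.sum(), 1)
%     entropy = -(p[p > 0] * np.log(p[p > 0])).sum()
%     return prop + 0.1 * entropy + w.var()
%
% def quantize_uniform(w, bits):
%     # Symmetric uniform quantization of a weight tensor to `bits` bits.
%     qmax = 2 ** (bits - 1) - 1
%     scale = np.abs(w).max() / qmax + 1e-12
%     return np.clip(np.round(w / scale), -qmax - 1, qmax) * scale
%
% def adaptive_quantize(layers, bit_choices=(2, 4, 8)):
%     # More "important" layers get higher bit-widths (here: by importance rank).
%     total = sum(w.size for w in layers)
%     ranks = np.argsort(np.argsort([layer_importance(w, total) for w in layers]))
%     cuts = np.linspace(0, len(layers), len(bit_choices) + 1)[1:-1]
%     return [quantize_uniform(w, bit_choices[int(np.searchsorted(cuts, r, side='right'))])
%             for w, r in zip(layers, ranks)]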
@InProceedings{chen2024onlysportslm,
title = {{OnlySportsLM}: Optimizing Sports-Domain Language Models with {SOTA} Performance under Billion Parameters},
section = {Applications},
author = {Chen, Zexin and Li, Chengxi and Xie, Xiangyu and Dube, Parijat},
pages = {596-610},
abstract = {This paper explores the potential of a small, domain-specific language model trained exclusively on sports-related data. We investigate whether extensive training data with specially designed small model structures can overcome model size constraints. The study introduces the OnlySports collection, comprising OnlySportsLM, the OnlySports Dataset, and the OnlySports Benchmark. Our approach involves: 1) creating a massive 600-billion-token OnlySports Dataset from FineWeb, 2) optimizing the RWKV architecture for sports-related tasks, resulting in a 196M-parameter model with a 20-layer, 640-dimension structure, 3) training OnlySportsLM on part of the OnlySports Dataset, and 4) testing the resulting model on the OnlySports Benchmark. OnlySportsLM achieves a 37.62\%/34.08\% accuracy improvement over previous 135M/360M state-of-the-art models and matches the performance of larger models such as SmolLM 1.7B and Qwen 1.5B in the sports domain. Additionally, the OnlySports collection presents a comprehensive workflow for building high-quality, domain-specific language models, providing a replicable blueprint for efficient AI development across various specialized fields.}
}